import spacy

print("spaCy is ready for NLP tasks!")
# Project: https://spacy.io by Explosion AI
# spaCy is to production what NLTK is to research

# Shell setup -- run these in a terminal, they are not Python statements:
#   pip install spacy
#   python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# NOTE: en_core_web_sm is an English pipeline. Non-English text is still
# tokenized, but tags/entities come from English models; load a
# language-specific pipeline (e.g. fr_core_news_sm) for French input.
doc = nlp("Bonjour le monde!")

# Named entities found in an English sentence.
doc = nlp("Apple is looking at buying U.K. startup.")
for ent in doc.ents:
    print(ent.text, ent.label_)
print(nlp.pipe_names)  # ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
# Extract entities from resumes or support tickets
# Visit https://spacy.io/usage for docs and tutorials
import spacy
from spacy.lang.en import English
from spacy.symbols import ORTH
from spacy.tokenizer import Tokenizer

nlp = spacy.load("en_core_web_sm")

# Part-of-speech tag and dependency label for every token.
doc = nlp("SpaCy makes NLP easy!")
for token in doc:
    print(token.text, token.pos_, token.dep_)

# Token texts collected into a plain list.
doc = nlp("Let's learn spaCy.")
tokens = [token.text for token in doc]
print(tokens)

# Token texts, token attributes, and sentence boundaries of one doc.
doc = nlp("U.S.A. is a country.")
for token in doc:
    print(token.text)
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.is_alpha)
for sent in doc.sents:
    print(sent.text)

# Register a tokenizer special case so the emoticon is kept as one token.
special_case = [{ORTH: "¯\\_(ツ)_/¯"}]
nlp.tokenizer.add_special_case("¯\\_(ツ)_/¯", special_case)
doc = nlp("¯\\_(ツ)_/¯ is a shrug.")
print([token.text for token in doc])

# Tokenizer constructed from a vocab only; no prefix/suffix/infix rules
# are passed in here.
custom_nlp = English()
custom_tokenizer = Tokenizer(custom_nlp.vocab)
doc = custom_tokenizer("Custom.tokenizer,loaded")
print([token.text for token in doc])

# make_doc() produces a Doc from raw text via the pipeline's tokenizer.
doc = nlp.make_doc("Just tokenize me.")
print(type(doc))  # <class 'spacy.tokens.doc.Doc'>
print(doc.text)
for token in doc:
    print(token.text, token.is_stop, token.is_punct)
# Use regex cleaning before passing to tokenizer
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("SpaCy makes NLP tasks easy.")

# Coarse-grained tag (pos_) per token.
for token in doc:
    print(token.text, token.pos_)

# Coarse-grained (pos_) next to fine-grained (tag_) per token.
for token in doc:
    print(f"{token.text} - POS: {token.pos_}, Tag: {token.tag_}")
# `token` is the last token left over from the loop above.
print(token.pos_, token.tag_)  # e.g., NOUN, NNS

for token in doc:
    print(token.text, token.pos_, token.dep_)
# Use spaCy's training loop with POS-labeled examples
# Calculate accuracy manually or use spaCy Scorer

# Ambiguous word example: parse the new sentence so the features below are
# computed from it (previously `text` was assigned but never processed).
text = "Let's book a room."
doc = nlp(text)
# "book" might be misclassified
features = [(token.text, token.pos_) for token in doc]

displacy.render(doc, style="dep")

# Extract nouns from sentence
nouns = [token.text for token in doc if token.pos_ == "NOUN"]
from spacy.pipeline import EntityRuler
from spacy.training import Example

doc = nlp("Elon Musk founded SpaceX in California.")
for ent in doc.ents:
    print(ent.text, ent.label_)
print(spacy.explain("ORG"))  # 'Companies, agencies, institutions, etc.'
for ent in doc.ents:
    print(ent.text, ent.label_, spacy.explain(ent.label_))
locations = [ent.text for ent in doc.ents if ent.label_ == "GPE"]

# Rule-based entities: add an entity_ruler pipe and give it one pattern.
ruler = nlp.add_pipe("entity_ruler")
ruler.add_patterns([{"label": "SOFTWARE", "pattern": "ChatGPT"}])
# Use spaCy CLI: python -m spacy train config.cfg --paths.train ./train.spacy

# Gold-standard annotation for the sentence above. Character offsets 0-9
# cover the full name "Elon Musk" (0-4 would label only "Elon").
example = Example.from_dict(doc, {"entities": [(0, 9, "PERSON")]})
# Consider using additional span groups or custom components

# jupyter=True forces notebook display; drop it when running as a script.
displacy.render(doc, style="ent", jupyter=True)

# Extract person names for building user profiles
people = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
# Example: Parsing a sentence
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")

# For each token: its dependency label and the head it attaches to.
doc = nlp("She enjoys reading books.")
for token in doc:
    print(token.text, "->", token.dep_, "->", token.head.text)

doc = nlp("Cats chase mice.")
for token in doc:
    print(token.text, token.dep_, token.head.text)
# See dependency labels
for token in doc:
    print(token.text, token.dep_)
# Relation of the second token to its head.
print(doc[1].text, "is", doc[1].dep_, "of", doc[1].head.text)

# jupyter=True forces notebook display; drop it when running as a script.
displacy.render(doc, style="dep", jupyter=True)
# Requires annotations: (text, {"heads": [...], "deps": [...]})
# Extract subject-object pairs for relations
# Manually check token.head and token.dep_ for unexpected results

doc = nlp("Alice emailed Bob")
for ent in doc.ents:
    print(ent.text, ent.label_)
from spacy.lookups import Lookups

# NER is not needed for lemmas or morphology, so it is disabled on load.
nlp = spacy.load("en_core_web_sm", disable=["ner"])
doc = nlp("The striped cats are playing.")
print([token.lemma_ for token in doc])
for token in doc:
    print(token.text, "->", token.lemma_)
print(doc[2].morph)  # Get morphology of token
print(doc[3].text, doc[3].lemma_, doc[3].morph)
# For Spanish: nlp = spacy.load("es_core_news_sm")

# Custom lookup table. Lemmatizer.initialize() takes the tables through the
# keyword-only `lookups` argument; passing a callable positionally (as
# before) fills `get_examples` and the tables are never used. A separate
# blank pipeline in "lookup" mode is used so the trained pipeline above
# keeps its own lemmatizer for the examples that follow.
lookups = Lookups()
lookups.add_table("lemma_lookup", {"better": "good"})
lookup_nlp = spacy.blank("en")
lookup_lemmatizer = lookup_nlp.add_pipe("lemmatizer", config={"mode": "lookup"})
lookup_lemmatizer.initialize(lookups=lookups)
# Morph features like Number=Plur or Tense=Past are informative

doc = nlp("She has gone")
print([token.lemma_ for token in doc])
# Compare token.lemma_ to gold standard lemmas
# Query normalization: convert "running" and "ran" to "run"
# Basic text classification overview
import spacy
from spacy.pipeline.textcat import Config
from spacy.training import Example

texts = ["I love AI", "I hate spam"]
labels = [1, 0]  # 1=positive, 0=negative

# Blank English pipeline with a text categorizer and two labels.
nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
train_data = [("I love this!", {"cats": {"POSITIVE": True, "NEGATIVE": False}})]

# Train model example.
# spaCy v3 updates from Example objects, not (text, annotations) pairs, and
# nlp.initialize() replaces the deprecated nlp.begin_training().
examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in train_data
]
optimizer = nlp.initialize(lambda: examples)
for _ in range(10):
    for example in examples:
        nlp.update([example], sgd=optimizer)

# Simple evaluation loop
doc = nlp("I love this!")
print(doc.cats)

# Multi-label annotations may mark several categories True at once, e.g.
#   {"cats": {"POSITIVE": True, "SPORTS": True}}
# Use the "textcat_multilabel" component for non-exclusive categories.
# Use data augmentation or class weights in training
# Insert custom pipeline components before textcat

nlp = spacy.load("en_core_web_sm")
nlp.to_disk("model_dir")
# Load with spacy.load("model_dir")
# Match "New York" phrase using rules
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "new"}, {"LOWER": "york"}]
matcher.add("GPE", [pattern])
doc = nlp("I live in New York")
matches = matcher(doc)
print(matches)

# Phrase matching: v3 takes the pattern Docs as one list
# (the v2 form add(key, None, *docs) is no longer accepted).
phrases = [nlp(text) for text in ["New York", "San Francisco"]]
phrasematcher = PhraseMatcher(nlp.vocab)
phrasematcher.add("GPE", phrases)

# Token pattern with an optional determiner between verb and noun.
pattern = [{"LEMMA": "buy"}, {"POS": "DET", "OP": "?"}, {"POS": "NOUN"}]
matcher.add("BUY_PATTERN", [pattern])
# Several alternative patterns under one key.
matcher.add("GREETINGS", [[{"LOWER": "hi"}], [{"LOWER": "hello"}]])


def on_match(matcher, doc, id, matches):
    """Matcher callback: print the list of matches.

    The arguments are whatever the matcher passes to its callback; only
    ``matches`` is used here.
    """
    print("Match found:", matches)


# on_match is keyword-only in Matcher.add() in spaCy v3.
matcher.add("PATTERN", [pattern], on_match=on_match)
# Keep patterns concise and test extensively
# Extract phone numbers, product codes using Matcher


# A Matcher returns matches rather than a Doc, and nlp.add_pipe() accepts
# only registered component names in v3, so the matcher is wrapped in a
# small registered component instead of being added directly.
@spacy.Language.component("rule_matcher")
def rule_matcher(doc):
    """Run the module-level ``matcher`` on ``doc`` and return ``doc``."""
    matcher(doc)
    return doc


nlp.add_pipe("rule_matcher")
# Test with doc and print matched spans
from spacy.tokens import Doc

nlp = spacy.load("en_core_web_sm")
print(nlp.pipe_names)


@spacy.Language.component("custom_component")
def custom_component(doc):
    """Pipeline component that logs a message and returns the doc unchanged."""
    print("Processing doc")
    return doc


nlp.add_pipe("custom_component", last=True)
# Pipe names must be unique within a pipeline, so the second instance of the
# same registered component gets its own name.
nlp.add_pipe("custom_component", name="custom_component_before_ner", before="ner")


# Template for a component body. This plain function rebinds the module name
# `custom_component` but is not registered (no decorator), so the pipes added
# above keep using the decorated version.
def custom_component(doc):
    # add custom attributes or annotations
    return doc


# Guarded so re-running the snippet does not fail on an already-registered
# extension.
if not Doc.has_extension("is_custom"):
    Doc.set_extension("is_custom", default=False)

# Temporarily run the pipeline without NER.
with nlp.select_pipes(disable=["ner"]):
    doc = nlp("Test")

# `texts` is defined earlier in the file. nlp.pipe() is consumed by
# iterating over it; `docs` is not iterated here.
docs = nlp.pipe(texts, batch_size=20)

nlp.to_disk("my_model")
nlp2 = spacy.load("my_model")
print(nlp.pipe_names)
# Build a custom pipeline for financial document processing
# Word vectors example using spaCy
import spacy

nlp = spacy.load("en_core_web_md")
doc = nlp("dog cat banana")
print(doc[0].vector[:5])  # first 5 dims of 'dog' vector
print(doc.vector[:5])  # document vector example
for token in doc:
    print(token.text, token.vector[:3])
print(doc[0].similarity(doc[1]))  # similarity between 'dog' and 'cat'
# Already done by loading "en_core_web_md"
# Train custom vectors with spacy vectors CLI (outside code)

# Vector arithmetic on the three token vectors.
dog = doc[0].vector
cat = doc[1].vector
banana = doc[2].vector
result = dog - cat + banana
print(result[:5])
# Use vectors as input in downstream ML models
# Use datasets like WordSim-353 for evaluation

# Example: find most similar word in vocab
# Vectors has no `similarity` method; Vectors.most_similar() takes a 2-D
# array of query vectors and returns (keys, best_rows, scores). Two
# neighbours are requested because the best hit for a word's own vector is
# expected to be that word itself.
query = dog.reshape(1, -1)
neighbour_keys, _, _ = nlp.vocab.vectors.most_similar(query, n=2)
most_similar = neighbour_keys[0][1]
print(nlp.vocab.strings[most_similar])
# CLI command example:
# python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy

# Example annotation record: a text plus (start_char, end_char, label)
# entity spans. Written as a Python literal; JSON would use lists instead
# of tuples. Bound to a name so it is not a discarded expression.
ner_annotation_example = {
    "text": "Apple is looking at buying U.K. startup.",
    "entities": [(0, 5, "ORG")],
}

# Minimal config snippet example. This is config.cfg content, not Python:
# left as bare statements, `[training]` and `[components.ner]` are evaluated
# as list literals and raise NameError. `max_epochs` is a [training]
# setting, and the training batch size is configured on the batcher.
#
#   [training]
#   max_epochs = 20
#
#   [training.batcher]
#   @batchers = "spacy.batch_by_sequence.v1"
#   size = 32
# Assemble a blank English pipeline with the components to be trained.
import spacy
from spacy.training import Example

nlp = spacy.blank("en")

# Entity recognizer with one label (training loop omitted for brevity).
ner = nlp.add_pipe("ner")
ner.add_label("ORG")

# Text categorizer and its two labels.
textcat = nlp.add_pipe("textcat")
for _label in ("POSITIVE", "NEGATIVE"):
    textcat.add_label(_label)

# Dependency parser; dependency labels would be added at this point.
parser = nlp.add_pipe("parser")

# Training from the command line:
#   python -m spacy train config.cfg --output ./model_output
# Logs print after each epoch in CLI
# nlp.evaluate(...) method used in scripts

# Write the pipeline to disk, read it back, then write a second copy under
# the directory name "model_dir".
nlp.to_disk("./my_model")
nlp2 = spacy.load("./my_model")
nlp.to_disk("model_dir")
# Save/load walkthrough: the statements below read and write directories on
# disk and depend on running in this order.
import spacy
# Load the pipeline from the "model_dir" directory and run it on a sentence.
nlp = spacy.load("model_dir")
doc = nlp("This is a test.")
# Use version tags in filenames or metadata files
# Save only NER component
ner = nlp.get_pipe("ner")
ner.to_disk("ner_dir")
# Use huggingface_hub CLI or API
# Use requirements.txt and README.md alongside model files
# Use directory structure or model registry tools
# Start from a blank English pipeline, add a new "ner" pipe, and load the
# component state written to "ner_dir" above into it.
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
ner.from_disk("ner_dir")
# Use caching libraries or persistent server processes
# Adjust batch size during inference for speed
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup.")
displacy.render(doc, style="dep")  # Render dependency tree
displacy.render(doc, style="dep")  # Default dependency parse visualization
displacy.render(doc, style="ent")  # Highlight named entities

# Custom colour for ORG entities.
options = {"colors": {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}}
displacy.render(doc, style="ent", options=options)

# NOTE: serve() starts a local web server and blocks until it is stopped;
# the statements below only run afterwards.
displacy.serve(doc, style="dep")

# Save a standalone HTML page. jupyter=False makes render() return the
# markup even inside a notebook, and the file is written as UTF-8 so
# non-ASCII text in the doc cannot fail on the platform default encoding.
html = displacy.render(doc, style="ent", page=True, jupyter=False)
with open("ent.html", "w", encoding="utf-8") as f:
    f.write(html)

# jupyter=True forces notebook display; drop it when running as a script.
displacy.render(doc, style="dep", jupyter=True)
# Interactive features are built-in in served visualizations
# Modify options dict for colors and font size
# Use custom extension attributes for visualization
# List installed pipelines and check that they are compatible with the
# installed spaCy version. This is a shell command (the leading "!" form
# only works in IPython/Jupyter), not a Python statement:
#   python -m spacy validate

nlp_fr = spacy.load("fr_core_news_sm")
doc = nlp_fr("Ceci est une phrase en français.")
print([token.text for token in doc])  # Tokenization respects French rules

# Create blank model and train on annotated data
nlp = spacy.blank("xx")  # blank multi-language model

# Language detection with the third-party langdetect package.
from langdetect import detect
text = "Hola, ¿cómo estás?"
lang = detect(text)
print(lang)  # es for Spanish
# Use langdetect or fasttext for detection before processing
# Add custom components for language-specific processing
# Research needed; no simple code
# Evaluate with accuracy, F1 on multilingual corpora
# Integrate with translation APIs and spaCy pipelines
# Example: wrap PyTorch model as spaCy pipeline component
# Use spaCy tokenizer + TensorFlow model input pipeline
# Create custom layer extending Thinc API
from spacy_transformers import TransformerModel
# Add transformer to spaCy pipeline
# Train transformer-based pipeline on your dataset
# spaCy medium model includes GloVe vectors
# Use HuggingFace Trainer API alongside spaCy preprocessing
# Use spacy convert or third-party tools
# spaCy pipelines allow custom components chaining
# Research papers and open-source projects
# Build lean pipeline by disabling unnecessary components
nlp = spacy.load("en_core_web_sm", exclude=["parser", "ner"])
texts = ["Text one.", "Text two.", "Text three."]

# Batched streaming with nlp.pipe().
for doc in nlp.pipe(texts, batch_size=50):
    print(doc.text)

# Multi-process streaming.
# NOTE(review): on platforms that start worker processes with "spawn"
# (Windows, macOS) this should run under `if __name__ == "__main__":`.
for doc in nlp.pipe(texts, n_process=4):
    print(doc.text)

# Explicitly trigger a garbage-collection pass.
import gc
gc.collect()
# Use simpler tokenizer or disable unnecessary tokenization extensions
nlp = spacy.load("en_core_web_sm")  # smaller model

# GPU-backed transformer pipeline.
import spacy
spacy.require_gpu()
nlp = spacy.load("en_core_web_trf")

# Profile one pipeline call.
import cProfile
cProfile.run('nlp("Some large text")')

# Placeholder corpus so the loop below is runnable (`large_texts` was
# previously undefined); replace with the real documents.
large_texts = ["Some large text"] * 1000
for doc in nlp.pipe(large_texts, batch_size=1000):
    # process docs
    pass
# Example: Docker container with FastAPI for model serving
# Round-trip the pipeline currently bound to `nlp` through disk.
nlp.to_disk("model_dir")
nlp2 = spacy.load("model_dir")

from fastapi import FastAPI
import spacy

app = FastAPI()
# Loaded once at import time and shared by the request handler below.
nlp = spacy.load("en_core_web_sm")


@app.post("/predict")
def predict(text: str):
    """Run the pipeline on ``text`` and return its named entities.

    Returns a dict with one key, ``"entities"``, holding a list of
    ``(entity text, entity label)`` pairs.
    """
    doc = nlp(text)
    return {"entities": [(ent.text, ent.label_) for ent in doc.ents]}
# See example above for basic FastAPI + spaCy integration
# Dockerfile example. Save the lines below (without the leading "#") in a
# separate file named "Dockerfile"; they are not Python. WORKDIR is set so
# uvicorn can import main:app from the copied sources, and the model that
# the app loads is downloaded at build time.
#
#   FROM python:3.10-slim
#   RUN pip install spacy fastapi uvicorn
#   RUN python -m spacy download en_core_web_sm
#   WORKDIR /app
#   COPY . /app
#   CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
#
# Deploy FastAPI app to cloud container services
# Export model with spacy to ONNX for edge inference

# Basic logging setup at INFO level.
import logging
logging.basicConfig(level=logging.INFO)
# Kubernetes or cloud autoscaling example
# Use OAuth or API keys for access control
# GitHub Actions or Jenkins pipeline for spaCy app
# Basic intent recognition example
doc = nlp("Book a flight")
# Keyword check on the lower-cased text.
if "book" in doc.text.lower():
    print("Booking intent detected")
for ent in doc.ents:
    print(ent.text, ent.label_)
# Summarization via sentence scoring (simplified)

# Sentiment polarity via the third-party TextBlob package.
from textblob import TextBlob
blob = TextBlob(doc.text)
print(blob.sentiment.polarity)
# Train textcat component and classify docs

from spacy import displacy
displacy.render(doc, style="ent")
# Custom NER for resumes with spaCy
# Preprocess social data, analyze with spaCy + sentiment
# Use similarity queries with spaCy vectors
# Flask or FastAPI to serve NLP app
# Core spaCy + ecosystem libraries like Prodigy, Thinc
import spacy
from spacy.language import Language

nlp = spacy.load("en_core_web_sm")
# Browse spaCy Universe for tools: https://spacy.io/universe
# Example CLI: prodigy ner.manual en_core_web_sm ./data.jsonl --label PERSON,ORG

# Transformer pipelines are loaded with spacy.load(); spacy_transformers has
# no load() function. The import is kept from the original snippet.
import spacy_transformers
nlp = spacy.load("en_core_web_trf")
# Use streamlit_spacy for dashboards
# Install third-party models with pip, then load like core models


# `Language` is imported above; it was previously referenced without an
# import.
@Language.component("custom_component")
def custom_component(doc):
    """Placeholder pipeline component: returns the doc unchanged."""
    # custom logic
    return doc


nlp.add_pipe("custom_component", last=True)
# https://github.com/explosion/spaCy/discussions
# See CONTRIBUTING.md in spaCy repo for details
# https://spacy.io/usage/releases

# Load transformer pipeline
import spacy_transformers
nlp = spacy.load("en_core_web_trf")
# Use explainer libraries like Captum or SHAP with NLP models
# Analyze embeddings for bias using WEAT or similar tests
# Use Prodigy for active learning loops to improve model with minimal labels
# Select uncertain predictions for annotation
# Examples: masked language modeling, next sentence prediction
# Fine-tune base model on medical or legal texts
# Combine spaCy text embeddings with image features from CNNs
# Explore papers on arXiv or conferences like ACL, NeurIPS
# Follow spaCy roadmap on GitHub for upcoming features