import spacy

print("spaCy is ready for NLP tasks!")
# Project: https://spacy.io by Explosion AI
# spaCy is to production what NLTK is to research
pip install spacy
python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")
doc = nlp("Bonjour le monde!")  # The English model will tokenize this; load a French model (fr_core_news_sm) for real French support
doc = nlp("Apple is looking at buying U.K. startup.") for ent in doc.ents: print(ent.text, ent.label_)
print(nlp.pipe_names) # ['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']
# Extract entities from resumes or support tickets
# Visit https://spacy.io/usage for docs and tutorials
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("SpaCy makes NLP easy!")
for token in doc:
    print(token.text, token.pos_, token.dep_)
doc = nlp("Let's learn spaCy.") tokens = [token.text for token in doc] print(tokens)
doc = nlp("U.S.A. is a country.") for token in doc: print(token.text)
for token in doc: print(token.text, token.lemma_, token.pos_, token.is_alpha)
for sent in doc.sents: print(sent.text)
from spacy.symbols import ORTH

special_case = [{ORTH: "¯\\_(ツ)_/¯"}]
nlp.tokenizer.add_special_case("¯\\_(ツ)_/¯", special_case)
doc = nlp("¯\\_(ツ)_/¯ is a shrug.")
print([token.text for token in doc])
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English

custom_nlp = English()
custom_tokenizer = Tokenizer(custom_nlp.vocab)  # no rules supplied: splits on whitespace only
doc = custom_tokenizer("Custom.tokenizer,loaded")
print([token.text for token in doc])
doc = nlp.make_doc("Just tokenize me.")
print(type(doc))  # <class 'spacy.tokens.doc.Doc'>
print(doc.text)
for token in doc: print(token.text, token.is_stop, token.is_punct)
# Use regex cleaning before passing to tokenizer
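# A small sketch of regex cleanup before tokenization; the URL/whitespace patterns below
# are illustrative assumptions, not part of spaCy
import re
import spacy

nlp = spacy.load("en_core_web_sm")

def clean(text):
    text = re.sub(r"https?://\S+", "", text)  # drop URLs
    return re.sub(r"\s+", " ", text).strip()  # collapse whitespace

doc = nlp(clean("Check   https://spacy.io   for docs!"))
print([token.text for token in doc])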
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("SpaCy makes NLP tasks easy.")
for token in doc:
    print(token.text, token.pos_)
for token in doc: print(f"{token.text} - POS: {token.pos_}, Tag: {token.tag_}")
print(token.pos_, token.tag_) # e.g., NOUN, NNS
for token in doc: print(token.text, token.pos_, token.dep_)
# Use spaCy's training loop with POS-labeled examples
# Calculate accuracy manually or use spaCy Scorer
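# A minimal manual-accuracy sketch: compare predicted coarse POS tags against a hand-written
# gold list (the gold labels here are made up for illustration)
import spacy

nlp = spacy.load("en_core_web_sm")
gold = [("I", "PRON"), ("love", "VERB"), ("spaCy", "PROPN")]
doc = nlp("I love spaCy")
correct = sum(token.pos_ == pos for token, (_, pos) in zip(doc, gold))
print(f"POS accuracy: {correct / len(gold):.2f}")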
text = "Let's book a room." # "book" might be misclassified
features = [(token.text, token.pos_) for token in doc]
from spacy import displacy

displacy.render(doc, style="dep")
# Extract nouns from sentence
nouns = [token.text for token in doc if token.pos_ == "NOUN"]
doc = nlp("Elon Musk founded SpaceX in California.") for ent in doc.ents: print(ent.text, ent.label_)
print(spacy.explain("ORG"))  # 'Companies, agencies, institutions, etc.'
for ent in doc.ents: print(ent.text, ent.label_, spacy.explain(ent.label_))
locations = [ent.text for ent in doc.ents if ent.label_ == "GPE"]
ruler = nlp.add_pipe("entity_ruler", before="ner")  # add before the statistical NER so the patterns take priority
ruler.add_patterns([{"label": "SOFTWARE", "pattern": "ChatGPT"}])
# Use spaCy CLI: python -m spacy train config.cfg --paths.train ./train.spacy
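# The train CLI expects binary .spacy files; a sketch of building one with DocBin from
# (text, entity offsets) pairs -- the example data here is illustrative
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
train_data = [("Apple is looking at buying U.K. startup.", [(0, 5, "ORG")])]
db = DocBin()
for text, entities in train_data:
    doc = nlp.make_doc(text)
    doc.ents = [doc.char_span(start, end, label=label) for start, end, label in entities]
    db.add(doc)
db.to_disk("./train.spacy")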
from spacy.training import Example

example = Example.from_dict(doc, {"entities": [(0, 4, "PERSON")]})
# Consider using additional span groups or custom components
displacy.render(doc, style="ent", jupyter=True)
# Extract person names for building user profiles
people = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]
# Example: Parsing a sentence
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("She enjoys reading books.")
for token in doc:
    print(token.text, "->", token.dep_, "->", token.head.text)
doc = nlp("Cats chase mice.") for token in doc: print(token.text, token.dep_, token.head.text)
# See dependency labels
for token in doc:
    print(token.text, token.dep_)
print(doc[1].text, "is", doc[1].dep_, "of", doc[1].head.text)
from spacy import displacy

displacy.render(doc, style="dep", jupyter=True)
# Requires annotations: (text, {"heads": [...], "deps": [...]})
# Extract subject-object pairs for relations
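# A rough sketch: pair each nsubj with dobj children of the same verb (handles only
# simple active-voice clauses)
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Alice emailed Bob.")
pairs = []
for token in doc:
    if token.dep_ == "nsubj":
        verb = token.head
        for child in verb.children:
            if child.dep_ == "dobj":
                pairs.append((token.text, verb.lemma_, child.text))
print(pairs)  # typically [('Alice', 'email', 'Bob')]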
# Manually check token.head and token.dep_ for unexpected results
doc = nlp("Alice emailed Bob") for ent in doc.ents: print(ent.text, ent.label_)
nlp = spacy.load("en_core_web_sm", disable=["ner"])
doc = nlp("The striped cats are playing.") print([token.lemma_ for token in doc])
for token in doc: print(token.text, "->", token.lemma_)
print(doc[2].morph) # Get morphology of token
print(doc[3].text, doc[3].lemma_, doc[3].morph)
# For Spanish: nlp = spacy.load("es_core_news_sm")
from spacy.lookups import Lookups

lookups = Lookups()
lookups.add_table("lemma_lookup", {"better": "good"})
# Pass the table via the lookups keyword (used by a lemmatizer running in "lookup" mode)
nlp.get_pipe("lemmatizer").initialize(lookups=lookups)
# Morph features like Number=Plur or Tense=Past are informative
doc = nlp("She has gone") print([token.lemma_ for token in doc])
# Compare token.lemma_ to gold standard lemmas
# Query normalization: convert "running" and "ran" to "run"
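# A small lemma-based normalization sketch (the normalize() helper is illustrative)
import spacy

nlp = spacy.load("en_core_web_sm")

def normalize(query):
    return " ".join(t.lemma_.lower() for t in nlp(query) if not t.is_punct and not t.is_space)

# Both queries should typically normalize to the same string
print(normalize("users running the app"))
print(normalize("users ran the app"))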
# Basic text classification overview
texts = ["I love AI", "I hate spam"]
labels = [1, 0]  # 1 = positive, 0 = negative
import spacy

nlp = spacy.blank("en")
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
train_data = [("I love this!", {"cats": {"POSITIVE": True, "NEGATIVE": False}})]
# Minimal training loop (spaCy v3: update() takes Example objects)
from spacy.training import Example

examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in train_data]
optimizer = nlp.initialize(lambda: examples)
for i in range(10):
    nlp.update(examples, sgd=optimizer)
# Simple evaluation loop
doc = nlp("I love this!")
print(doc.cats)
{"cats": {"POSITIVE": True, "SPORTS": True}}
# Use data augmentation or class weights in training
# Insert custom pipeline components before textcat
nlp = spacy.load("en_core_web_sm")
nlp.to_disk("model_dir") # Load with spacy.load("model_dir")
# Match "New York" phrase using rules
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
pattern = [{"LOWER": "new"}, {"LOWER": "york"}]
matcher.add("GPE", [pattern])
doc = nlp("I live in New York")
matches = matcher(doc)
print(matches)
from spacy.matcher import PhraseMatcher

phrases = [nlp(text) for text in ["New York", "San Francisco"]]
phrasematcher = PhraseMatcher(nlp.vocab)
phrasematcher.add("GPE", phrases)
pattern = [{"LEMMA": "buy"}, {"POS": "DET", "OP": "?"}, {"POS": "NOUN"}] matcher.add("BUY_PATTERN", [pattern])
matcher.add("GREETINGS", [[{"LOWER": "hi"}], [{"LOWER": "hello"}]])
def on_match(matcher, doc, id, matches):
    print("Match found:", matches)

matcher.add("PATTERN", [pattern], on_match=on_match)
# Keep patterns concise and test extensively
# Extract phone numbers, product codes using Matcher
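# A hedged sketch of phone-number matching with token shapes; real formats vary, so the
# SHAPE pattern below is only an example
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
phone_pattern = [{"SHAPE": "ddd"}, {"ORTH": "-"}, {"SHAPE": "ddd"}, {"ORTH": "-"}, {"SHAPE": "dddd"}]
matcher.add("PHONE", [phone_pattern])
doc = nlp("Call 555-123-4567 for support.")
print([doc[start:end].text for _, start, end in matcher(doc)])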
# nlp.add_pipe(matcher) does not work: a Matcher is not a pipeline component.
# Wrap it in a custom component instead (see the sketch below).
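# A sketch of wrapping a Matcher in a custom component (component and span-group names
# are illustrative); matches are stored in a span group on the Doc
import spacy
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Span

nlp = spacy.load("en_core_web_sm")

@Language.component("match_cities")
def match_cities(doc):
    matcher = Matcher(doc.vocab)  # in practice, build the matcher once outside the component
    matcher.add("GPE", [[{"LOWER": "new"}, {"LOWER": "york"}]])
    doc.spans["cities"] = [Span(doc, start, end, label="GPE") for _, start, end in matcher(doc)]
    return doc

nlp.add_pipe("match_cities", last=True)
print(nlp("I live in New York").spans["cities"])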
# Test with doc and print matched spans
nlp = spacy.load("en_core_web_sm")
print(nlp.pipe_names)
@spacy.Language.component("custom_component")
def custom_component(doc):
    print("Processing doc")
    return doc

nlp.add_pipe("custom_component", last=True)
nlp.add_pipe("custom_component", before="ner")
def custom_component(doc):
    # add custom attributes or annotations
    return doc
from spacy.tokens import Doc

Doc.set_extension("is_custom", default=False)
with nlp.select_pipes(disable=["ner"]): doc = nlp("Test")
docs = nlp.pipe(texts, batch_size=20)
nlp.to_disk("my_model") nlp2 = spacy.load("my_model")
print(nlp.pipe_names)
# Build a custom pipeline for financial document processing
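# One possible starting point: an EntityRuler with finance-specific patterns (the TICKER
# label and the regex are assumptions for illustration)
import spacy

nlp = spacy.load("en_core_web_sm")
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns([{"label": "TICKER", "pattern": [{"TEXT": {"REGEX": r"^[A-Z]{2,5}$"}}]}])
doc = nlp("AAPL and MSFT both reported strong earnings.")
print([(ent.text, ent.label_) for ent in doc.ents])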
# Word vectors example using spaCy
import spacy

nlp = spacy.load("en_core_web_md")
doc = nlp("dog cat banana")
print(doc[0].vector[:5])  # first 5 dims of the 'dog' vector
print(doc.vector[:5]) # document vector example
for token in doc: print(token.text, token.vector[:3])
print(doc[0].similarity(doc[1])) # similarity between 'dog' and 'cat'
# Already done by loading "en_core_web_md"
# Train custom vectors with spacy vectors CLI (outside code)
dog = doc[0].vector
cat = doc[1].vector
banana = doc[2].vector
result = dog - cat + banana
print(result[:5])
# Use vectors as input in downstream ML models
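# A minimal sketch: document vectors as features for a scikit-learn classifier
# (assumes scikit-learn is installed; the toy texts and labels are made up)
import spacy
from sklearn.linear_model import LogisticRegression

nlp = spacy.load("en_core_web_md")
texts = ["I love this movie", "This film was terrible", "What a great story", "Truly awful acting"]
labels = [1, 0, 1, 0]
X = [nlp(t).vector for t in texts]
clf = LogisticRegression().fit(X, labels)
print(clf.predict([nlp("a wonderful movie").vector]))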
# Use datasets like WordSim-353 for evaluation
# Example: find the vocab entries most similar to 'dog' (the top hit is usually the word itself)
import numpy as np

keys, _, _ = nlp.vocab.vectors.most_similar(np.asarray([doc[0].vector]), n=2)
print([nlp.vocab.strings[int(k)] for k in keys[0]])
# CLI command example:
# python -m spacy train config.cfg --output ./output --paths.train ./train.spacy --paths.dev ./dev.spacy
# Example annotation format (a Python dict with character offsets; not strict JSON because of the tuples)
{
    "text": "Apple is looking at buying U.K. startup.",
    "entities": [(0, 5, "ORG")],
}
# Minimal config snippet (generate a complete config with `python -m spacy init config`)
[training]
max_epochs = 20

[nlp]
batch_size = 32
# Example Python snippet for training NER
import spacy
from spacy.training import Example

nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")
ner.add_label("ORG")
# Training loop omitted for brevity
# Add a textcat pipe
textcat = nlp.add_pipe("textcat")
textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")
# Add a parser pipe
parser = nlp.add_pipe("parser")
# Add dependency labels with parser.add_label(...)
# python -m spacy train config.cfg --output ./model_output
# Logs print after each epoch in CLI
# nlp.evaluate(...) method used in scripts
nlp.to_disk("./my_model") # Load later nlp2 = spacy.load("./my_model")
nlp.to_disk("model_dir")
import spacy

nlp = spacy.load("model_dir")
doc = nlp("This is a test.")
# Use version tags in filenames or metadata files
# Save only the NER component
ner = nlp.get_pipe("ner")
ner.to_disk("ner_dir")
# Use huggingface_hub CLI or API
# Use requirements.txt and README.md alongside model files
# Use directory structure or model registry tools
nlp = spacy.blank("en") ner = nlp.add_pipe("ner") ner.from_disk("ner_dir")
# Use caching libraries or persistent server processes
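# A small caching sketch using functools.lru_cache; a persistent server process (like the
# FastAPI example further down) avoids reloading the model on every request
import spacy
from functools import lru_cache

nlp = spacy.load("en_core_web_sm")

@lru_cache(maxsize=1024)
def extract_entities(text):
    return tuple((ent.text, ent.label_) for ent in nlp(text).ents)

print(extract_entities("Apple is looking at buying U.K. startup."))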
# Adjust batch size during inference for speed
import spacy
from spacy import displacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup.")
displacy.render(doc, style="dep")  # Render dependency tree
displacy.render(doc, style="dep") # Default dependency parse visualization
displacy.render(doc, style="ent") # Highlight named entities
options = {"colors": {"ORG": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}} displacy.render(doc, style="ent", options=options)
displacy.serve(doc, style="dep")
html = displacy.render(doc, style="ent", page=True)
with open("ent.html", "w") as f:
    f.write(html)
displacy.render(doc, style="dep", jupyter=True)
# Visualizations served with displacy.serve are interactive out of the box
# Modify options dict for colors and font size
# Use custom extension attributes for visualization
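# One way to visualize custom annotations (e.g. from extension attributes) is displacy's
# manual mode, which renders plain dicts instead of a Doc; the entity below is hard-coded
from spacy import displacy

example = {
    "text": "Apple is looking at buying U.K. startup.",
    "ents": [{"start": 0, "end": 5, "label": "ORG"}],
    "title": None,
}
html = displacy.render(example, style="ent", manual=True)
print(html[:80])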
# Check which pipelines are installed and compatible with your spaCy version
# python -m spacy validate
nlp_fr = spacy.load("fr_core_news_sm")
doc = nlp_fr("Ceci est une phrase en français.")
print([token.text for token in doc]) # Tokenization respects French rules
# Create a blank model and train it on annotated data
nlp = spacy.blank("xx")  # blank multi-language model
from langdetect import detect

text = "Hola, ¿cómo estás?"
lang = detect(text)
print(lang)  # 'es' for Spanish
# Use langdetect or fasttext for detection before processing
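# A hedged sketch: detect the language first, then route to a per-language pipeline
# (assumes langdetect plus the English and French models are installed)
import spacy
from langdetect import detect

pipelines = {"en": spacy.load("en_core_web_sm"), "fr": spacy.load("fr_core_news_sm")}

def process(text):
    lang = detect(text)
    return pipelines.get(lang, pipelines["en"])(text)  # fall back to English

doc = process("Ceci est une phrase en français.")
print(doc.lang_, [t.text for t in doc])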
# Add custom components for language-specific processing
# Research needed; no simple code
# Evaluate with accuracy, F1 on multilingual corpora
# Integrate with translation APIs and spaCy pipelines
# Example: wrap PyTorch model as spaCy pipeline component
# Use spaCy tokenizer + TensorFlow model input pipeline
# Create custom layer extending Thinc API
# Add a transformer to the pipeline via the spacy-transformers package
# (install with: pip install spacy[transformers])
import spacy
nlp = spacy.load("en_core_web_trf")  # packaged transformer-based pipeline
# Train transformer-based pipeline on your dataset
# spaCy's md and lg English models ship with pretrained static word vectors
# Use HuggingFace Trainer API alongside spaCy preprocessing
# Use spacy convert or third-party tools
# spaCy pipelines allow custom components chaining
# Research papers and open-source projects
# Build a lean pipeline by disabling unnecessary components
nlp = spacy.load("en_core_web_sm", exclude=["parser", "ner"])
texts = ["Text one.", "Text two.", "Text three."] for doc in nlp.pipe(texts, batch_size=50): print(doc.text)
for doc in nlp.pipe(texts, n_process=4): print(doc.text)
import gc

gc.collect()
# Use simpler tokenizer or disable unnecessary tokenization extensions
nlp = spacy.load("en_core_web_sm") # smaller model
import spacy

spacy.require_gpu()
nlp = spacy.load("en_core_web_trf")
import cProfile

cProfile.run('nlp("Some large text")')
for doc in nlp.pipe(large_texts, batch_size=1000):
    # process docs
    pass
# Example: Docker container with FastAPI for model serving
nlp.to_disk("model_dir") nlp2 = spacy.load("model_dir")
from fastapi import FastAPI
import spacy

app = FastAPI()
nlp = spacy.load("en_core_web_sm")

@app.post("/predict")
def predict(text: str):
    doc = nlp(text)
    return {"entities": [(ent.text, ent.label_) for ent in doc.ents]}
# See example above for basic FastAPI + spaCy integration
# Dockerfile example
FROM python:3.10-slim
RUN pip install spacy fastapi uvicorn && python -m spacy download en_core_web_sm
WORKDIR /app
COPY . /app
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
# Deploy FastAPI app to cloud container services
# spaCy has no built-in ONNX export; export transformer components through their underlying
# framework (e.g. PyTorch) for edge inference
import logging

logging.basicConfig(level=logging.INFO)
# Kubernetes or cloud autoscaling example
# Use OAuth or API keys for access control
# GitHub Actions or Jenkins pipeline for spaCy app
# Basic intent recognition example
doc = nlp("Book a flight")
if "book" in doc.text.lower():
    print("Booking intent detected")
for ent in doc.ents: print(ent.text, ent.label_)
# Summarization via sentence scoring (simplified)
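# A toy frequency-based scoring sketch: rank sentences by summed lemma frequency and keep
# the top one (a simple heuristic, not a built-in spaCy feature)
import spacy
from collections import Counter

nlp = spacy.load("en_core_web_sm")
doc = nlp("spaCy is fast. spaCy is also easy to use. Bananas are yellow.")
freq = Counter(t.lemma_.lower() for t in doc if t.is_alpha and not t.is_stop)
best = max(doc.sents, key=lambda s: sum(freq[t.lemma_.lower()] for t in s if t.is_alpha))
print(best.text)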
from textblob import TextBlob

blob = TextBlob(doc.text)
print(blob.sentiment.polarity)
# Train textcat component and classify docs
from spacy import displacy

displacy.render(doc, style="ent")
# Custom NER for resumes with spaCy
# Preprocess social data, analyze with spaCy + sentiment
# Use similarity queries with spaCy vectors
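# A minimal semantic-search sketch: rank candidate texts by vector similarity to a query
# (needs a model with vectors such as en_core_web_md; the texts are made up)
import spacy

nlp = spacy.load("en_core_web_md")
candidates = [nlp(t) for t in ["How to reset my password", "Refund policy for orders", "Change account email"]]
query = nlp("I forgot my password")
ranked = sorted(candidates, key=query.similarity, reverse=True)
print(ranked[0].text)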
# Flask or FastAPI to serve NLP app
# Core spaCy + ecosystem libraries like Prodigy and Thinc
import spacy

nlp = spacy.load("en_core_web_sm")
# Browse spaCy Universe for tools: https://spacy.io/universe
# Example CLI: prodigy ner.manual en_core_web_sm ./data.jsonl --label PERSON,ORG
import spacy

nlp = spacy.load("en_core_web_trf")  # requires the spacy-transformers package
# Use spacy-streamlit for dashboards
# Install third-party models with pip, then load like core models
from spacy.language import Language

@Language.component("custom_component")
def custom_component(doc):
    # custom logic
    return doc

nlp.add_pipe("custom_component", last=True)
# https://github.com/explosion/spaCy/discussions
# See CONTRIBUTING.md in spaCy repo for details
# https://spacy.io/usage/releases
# Load a transformer-based pipeline (requires spacy-transformers)
import spacy

nlp = spacy.load("en_core_web_trf")
# Use explainer libraries like Captum or SHAP with NLP models
# Analyze embeddings for bias using WEAT or similar tests
# Use Prodigy for active learning loops to improve model with minimal labels
# Select uncertain predictions for annotation
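# A rough uncertainty-sampling sketch: assuming nlp has a trained textcat, pick the texts
# whose category scores sit closest to 0.5 (unlabeled_texts is placeholder data)
unlabeled_texts = ["great product", "not sure about this one", "meh"]

def uncertainty(doc):
    return min(abs(score - 0.5) for score in doc.cats.values())

most_uncertain = sorted(nlp.pipe(unlabeled_texts), key=uncertainty)[:2]
print([doc.text for doc in most_uncertain])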
# Examples: masked language modeling, next sentence prediction
# Fine-tune base model on medical or legal texts
# Combine spaCy text embeddings with image features from CNNs
# Explore papers on arXiv or conferences like ACL, NeurIPS
# Follow spaCy roadmap on GitHub for upcoming features