import nltk print("NLTK is installed and ready to use.")
# Check version
import nltk
print(nltk.__version__)
from nltk.corpus import gutenberg
print(gutenberg.fileids())
pip install nltk
import nltk
nltk.download('punkt')
nltk.download('stopwords')
# View NLTK data directory
print(nltk.data.path)
from nltk.tokenize import sent_tokenize, word_tokenize
text = "NLTK is great. It makes NLP easier!"
print(sent_tokenize(text))
print(word_tokenize(text))
from nltk.stem import PorterStemmer, WordNetLemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
print(stemmer.stem("running"))
print(lemmatizer.lemmatize("running", pos="v"))
from nltk.corpus import stopwords
words = ["this", "is", "an", "example"]
filtered = [w for w in words if w not in stopwords.words('english')]
print(filtered)
import nltk
nltk.download('averaged_perceptron_tagger')
print(nltk.pos_tag(["This", "is", "an", "example"]))
from nltk.tag import DefaultTagger
tagger = DefaultTagger('NN')
print(tagger.tag(["I", "run"]))
from nltk.corpus import treebank
from nltk.tag import UnigramTagger
train = treebank.tagged_sents()[:3000]
tagger = UnigramTagger(train)
print(tagger.tag(["This", "is", "fine"]))
from nltk import CFG, RecursiveDescentParser
grammar = CFG.fromstring("""
S -> NP VP
NP -> 'I'
VP -> 'run'
""")
parser = RecursiveDescentParser(grammar)
for tree in parser.parse(["I", "run"]):
    tree.pretty_print()
import nltk
nltk.download('maxent_ne_chunker')
nltk.download('words')
sentence = nltk.pos_tag(nltk.word_tokenize("Barack Obama was born in Hawaii"))
print(nltk.ne_chunk(sentence))
from nltk.tree import Tree
t = Tree.fromstring("(S (NP I) (VP (V run)))")
t.pretty_print()
from nltk.corpus import gutenberg
print(gutenberg.fileids())
print(gutenberg.words('austen-emma.txt')[:20])
from nltk.corpus import wordnet
syns = wordnet.synsets("good")
print(syns[0].definition())
for syn in wordnet.synsets('run'):
    print(syn.name(), syn.definition())
from nltk import FreqDist
words = ['I', 'love', 'NLP', 'NLP', 'is', 'fun']
fdist = FreqDist(words)
print(fdist.most_common())
from nltk.classify import NaiveBayesClassifier
train_set = [({'word': 'awesome'}, 'pos'), ({'word': 'bad'}, 'neg')]
classifier = NaiveBayesClassifier.train(train_set)
print(classifier.classify({'word': 'awesome'}))
from nltk.classify.util import accuracy
print("Accuracy:", accuracy(classifier, train_set))
from nltk.wsd import lesk
from nltk.tokenize import word_tokenize
print(lesk(word_tokenize("I went to the bank to deposit money"), 'bank'))
from nltk.sentiment import SentimentIntensityAnalyzer  # requires nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
print(sia.polarity_scores("This product is great!"))
# Advanced: Use AllenNLP or SpaCy for SRL in Python
from transformers import pipeline
translator = pipeline("translation_en_to_fr")
print(translator("Hello, how are you?")[0]['translation_text'])
from sumy.parsers.plaintext import PlaintextParser # Extractive summarization example
from nltk import ne_chunk, pos_tag, word_tokenize
print(ne_chunk(pos_tag(word_tokenize("Barack Obama was born in Hawaii"))))
# Simple chatbot response based on keyword matching
user_input = "Hello"
if "hello" in user_input.lower():
    print("Hi! How can I help you?")
from nltk import FreqDist, word_tokenize
text = "NLTK is powerful. NLTK is easy to use."
fd = FreqDist(word_tokenize(text.lower()))
print(fd.most_common())
# Use corpora and parsing tools to explore linguistic structures
from nltk.corpus import treebank
print(treebank.parsed_sents()[0])
import re
pattern = r"\d+"
result = re.findall(pattern, "There are 12 apples and 34 bananas")
print(result)  # ['12', '34']
text = "Contact me at hello@example.com" email = re.search(r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+", text) print(email.group())
import nltk
nltk.download('punkt')
text = "The cat sat on the mat."
tokens = nltk.regexp_tokenize(text, pattern=r'\s|[\.,]', gaps=True)
print(tokens)
pattern = r"\w+" tokens = nltk.regexp_tokenize("Test 123, this!", pattern) print(tokens) # ['Test', '123', 'this']
text = "Barack Obama was born in Hawaii" matches = re.findall(r"[A-Z][a-z]+ [A-Z][a-z]+", text) print(matches) # ['Barack Obama']
raw = "Hello!!! Are you $$$ ready?" clean = re.sub(r"[^\w\s]", "", raw) print(clean) # "Hello Are you ready"
text = "Price: $45, Discount: $5" matches = re.findall(r"(?<=\$)\d+", text) print(matches) # ['45', '5']
text = re.sub(r"[^a-zA-Z\s]", "", "It's NLP 101!")
tokens = nltk.word_tokenize(text)
print(tokens)
pattern = re.compile(r"\w+")
tokens = pattern.findall("Time is precious!")
print(tokens)
log = "ERROR at 2023-06-01 10:33:21" date = re.findall(r"\d{4}-\d{2}-\d{2}", log) print(date) # ['2023-06-01']
# Example: Next word prediction P("apple" | "I eat an") = 0.2
bigram = ("I", "am") trigram = ("I", "am", "happy")
from nltk.util import ngrams
from nltk import FreqDist
tokens = nltk.word_tokenize("I love NLP and NLP loves me.")
bigrams = list(ngrams(tokens, 2))
print(bigrams)
# Add-one smoothing example: (count + 1)/(total + vocab)
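# A minimal sketch of add-one (Laplace) smoothing for bigram probabilities;
# the toy token list and the helper name smoothed_prob are illustrative.
from nltk.util import ngrams
from nltk import ConditionalFreqDist

tokens = "I eat an apple I eat an orange".split()
cfd = ConditionalFreqDist(ngrams(tokens, 2))  # counts of next word given previous word
vocab_size = len(set(tokens))

def smoothed_prob(w1, w2):
    # (count + 1) / (total + vocab)
    return (cfd[w1][w2] + 1) / (cfd[w1].N() + vocab_size)

print(smoothed_prob("eat", "an"))      # seen bigram
print(smoothed_prob("eat", "banana"))  # unseen bigram still gets non-zero probability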
# Sample perplexity computation
import math
P = 0.01  # example probability of a 5-token sequence
N = 5     # sequence length
perplexity = math.pow(1 / P, 1 / N)
print(perplexity)
# Chatbot: "User: How are you?" → Language model: "I’m doing great!"
# Limitation: P("He ate it because he was hungry") can't capture long context
text = "Once upon" next_word = model.predict(text)
# BERT, GPT are deep language models with transformer architecture
# See NLTK book Chapter 6 for step-by-step exercises
from nltk.corpus import gutenberg
print(gutenberg.words('austen-emma.txt')[:20])
from nltk.corpus.reader import PlaintextCorpusReader
corpus = PlaintextCorpusReader('data/', r'.*\.txt')
print(corpus.words())
from nltk.corpus import brown
print(brown.categories())
from random import sample
subset = sample(gutenberg.fileids(), 2)
from nltk.corpus import inaugural
print(inaugural.fileids())
text = gutenberg.words('austen-emma.txt')
print(len(text), len(set(text)))
from nltk.corpus import treebank
print(treebank.tagged_words())
# Save sentences into .txt files and load with NLTK
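# A minimal sketch of the step above, assuming write access to a local data/
# directory; file and directory names are illustrative.
import os
from nltk.corpus.reader import PlaintextCorpusReader

os.makedirs("data", exist_ok=True)
with open("data/doc1.txt", "w", encoding="utf-8") as f:
    f.write("NLTK makes corpus building easy. This is the first document.")

corpus = PlaintextCorpusReader("data/", r".*\.txt")
print(corpus.fileids())
print(corpus.sents("doc1.txt"))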
tokens = [w.lower() for w in corpus.words() if w.isalpha()]
# Build model using corpus features for text classification
# Overview: Classify text using Naive Bayes
from nltk.classify import NaiveBayesClassifier
train_data = [({'word': 'hello'}, 'greet'), ({'word': 'bye'}, 'farewell')]
classifier = NaiveBayesClassifier.train(train_data)
print(classifier.classify({'word': 'hello'}))
def extract_features(text):
    words = set(text.split())
    return {'contains_python': 'python' in words}
from sklearn.naive_bayes import MultinomialNB
from nltk.classify import SklearnClassifier
classifier = SklearnClassifier(MultinomialNB()).train(train_data)
from nltk import classify
accuracy = classify.accuracy(classifier, test_data)
print(f"Accuracy: {accuracy:.2f}")
from sklearn.metrics import classification_report
print(classification_report(y_true, y_pred))
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X, y, cv=5)
print(scores.mean())
from sklearn.pipeline import Pipeline
pipe = Pipeline([('vect', vectorizer), ('clf', MultinomialNB())])
pipe.fit(X_train, y_train)
from imblearn.over_sampling import SMOTE
X_res, y_res = SMOTE().fit_resample(X, y)
from sklearn.model_selection import GridSearchCV
gs = GridSearchCV(pipe, {'clf__alpha': [0.1, 1, 10]}, cv=3)
gs.fit(X_train, y_train)
# Example: Sentiment analysis of tweets
# Train classifier to label tweets as 'positive' or 'negative'
from nltk import CFG
grammar = CFG.fromstring("""
S -> NP VP
NP -> 'I'
VP -> 'sleep'
""")
from nltk.parse.generate import generate
for sentence in generate(grammar, n=3):
    print(' '.join(sentence))
from nltk.parse.chart import ChartParser
parser = ChartParser(grammar)
for tree in parser.parse(['I', 'sleep']):
    tree.pretty_print()
from nltk.parse import RecursiveDescentParser
rd_parser = RecursiveDescentParser(grammar)
# Use EarleyChartParser or ChartParser from nltk.parse.chart
# Use third-party tools like spaCy for full dependency parsing
# Parse a sentence with ambiguous structure and view trees
# Traverse parse tree to extract NP (noun phrases)
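# A minimal sketch of the traversal above, using the Treebank sample
# (nltk.download('treebank')); the NP filter relies on Tree.subtrees.
from nltk.corpus import treebank

tree = treebank.parsed_sents()[0]
for subtree in tree.subtrees(lambda t: t.label() == "NP"):
    print(" ".join(subtree.leaves()))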
# Enable tracing: parser = ChartParser(grammar, trace=2)
# Create grammars with optional phrases and recursive rules
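# A minimal sketch of a recursive grammar with an optional PP; the rules and
# lexicon are illustrative, and depth limits expansion of the recursive NP rule.
from nltk import CFG
from nltk.parse.generate import generate

grammar = CFG.fromstring("""
S -> NP VP
NP -> Det N | Det N PP
PP -> P NP
VP -> V NP
Det -> 'the'
N -> 'dog' | 'park'
V -> 'saw'
P -> 'in'
""")
for sentence in generate(grammar, depth=6, n=5):
    print(" ".join(sentence))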
from nltk import ne_chunk, pos_tag, word_tokenize
tree = ne_chunk(pos_tag(word_tokenize("Barack Obama was born in Hawaii")))
tree.draw()
# ne_chunk returns a tree with NE labels
# Requires labeled IOB datasets and feature extractors
# Analyze NE tags in chunk tree
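# A minimal sketch: walk the chunk tree; named-entity chunks are subtrees,
# plain tokens are (word, tag) tuples.
from nltk import ne_chunk, pos_tag, word_tokenize

tree = ne_chunk(pos_tag(word_tokenize("Barack Obama was born in Hawaii")))
for node in tree:
    if hasattr(node, "label"):
        entity = " ".join(word for word, tag in node.leaves())
        print(node.label(), entity)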
# Compare predicted and actual entity spans
# Example: Apple (fruit vs. company) requires context
# Apply NER after parsing to identify named phrases
# Extract names of companies and locations from news feeds
# Use nltk.tree.Tree.draw() or render in web app
# Combine spaCy NER with sklearn classifier for entity classification
text = "I love this product!" # Sentiment polarity is positive
from nltk.sentiment.util import demo_liu_hu_lexicon
demo_liu_hu_lexicon("This book is amazing and inspiring.")
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()
print(analyzer.polarity_scores("Awesome product! Loved it."))
from nltk.classify import NaiveBayesClassifier
train_data = [({'text': 'good'}, 'pos'), ({'text': 'bad'}, 'neg')]
classifier = NaiveBayesClassifier.train(train_data)
# Rule-based fix: Flip polarity if "not good" is detected
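# A minimal sketch of that rule; the tiny lexicon and scoring are illustrative,
# not an NLTK API.
positive_words = {"good", "great", "amazing"}

def simple_polarity(tokens):
    score = 0
    for i, word in enumerate(tokens):
        if word in positive_words:
            # Flip polarity when the preceding token is a negation
            score += -1 if i > 0 and tokens[i - 1] == "not" else 1
    return score

print(simple_polarity("this is not good".split()))  # -1
print(simple_polarity("this is good".split()))      # 1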
from nltk.classify.util import accuracy
accuracy(classifier, test_set)  # test_set: held-out labeled feature sets
# Retrain on medical, finance, or movie domains
def predict_sentiment(text):
    return analyzer.polarity_scores(text)["compound"]
# Example: Monitor live tweets for negative sentiment spikes
features = {"sentiment": 0.8, "has_emojis": True, "length": 25}
# Topics in news articles: Politics, Sports, Economy
from gensim.models.ldamodel import LdaModel # Train on bag-of-words corpus
from gensim.corpora.dictionary import Dictionary
texts = [["this", "is", "a", "test"], ["another", "test"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=2)
from gensim.models.coherencemodel import CoherenceModel
cm = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v')
print(cm.get_coherence())
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pyLDAvis.gensim_models
pyLDAvis.enable_notebook()
pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)
# Search: "government budget" returns articles on economy topics
# Try multiple num_topics and compare coherence
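# A minimal sketch: fit LDA for several topic counts and compare c_v coherence;
# the toy texts and candidate counts are illustrative, and coherence scores on
# tiny corpora are noisy.
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

texts = [["tax", "budget", "economy"], ["match", "goal", "team"], ["vote", "election", "policy"]]
dictionary = Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

for k in (2, 3, 4):
    candidate = LdaModel(corpus=corpus, id2word=dictionary, num_topics=k, random_state=0)
    score = CoherenceModel(model=candidate, texts=texts, dictionary=dictionary,
                           coherence='c_v').get_coherence()
    print(k, round(score, 3))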
# Build a topic dashboard from GitHub issue discussions
import nltk
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
synsets = wn.synsets("car") print(synsets[0].definition())
for syn in wn.synsets("dog"): print(syn.name(), syn.definition())
print(wn.synset("car.n.01").hypernyms()) print(wn.synset("car.n.01").hyponyms())
dog = wn.synset('dog.n.01')
cat = wn.synset('cat.n.01')
print(dog.path_similarity(cat))
# Simplified Lesk Algorithm (via nltk.wsd)
# Use networkx to visualize synset relations as graphs
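# A minimal sketch, assuming networkx is installed: build a directed graph of
# hypernym links for a synset (pass G to nx.draw with matplotlib to plot it).
import networkx as nx
from nltk.corpus import wordnet as wn

G = nx.DiGraph()

def add_hypernym_edges(synset, depth=3):
    if depth == 0:
        return
    for hyper in synset.hypernyms():
        G.add_edge(synset.name(), hyper.name())
        add_hypernym_edges(hyper, depth - 1)

add_hypernym_edges(wn.synset('dog.n.01'))
print(G.edges())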
# Expand search terms with synonyms: "buy" → "purchase", "acquire"
# Use WordNet to filter or group embedding clusters
# Build a synonym dictionary using synsets
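# A minimal sketch of a synonym dictionary; the helper name synonyms() is
# illustrative.
from nltk.corpus import wordnet as wn

def synonyms(word):
    # Gather lemma names across every synset of the word
    return sorted({lemma.name().replace("_", " ")
                   for syn in wn.synsets(word)
                   for lemma in syn.lemmas()} - {word})

print(synonyms("buy"))  # includes 'purchase'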
# Raw overview, no code needed
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
text = "NLTK is a library. It helps with NLP tasks. Summarization is one of them."
sentences = sent_tokenize(text)
words = word_tokenize(text.lower())
fdist = FreqDist(words)
scores = {sent: sum(fdist[word.lower()] for word in word_tokenize(sent)) for sent in sentences}
summary = sorted(scores, key=scores.get, reverse=True)[:1]
print(summary)
# Included above with FreqDist
# Use sumy or networkx for advanced implementation
# Already shown above
# Use rouge_score package for ROUGE
# Requires HuggingFace Transformers, not NLTK directly
# No code
# Preprocessing pipeline integration
# Custom NLTK scripts or pretrained models
text = "LOL 😆 this is #awesome! Visit http://example.com" clean = re.sub(r"http\S+|#\S+|@\S+", "", text) print(clean)
from nltk.tokenize import TweetTokenizer
tokenizer = TweetTokenizer()
print(tokenizer.tokenize("Check this out #AI #NLP @user"))
import re text = "Gr8!!! :) Check www.nltk.org" print(re.sub(r"http\S+|www\S+|[^a-zA-Z ]", "", text))
slang = {"gr8": "great", "u": "you"} text = "gr8 job u rock" print(" ".join([slang.get(word, word) for word in text.split()]))
import emoji
text = "I love python 😍"
print(emoji.demojize(text))
from langdetect import detect
print(detect("Bonjour, comment ça va?"))
# Use polyglot or spaCy multilingual models
# Use sklearn.feature_extraction.text.TfidfVectorizer
# Example: handle legal case codes or chemical names
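# A minimal sketch, assuming a custom pattern is acceptable: keep chemical
# formulas and case codes as single tokens; the regex is illustrative only.
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r"[A-Z][a-z]?\d+(?:[A-Z][a-z]?\d+)*|\d{4}-[A-Z]{2}-\d+|\w+")
print(tokenizer.tokenize("Glucose C6H12O6 appears in case 2021-CV-0042"))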
# Clean, normalize, then apply sentiment classifier
from nltk.tokenize import word_tokenize text = "Deep learning meets NLP!" tokens = word_tokenize(text.lower()) print(tokens)
# NLTK + Gensim for embeddings
from gensim.models import Word2Vec
model = Word2Vec([tokens], min_count=1)
print(model.wv["deep"])
# Use tokenizer output, then convert the word vectors to a PyTorch tensor
import torch
import numpy as np
tensor = torch.from_numpy(np.array([model.wv.get_vector(t) for t in tokens]))
pos = nltk.pos_tag(tokens) print(pos)
# Use HuggingFace Transformers with NLTK-preprocessed input
# Load GloVe manually or via gensim
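# A sketch using gensim's downloader API (the first call fetches the vectors,
# roughly 66 MB for the 50-dimensional model).
import gensim.downloader as api

glove = api.load("glove-wiki-gigaword-50")
print(glove.most_similar("language", topn=3))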
model = Word2Vec([tokens], vector_size=50, window=2, min_count=1)
# Tokenize + POS tag + Feed into neural net
# Use sklearn.metrics or TensorBoard
# Project-specific architecture
intents = {"greet": ["hello", "hi"], "bye": ["bye", "goodbye"]}
user_input = "hello" if "hello" in user_input.lower(): print("Hi there!")
import re
if re.search(r"hi|hello", user_input.lower()):
    print("Greetings!")
from nltk.chat.util import Chat, reflections
pairs = [["hi", ["hello", "hi there!"]]]
chatbot = Chat(pairs, reflections)
chatbot.converse()
from nltk import word_tokenize, pos_tag
print(pos_tag(word_tokenize("How are you?")))
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print(stemmer.stem("running"))
context = {"last_question": "greeting"}
# Use NaiveBayesClassifier with labeled intents
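# A minimal sketch of intent classification; the bag-of-words feature helper
# and toy training pairs are illustrative.
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize

def intent_features(sentence):
    return {f"contains({w})": True for w in word_tokenize(sentence.lower())}

train = [(intent_features(s), label) for s, label in
         [("hello there", "greet"), ("hi bot", "greet"),
          ("bye for now", "farewell"), ("goodbye", "farewell")]]
intent_clf = NaiveBayesClassifier.train(train)
print(intent_clf.classify(intent_features("hello friend")))  # 'greet'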
# Flask API wrapper around chatbot
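# A minimal sketch of a Flask wrapper; the /chat route and respond() helper are
# hypothetical names, not part of NLTK.
from flask import Flask, request, jsonify

app = Flask(__name__)

def respond(message):
    return "Hi! How can I help you?" if "hello" in message.lower() else "Tell me more."

@app.route("/chat", methods=["POST"])
def chat():
    message = request.get_json().get("message", "")
    return jsonify({"reply": respond(message)})

if __name__ == "__main__":
    app.run(debug=True)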
# Create a travel assistant or FAQ bot using rules + ML
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(["text mining with nltk", "nltk information retrieval"])
from nltk.tokenize import word_tokenize
index = {"doc1": word_tokenize("This is NLTK IR")}
from nltk import ne_chunk, pos_tag
print(ne_chunk(pos_tag(word_tokenize("Steve Jobs founded Apple"))))
# Use SpaCy or dependency parsing to extract subject-verb-object triples
import re
emails = re.findall(r"\b[A-Za-z0-9._%+-]+@[\w.-]+\.\w+\b", text)
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(X[0], X)
precision = TP / (TP + FP)  # TP, FP, FN: counts from the confusion matrix
recall = TP / (TP + FN)
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("running"))
# Example: Build a keyword-based news article retriever
# Use Gensim for topic modeling + NLTK preprocessing
# Train sentiment classifier with NaiveBayesClassifier
from nltk.sentiment import SentimentIntensityAnalyzer sia = SentimentIntensityAnalyzer() print(sia.polarity_scores("I love this product."))
# Use ne_chunk + Flask for UI
# Score sentences and select top-N as summary
# Use regex + tokenizers for intelligent response
# Preprocess text with NLTK, run LDA model
# Combine regex + POS + NER
# Use langdetect + Google Translate
# Preprocess tweets, analyze with VADER
# Flask + Gunicorn + NLTK model = production ready