# Transformer architecture uses self-attention layers and feed-forward networks
History and evolution
# Original Transformer paper: https://arxiv.org/abs/1706.03762
Attention mechanism basics
# Attention(Q,K,V) = softmax(QK^T / sqrt(d_k)) * V
Self-attention explained
# Enables modeling dependencies regardless of distance in input
Encoder vs decoder
# BERT: encoder-only; GPT: decoder-only; T5: encoder-decoder
Applications of transformers
# Transformer models achieve SOTA in many domains
Transformers vs RNNs/CNNs
# RNNs process sequentially; transformers process all tokens simultaneously
Popular transformer models overview
# Hugging Face hosts hundreds of pretrained transformer models
Use cases in NLP and beyond
# Vision Transformer (ViT) applies transformer concepts to images
Transformer limitations
# Research ongoing on efficient transformer variants (e.g., Longformer)
# https://huggingface.co/
Hugging Face ecosystem overview
# Transformers, Datasets, Tokenizers, Model Hub, Spaces
Installing Transformers library
pip install transformers
Basic API usage
from transformers import pipeline
classifier = pipeline("sentiment-analysis")
print(classifier("I love transformers!"))
Tokenizers overview
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
Pretrained models availability
# Models include BERT, GPT-2, T5, RoBERTa, etc.
Model hub navigation
# https://huggingface.co/models
Simple text classification example
from transformers import pipeline
classifier = pipeline("sentiment-analysis")
print(classifier("Transformers are amazing!"))
Text generation demo
generator = pipeline("text-generation", model="gpt2")
print(generator("The future of AI is", max_length=30))
Using pipelines
# pipeline(task_name) loads model and tokenizer automatically
# "Transformers are great" → ["Transformers", "are", "great"]
Wordpiece vs Byte-Pair Encoding (BPE)
# WordPiece used in BERT; BPE in GPT-2
SentencePiece tokenizer
# Developed by Google; used by T5, ALBERT, and XLNet tokenizers (mBERT uses WordPiece)
Using pretrained tokenizers
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
Tokenizer customization
tokenizer.add_special_tokens({'additional_special_tokens': ['']})
Special tokens explained
# Used to mark sentence boundaries or masked words
Padding and truncation
encoded = tokenizer("Hello", padding='max_length', max_length=10, truncation=True)
Tokenizer outputs (input IDs, attention masks)
# input_ids = [101, 7592, 102]; attention_mask = [1, 1, 1]
Handling multilingual tokenization
# mBERT tokenizer supports 100+ languages
Tokenizer performance tips
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
from transformers import BertModel, BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
GPT architecture
from transformers import GPT2LMHeadModel, GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
RoBERTa improvements
from transformers import RobertaModel, RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
DistilBERT for efficiency
from transformers import DistilBertModel, DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
T5 and sequence-to-sequence
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
XLNet and permutation-based models
from transformers import XLNetModel, XLNetTokenizer
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased')
ALBERT and parameter sharing
from transformers import AlbertModel, AlbertTokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained('albert-base-v2')
Longformer for long sequences
from transformers import LongformerModel, LongformerTokenizer
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
Vision Transformers (ViT)
from transformers import ViTModel, ViTFeatureExtractor
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
model = ViTModel.from_pretrained('google/vit-base-patch16-224')
Choosing the right model
# Consider trade-offs before selecting architecture
# Fine-tune BERT on classification task with labeled dataset
Setting up datasets
from datasets import load_dataset
dataset = load_dataset("glue", "mrpc")
Preparing inputs for training
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
labels = torch.tensor(labels)
Fine-tuning for classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
outputs = model(**inputs, labels=labels)
loss = outputs.loss
loss.backward()
Fine-tuning for question answering
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
outputs = model(**inputs)
Fine-tuning for token classification
model = BertForTokenClassification.from_pretrained('bert-base-uncased')
outputs = model(**inputs)
Using Trainer API
from transformers import Trainer, TrainingArguments
trainer = Trainer(model=model, args=training_args, train_dataset=train_ds)
trainer.train()
Training on GPUs
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
Monitoring training metrics
from transformers import TrainerCallback
# Use callbacks for logging
Saving and loading fine-tuned models
model.save_pretrained("./fine_tuned_model")
model = BertForSequenceClassification.from_pretrained("./fine_tuned_model")
from transformers import pipeline
classifier = pipeline('sentiment-analysis')
Tokenizing input text
inputs = tokenizer("Hello world!", return_tensors="pt")
Running inference
outputs = model(**inputs)
Using pipelines for common tasks
summarizer = pipeline("summarization")
summarizer("Long text here...")
Batch inference
batch_inputs = tokenizer(list_of_texts, padding=True, return_tensors="pt")
outputs = model(**batch_inputs)
Handling model outputs
predictions = torch.argmax(outputs.logits, dim=-1)
Generating text with GPT-based models
generated = model.generate(inputs.input_ids, max_length=50)
Extracting embeddings
embeddings = model(**inputs).last_hidden_state[:, 0, :]
Zero-shot classification
zero_shot = pipeline("zero-shot-classification")
zero_shot("Text to classify", candidate_labels=["label1", "label2"])
Multi-task inference
# Example: T5 can perform summarization and translation with different prompts
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3) # multi-class
Dataset preparation
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(["sample text"], padding=True, truncation=True, return_tensors="pt")
Model selection for classification
model = BertForSequenceClassification.from_pretrained('distilbert-base-uncased')
Fine-tuning tips
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
Evaluation metrics (accuracy, F1, etc.)
from sklearn.metrics import f1_score
f1 = f1_score(true_labels, preds, average='weighted')
Handling imbalanced datasets
loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
Using classification pipelines
from transformers import pipeline
classifier = pipeline("text-classification")
result = classifier("This is a great example!")
Exporting classification models
model.save_pretrained('./my_model')
tokenizer.save_pretrained('./my_model')
Deployment considerations
# Export to ONNX for deployment
!python -m transformers.onnx --model=./my_model onnx/
Real-world applications
# Example: Sentiment analysis pipeline
sentiment = classifier("I love this product!")
from transformers import pipeline
qa_pipeline = pipeline("question-answering")
Dataset formats (SQuAD, etc.)
# Example SQuAD format
{"context": "...", "question": "...", "answers": {"text": "...", "answer_start": 42}}
Model architectures for QA
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
Input formatting for QA
inputs = tokenizer(question, context, return_tensors='pt')
Fine-tuning for extractive QA
loss = (start_loss + end_loss) / 2
Evaluation metrics (EM, F1)
# Calculate EM and F1 scores
Handling multiple answers
# Use list of possible answers in evaluation
Deploying QA systems
from flask import Flask, request, jsonify

app = Flask(__name__)


@app.route('/qa', methods=['POST'])  # must accept POST: the handler reads a JSON body
def answer():
    """Answer a question from a JSON payload {'q': question, 'c': context}.

    Returns the QA pipeline's result dict (answer, score, start, end) as JSON.
    Raises a 400 via get_json() if the body is not valid JSON.
    """
    data = request.get_json()
    result = qa_pipeline(question=data['q'], context=data['c'])
    # jsonify sets the application/json content type explicitly.
    return jsonify(result)
Performance optimization
from torch.quantization import quantize_dynamic
quantized_model = quantize_dynamic(model)
Case studies
# Example chatbot integration
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english')
Dataset annotation for token classification
# Example BIO tags: O, B-PER, I-PER, B-LOC, I-LOC
Model setup for NER
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
Fine-tuning strategies
outputs = model(**inputs, labels=labels)
loss = outputs.loss
Evaluating token-level predictions
from seqeval.metrics import classification_report
print(classification_report(true_labels, pred_labels))
Handling overlapping entities
# Example: Use span-based models or layered tagging
Multi-lingual NER
model = AutoModelForTokenClassification.from_pretrained('xlm-roberta-base')
Visualization of results
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup.")
Transfer learning for token classification
model = AutoModelForTokenClassification.from_pretrained('bert-base-cased')
Applications in industry
# Example: Extract entities from financial reports
from transformers import GPT2LMHeadModel, GPT2Tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
Text generation methods (sampling, beam search)
outputs = model.generate(input_ids, do_sample=True, max_length=50)
outputs_beam = model.generate(input_ids, num_beams=5, max_length=50)
Controlling generation length and diversity
outputs = model.generate(input_ids, max_length=100, temperature=0.7, top_p=0.9)
Fine-tuning GPT and T5 for generation
from transformers import Trainer, TrainingArguments
# Set up dataset, model and train with Trainer API
Summarization approaches
from transformers import pipeline
summarizer = pipeline("summarization")
summarizer(text, max_length=150)
Evaluation metrics (ROUGE, BLEU)
from rouge import Rouge
rouge = Rouge()
scores = rouge.get_scores(hypothesis, reference)
Handling hallucination in generation
# Use knowledge-augmented models or factuality classifiers
Conditional generation
input_text = "Summarize: " + article_text
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(**inputs)
Interactive text generation demos
# Streamlit or Gradio demos for text generation UI
Ethical considerations
# Implement content filters and user disclaimers
import matplotlib.pyplot as plt

# Attentions are only returned when the model is called with
# output_attentions=True (or configured with it); otherwise the output
# tuple's last element is NOT the attention weights.
outputs = model(input_ids, output_attentions=True)
attentions = outputs.attentions  # tuple of (batch, heads, seq, seq), one per layer
# Plot layer 0, batch element 0, head 0 — matshow needs a 2-D array.
plt.matshow(attentions[0][0, 0].detach().numpy())
plt.show()
Visualizing attention weights
# Use BertViz or similar libraries for visualization
Probing model internals
# Use probing classifiers on hidden states
Using Captum with transformers
from captum.attr import IntegratedGradients
ig = IntegratedGradients(model)
attr = ig.attribute(inputs, target=target_label)
Explaining classification decisions
# Generate token importance scores and visualize them
Feature importance analysis
# SHAP values or attention-based scoring for features
Detecting biases in models
# Run bias detection suites on datasets and predictions
Model fairness evaluation
# Compute fairness metrics like demographic parity or equal opportunity
Tools and libraries for explainability
# pip install captum bertviz eli5
Challenges in interpretability
# Stay updated with latest research and tool improvements
# Freeze every parameter belonging to the first encoder layer so it is
# excluded from gradient updates during fine-tuning.
for param_name, parameter in model.named_parameters():
    if "layer.0" in param_name:
        parameter.requires_grad = False
Differential learning rates
optimizer = torch.optim.Adam([
{'params': model.base.parameters(), 'lr': 1e-5},
{'params': model.classifier.parameters(), 'lr': 1e-4}
])
Adapters and parameter-efficient tuning
# Use adapter-transformers library to add adapters
Mixed precision training
from torch.cuda.amp import autocast, GradScaler

# Run the forward pass under autocast so eligible ops execute in half
# precision, reducing memory use and speeding up matrix multiplies.
with autocast():
    output = model(input)
Gradient accumulation
# Accumulate gradients across several mini-batches, stepping the optimizer
# only every `accumulation_steps` batches to simulate a larger batch size.
optimizer.zero_grad()
for step, batch in enumerate(dataloader):
    batch_loss = model(batch)
    batch_loss.backward()
    is_update_step = (step + 1) % accumulation_steps == 0
    if is_update_step:
        optimizer.step()
        optimizer.zero_grad()
Early stopping and checkpoints
from transformers import EarlyStoppingCallback

# Bug fix: callbacks must be passed as *instances*, not classes, and Trainer
# needs at least the model, args and dataset to run. Note early stopping also
# requires TrainingArguments(load_best_model_at_end=True, ...) to take effect.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
Using callbacks
from transformers import TrainerCallback


class MyCallback(TrainerCallback):
    """Minimal Trainer callback that logs after each optimizer step."""

    def on_step_end(self, args, state, control, **kwargs):
        # Invoked by Trainer once per training step.
        print("Step finished")
Distributed training with Hugging Face
# Use torch.distributed.launch or Accelerate library
Hyperparameter search
import optuna


def objective(trial):
    """Optuna objective: sample a learning rate, train, return validation loss.

    Uses suggest_float(log=True) — suggest_loguniform is deprecated and was
    removed in Optuna 3.x.
    """
    lr = trial.suggest_float('lr', 1e-6, 1e-3, log=True)
    # Train the model with this learning rate and compute the validation loss.
    val_loss = 0.0  # TODO: replace with the actual validation loss — the
    # original referenced an undefined `val_loss` (NameError at runtime).
    return val_loss


study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
Debugging training
# Use tensorboard and print gradient norms during training
# Transformers scale as O(n^2) where n is sequence length
Longformer and BigBird models
# Longformer uses sliding window attention to limit context size
Sliding window attention
# Each token attends to neighbors within a window, not entire sequence
Sparse attention mechanisms
# Sparse attention masks implemented as sparse matrices
Chunking inputs
# Process 512-token chunks with overlap to maintain context
Efficient tokenization strategies
# Tokenize once and cache token IDs for reuse
Memory optimization
# Enable torch.cuda.amp and gradient checkpointing for efficiency
Training with long sequences
# Use smaller batches and accumulate gradients over steps
Applications for long documents
# Summarize entire books or analyze multi-page contracts
Performance trade-offs
# Evaluate accuracy vs speed when choosing attention method
# mBERT handles 104 languages with one model
XLM and XLM-R models
# XLM-R trained on 100+ languages with massive corpora
Cross-lingual transfer learning
# Train on English, infer on related languages zero-shot
Tokenization challenges in multilingual data
# Use SentencePiece or WordPiece tokenizers with multilingual corpora
Dataset preparation for multilingual tasks
# Use parallel datasets like OPUS or multilingual QA sets
Fine-tuning multilingual models
# Load pretrained checkpoints and train with mixed-language batches
Zero-shot cross-lingual classification
# Evaluate on unseen languages using mBERT zero-shot transfer
Evaluating multilingual models
# Evaluate on datasets covering multiple languages and tasks
Applications in global NLP
# Use models for multi-language chatbots or sentiment analysis
Language adaptation techniques
# Insert adapter layers or expand tokenizer vocab for target language
# Split image into patches → flatten → linear embedding → transformer encoder
Image classification with transformers
# Use positional embeddings to encode patch locations
Combining vision and text (CLIP)
# Encode images and captions → compute cosine similarity for retrieval
Multimodal transformers overview
# Cross-attention layers combine embeddings from both modalities
Fine-tuning for image captioning
# Train decoder to generate text conditioned on image features
Visual question answering
# Input image and question tokens → output answer classification
Multimodal data preprocessing
# Preprocess images and tokenize text consistently
Evaluation metrics for multimodal models
# Compute metrics on generated captions or answers
Deploying multimodal systems
# Use ONNX Runtime or TensorRT for efficient deployment
Research trends
# Explore foundation models like Flamingo and GPT-4 multimodal
import torch
traced_model = torch.jit.trace(model, example_input)
traced_model.save("model.pt")
import torch.onnx
torch.onnx.export(model, example_input, "model.onnx")
from fastapi import FastAPI
app = FastAPI()
@app.post("/predict/")
def predict(input: InputData):
return model(input.data)
# Dockerfile example:
#   FROM python:3.8
#   COPY . /app
#   RUN pip install -r requirements.txt
#   CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"]
# Use AWS SageMaker SDK to deploy models programmatically
# Example: Use ONNX Runtime with quantized models
# Define deployment with replicas in Kubernetes YAML
# Use model distillation or pruning for edge devices
# Integrate Prometheus and Grafana for metrics
# Use GitHub Actions or Jenkins for CI/CD
from transformers import AutoModel
model = AutoModel.from_pretrained("bert-base-uncased")
# Login and upload:
#   huggingface-cli login
#   git lfs install
#   git clone https://huggingface.co/username/modelname
#   git add . && git commit -m "Add model" && git push
# Use Git tags and branches on the Hub repo
from datasets import load_dataset
dataset = load_dataset("imdb")
# Build and deploy Streamlit or Gradio demos on Spaces
# Invite collaborators on the Hub repo
# Add LICENSE file with MIT, Apache, or other license
# Search and load popular models
from transformers import pipeline
nlp = pipeline("sentiment-analysis")
result = nlp("I love Hugging Face!")
# Include README.md with instructions and citations
from transformers import BertConfig
config = BertConfig.from_pretrained("bert-base-uncased")
config.hidden_size = 512
from tokenizers import ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(["data.txt"], vocab_size=30000)
from transformers import BertModel
model = BertModel(config)
model.encoder.layer[0].attention.self.num_attention_heads = 8
# Implement custom attention module subclassing nn.Module
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model = BertModel(config)
model.train()
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(output_dir="./results")
trainer = Trainer(model=model, args=training_args, train_dataset=train_ds)
trainer.train()
# Use torch.autograd.gradcheck for gradient validation
# Upload via huggingface-cli or git
# Custom model applied to medical NLP or edge devices
# arXiv.org and conferences like NeurIPS are primary sources
# Example: use Linformer library for efficiency
# Implement sparse attention masks in custom models
# Combine transformer embeddings with CNN outputs
# Use Hugging Face Trainer for MLM tasks
# Create prompts to guide model responses dynamically
# Use GPT-style zero-shot with proper prompts
# Vision transformer (ViT) usage in image classification
# Evaluate models for bias and fairness metrics
# Explore explainability techniques for transformers
# Job boards often list transformer-related roles
# Host projects on GitHub and personal sites
# Submit pull requests and issues
# Join challenges involving text classification or generation
# Write articles on Medium or arXiv preprints
# Attend ACL, EMNLP, NeurIPS conferences
# Coursera, edX, and Udacity offer transformer courses
# Join AI groups and discussions
# Use Upwork or Freelancer platforms
# Use Twitter, arXiv Sanity, and newsletters