# Transformer architecture uses self-attention layers and feed-forward networks
History and evolution
# Original Transformer paper: https://arxiv.org/abs/1706.03762
Attention mechanism basics
# Attention(Q, K, V) = softmax(QK^T / sqrt(d_k)) * V
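# A minimal PyTorch sketch of the formula above (function name and tensor shapes are illustrative assumptions, not from the paper):
import torch
import torch.nn.functional as F

def scaled_dot_product_attention(Q, K, V):
    # Q, K, V: (batch, seq_len, d_k)
    d_k = Q.size(-1)
    scores = Q @ K.transpose(-2, -1) / d_k ** 0.5  # (batch, seq_len, seq_len)
    weights = F.softmax(scores, dim=-1)            # each query's weights sum to 1
    return weights @ V                             # weighted sum of value vectors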
Self-attention explained
# Enables modeling dependencies regardless of distance in the input
Encoder vs decoder
# BERT: encoder-only; GPT: decoder-only; T5: encoder-decoder
Applications of transformers
# Transformer models achieve state-of-the-art results in many domains
Transformers vs RNNs/CNNs
# RNNs process sequentially; transformers process all tokens simultaneously
Popular transformer models overview
# Hugging Face hosts thousands of pretrained transformer models
Use cases in NLP and beyond
# Vision Transformer (ViT) applies transformer concepts to images
Transformer limitations
# Research ongoing on efficient transformer variants (e.g., Longformer)
# https://huggingface.co/
Hugging Face ecosystem overview
# Transformers, Datasets, Tokenizers, Model Hub, Spaces
Installing Transformers library
pip install transformers
Basic API usage
from transformers import pipeline
classifier = pipeline("sentiment-analysis")
print(classifier("I love transformers!"))
Tokenizers overview
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
Pretrained models availability
# Models include BERT, GPT-2, T5, RoBERTa, etc.
Model hub navigation
# https://huggingface.co/models
Simple text classification example
from transformers import pipeline
classifier = pipeline("sentiment-analysis")
print(classifier("Transformers are amazing!"))
Text generation demo
generator = pipeline("text-generation", model="gpt2")
print(generator("The future of AI is", max_length=30))
Using pipelines
# pipeline(task_name) loads model and tokenizer automatically
# "Transformers are great" → ["Transformers", "are", "great"]Wordpiece vs Byte-Pair Encoding (BPE)
Wordpiece vs Byte-Pair Encoding (BPE)
# WordPiece used in BERT; BPE in GPT-2
SentencePiece tokenizer
# Developed by Google; used for T5, ALBERT, and XLNet tokenization
Using pretrained tokenizers
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
Tokenizer customization
tokenizer.add_special_tokens({'additional_special_tokens': ['[CUSTOM]']})  # '[CUSTOM]' is a placeholder example token
Special tokens explained
# Used to mark sentence boundaries or masked words
Padding and truncation
encoded = tokenizer("Hello", padding='max_length', max_length=10, truncation=True)
Tokenizer outputs (input IDs, attention masks)
# input_ids = [101, 7592, 102]; attention_mask = [1, 1, 1]
Handling multilingual tokenization
# mBERT tokenizer supports 100+ languages
Tokenizer performance tips
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", use_fast=True)
from transformers import BertModel, BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
GPT architecture
from transformers import GPT2LMHeadModel, GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
RoBERTa improvements
from transformers import RobertaModel, RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')
DistilBERT for efficiency
from transformers import DistilBertModel, DistilBertTokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')
T5 and sequence-to-sequence
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')
XLNet and permutation-based models
from transformers import XLNetModel, XLNetTokenizer
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
model = XLNetModel.from_pretrained('xlnet-base-cased')
ALBERT and parameter sharing
from transformers import AlbertModel, AlbertTokenizer
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertModel.from_pretrained('albert-base-v2')
Longformer for long sequences
from transformers import LongformerModel, LongformerTokenizer
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
Vision Transformers (ViT)
from transformers import ViTModel, ViTFeatureExtractor
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
model = ViTModel.from_pretrained('google/vit-base-patch16-224')
Choosing the right model
# Consider trade-offs (accuracy vs. speed, memory, max sequence length) before selecting an architecture
# Fine-tune BERT on a classification task with a labeled dataset
Setting up datasets
from datasets import load_dataset
dataset = load_dataset("glue", "mrpc")
Preparing inputs for training
import torch
inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
labels = torch.tensor(labels)
Fine-tuning for classification
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
outputs = model(**inputs, labels=labels)
loss = outputs.loss
loss.backward()
Fine-tuning for question answering
from transformers import BertForQuestionAnswering
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
outputs = model(**inputs)
Fine-tuning for token classification
from transformers import BertForTokenClassification
model = BertForTokenClassification.from_pretrained('bert-base-uncased')
outputs = model(**inputs)
Using Trainer API
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(output_dir="./results")
trainer = Trainer(model=model, args=training_args, train_dataset=train_ds)
trainer.train()
Training on GPUs
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
Monitoring training metrics
from transformers import TrainerCallback  # use callbacks for logging
Saving and loading fine-tuned models
model.save_pretrained("./fine_tuned_model")
model = BertForSequenceClassification.from_pretrained("./fine_tuned_model")
from transformers import pipeline
classifier = pipeline('sentiment-analysis')
Tokenizing input text
inputs = tokenizer("Hello world!", return_tensors="pt")
Running inference
outputs = model(**inputs)
Using pipelines for common tasks
summarizer = pipeline("summarization")
summarizer("Long text here...")
Batch inference
batch_inputs = tokenizer(list_of_texts, padding=True, return_tensors="pt")
outputs = model(**batch_inputs)
Handling model outputs
predictions = torch.argmax(outputs.logits, dim=-1)
Generating text with GPT-based models
generated = model.generate(inputs.input_ids, max_length=50)
Extracting embeddings
embeddings = model(**inputs).last_hidden_state[:, 0, :]
Zero-shot classification
zero_shot = pipeline("zero-shot-classification")
zero_shot("Text to classify", candidate_labels=["label1", "label2"])
Multi-task inference
# Example: T5 can perform summarization and translation with different prompts
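# A short sketch of T5-style task prefixes (the checkpoint and prompt texts are illustrative choices; t5-small is the same checkpoint loaded earlier):
from transformers import T5Tokenizer, T5ForConditionalGeneration
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")
for prompt in ["summarize: The article text goes here.",
               "translate English to German: How are you?"]:
    ids = tokenizer(prompt, return_tensors="pt").input_ids
    out = model.generate(ids, max_length=40)
    print(tokenizer.decode(out[0], skip_special_tokens=True))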
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # multi-class
Dataset preparation
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
inputs = tokenizer(["sample text"], padding=True, truncation=True, return_tensors="pt")
Model selection for classification
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased')  # use the Auto class so the DistilBERT checkpoint loads the correct architecture
Fine-tuning tips
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
Evaluation metrics (accuracy, F1, etc.)
from sklearn.metrics import f1_score
f1 = f1_score(true_labels, preds, average='weighted')
Handling imbalanced datasets
loss_fct = torch.nn.CrossEntropyLoss(weight=class_weights)
Using classification pipelines
from transformers import pipeline
classifier = pipeline("text-classification")
result = classifier("This is a great example!")
Exporting classification models
model.save_pretrained('./my_model')
tokenizer.save_pretrained('./my_model')
Deployment considerations
# Export to ONNX for deployment, e.g. with torch.onnx.export (see the deployment chapter) or the optimum exporter
Real-world applications
# Example: sentiment analysis pipeline
sentiment = classifier("I love this product!")
from transformers import pipeline
qa_pipeline = pipeline("question-answering")
Dataset formats (SQuAD, etc.)
# Example SQuAD format: {"context": "...", "question": "...", "answers": {"text": "...", "answer_start": 42}}
Model architectures for QA
from transformers import BertForQuestionAnswering
model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
Input formatting for QA
inputs = tokenizer(question, context, return_tensors='pt')
Fine-tuning for extractive QA
loss = (start_loss + end_loss) / 2  # average the start- and end-position cross-entropy losses
Evaluation metrics (EM, F1)
# Calculate EM (exact match) and token-level F1 scores
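# A minimal sketch of EM and token-overlap F1 for extractive QA (simplified: no article/punctuation normalization as in the official SQuAD script):
def exact_match(prediction, reference):
    return int(prediction.strip().lower() == reference.strip().lower())

def token_f1(prediction, reference):
    pred_tokens = prediction.lower().split()
    ref_tokens = reference.lower().split()
    if not pred_tokens or not ref_tokens:
        return float(pred_tokens == ref_tokens)
    common = sum(min(pred_tokens.count(t), ref_tokens.count(t)) for t in set(pred_tokens))
    if common == 0:
        return 0.0
    precision = common / len(pred_tokens)
    recall = common / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)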
Handling multiple answers
# Compare the prediction against a list of acceptable gold answers and take the best score
Deploying QA systems
from flask import Flask, request, jsonify
app = Flask(__name__)

@app.route('/qa', methods=['POST'])
def answer():
    data = request.json
    result = qa_pipeline(question=data['q'], context=data['c'])
    return jsonify(result)
Performance optimization
import torch
from torch.quantization import quantize_dynamic
quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
Case studies
# Example chatbot integration
from transformers import AutoModelForTokenClassification
model = AutoModelForTokenClassification.from_pretrained('dbmdz/bert-large-cased-finetuned-conll03-english')
Dataset annotation for token classification
# Example BIO tags: O, B-PER, I-PER, B-LOC, I-LOC
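# A sketch of aligning word-level BIO labels with wordpiece tokens (the sentence and labels are made-up; -100 follows the common convention for positions ignored by the loss):
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
words = ["John", "lives", "in", "London"]
word_labels = ["B-PER", "O", "O", "B-LOC"]

encoding = tokenizer(words, is_split_into_words=True)
aligned = []
for word_id in encoding.word_ids(batch_index=0):
    # special tokens ([CLS], [SEP]) get -100 so they are ignored by the loss
    aligned.append(-100 if word_id is None else word_labels[word_id])
print(aligned)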
Model setup for NER
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
Fine-tuning strategies
outputs = model(**inputs, labels=labels)
loss = outputs.loss
Evaluating token-level predictions
from seqeval.metrics import classification_report
print(classification_report(true_labels, pred_labels))
Handling overlapping entities
# Example: use span-based models or layered tagging
Multi-lingual NER
model = AutoModelForTokenClassification.from_pretrained('xlm-roberta-base')
Visualization of results
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying U.K. startup.")
displacy.render(doc, style="ent")  # highlight recognized entities
Transfer learning for token classification
model = AutoModelForTokenClassification.from_pretrained('bert-base-cased')
Applications in industry
# Example: extract entities from financial reports
from transformers import GPT2LMHeadModel, GPT2Tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
Text generation methods (sampling, beam search)
outputs = model.generate(input_ids, do_sample=True, max_length=50)
outputs_beam = model.generate(input_ids, num_beams=5, max_length=50)
Controlling generation length and diversity
outputs = model.generate(input_ids, do_sample=True, max_length=100, temperature=0.7, top_p=0.9)
Fine-tuning GPT and T5 for generation
from transformers import Trainer, TrainingArguments
# Set up the dataset and model, then train with the Trainer API
Summarization approaches
from transformers import pipeline
summarizer = pipeline("summarization")
summarizer(text, max_length=150)
Evaluation metrics (ROUGE, BLEU)
from rouge import Rouge
rouge = Rouge()
scores = rouge.get_scores(hypothesis, reference)
Handling hallucination in generation
# Use knowledge-augmented models or factuality classifiers
Conditional generation
input_text = "summarize: " + article_text
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(**inputs)
Interactive text generation demos
# Streamlit or Gradio demos for a text generation UI
Ethical considerations
# Implement content filters and user disclaimers
import matplotlib.pyplot as plt
outputs = model(input_ids, output_attentions=True)
attentions = outputs.attentions  # tuple of (batch, heads, seq_len, seq_len), one per layer
plt.matshow(attentions[0][0, 0].detach().numpy())  # layer 0, head 0
plt.show()
Visualizing attention weights
# Use BertViz or similar libraries for visualization
Probing model internals
# Use probing classifiers on hidden states
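# A minimal probing sketch: fit a simple classifier on frozen [CLS] hidden states (the texts, labels, and probed property are placeholders):
import torch
from sklearn.linear_model import LogisticRegression
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoder = AutoModel.from_pretrained("bert-base-uncased")

texts = ["short sentence", "a much longer example sentence here"]
labels = [0, 1]  # e.g. "is the sentence long?"

with torch.no_grad():
    enc = tokenizer(texts, padding=True, return_tensors="pt")
    cls_states = encoder(**enc).last_hidden_state[:, 0, :]  # frozen features

probe = LogisticRegression(max_iter=1000).fit(cls_states.numpy(), labels)
print(probe.score(cls_states.numpy(), labels))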
Using Captum with transformers
from captum.attr import IntegratedGradients
ig = IntegratedGradients(model)
attr = ig.attribute(inputs, target=target_label)
# For integer token IDs, attribute over the embedding layer instead (e.g. Captum's LayerIntegratedGradients)
Explaining classification decisions
# Generate token importance scores and visualize them
Feature importance analysis
# SHAP values or attention-based scoring for features
Detecting biases in models
# Run bias detection suites on datasets and predictions
Model fairness evaluation
# Compute fairness metrics like demographic parity or equal opportunity
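# A small numpy sketch of the demographic parity difference (the prediction and group arrays are made-up placeholders):
import numpy as np

preds = np.array([1, 0, 1, 1, 0, 1])   # model's positive/negative decisions
group = np.array([0, 0, 0, 1, 1, 1])   # protected attribute (two demographic groups)

rate_g0 = preds[group == 0].mean()     # positive rate for group 0
rate_g1 = preds[group == 1].mean()     # positive rate for group 1
print("demographic parity difference:", abs(rate_g0 - rate_g1))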
Tools and libraries for explainability
# pip install captum bertviz eli5
Challenges in interpretability
# Stay updated with the latest research and tool improvements
for name, param in model.named_parameters():
    if "layer.0" in name:
        param.requires_grad = False  # freeze the first encoder layer
Differential learning rates
optimizer = torch.optim.Adam([
    {'params': model.base_model.parameters(), 'lr': 1e-5},
    {'params': model.classifier.parameters(), 'lr': 1e-4},
])
Adapters and parameter-efficient tuning
# Use the adapter-transformers library (or LoRA via the peft library) to train only small added modules
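# A hedged LoRA sketch with the peft library, as one parameter-efficient alternative (target module names assume a BERT-style model; r and alpha are illustrative choices):
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForSequenceClassification

base = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
lora_config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.1, target_modules=["query", "value"])
peft_model = get_peft_model(base, lora_config)
peft_model.print_trainable_parameters()  # only the small LoRA matrices are trainable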
Mixed precision training
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()
with autocast():
    output = model(input)
Gradient accumulation
optimizer.zero_grad()
for i, batch in enumerate(dataloader):
    loss = model(**batch).loss / accumulation_steps  # scale so gradients average over accumulated steps
    loss.backward()
    if (i + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
Early stopping and checkpoints
from transformers import EarlyStoppingCallback
trainer = Trainer(model=model, args=training_args, train_dataset=train_ds,
                  callbacks=[EarlyStoppingCallback(early_stopping_patience=3)])
Using callbacks
from transformers import TrainerCallback

class MyCallback(TrainerCallback):
    def on_step_end(self, args, state, control, **kwargs):
        print("Step finished")
Distributed training with Hugging Face
# Use torch.distributed.launch (or torchrun) or the Accelerate library
Hyperparameter search
import optuna

def objective(trial):
    lr = trial.suggest_loguniform('lr', 1e-6, 1e-3)
    # Train the model with this learning rate and return the validation loss
    return val_loss

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)
Debugging training
# Use TensorBoard and print gradient norms during training
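# A small snippet for printing per-parameter gradient norms right after loss.backward() (placement is illustrative):
total_norm = 0.0
for name, param in model.named_parameters():
    if param.grad is not None:
        norm = param.grad.norm().item()
        total_norm += norm ** 2
        print(f"{name}: grad norm {norm:.4f}")
print(f"total grad norm: {total_norm ** 0.5:.4f}")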
# Self-attention cost scales as O(n^2), where n is the sequence length
Longformer and BigBird models
# Longformer uses sliding-window attention to keep attention affordable on long inputs
Sliding window attention
# Each token attends to neighbors within a window, not the entire sequence
Sparse attention mechanisms
# Sparse attention masks implemented as sparse matrices
Chunking inputs
# Process 512-token chunks with overlap to maintain context
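# A small sketch of overlapping chunking over a list of token IDs (chunk size and overlap are illustrative choices):
def chunk_token_ids(token_ids, chunk_size=512, overlap=64):
    stride = chunk_size - overlap
    chunks = []
    for start in range(0, len(token_ids), stride):
        chunks.append(token_ids[start:start + chunk_size])
        if start + chunk_size >= len(token_ids):
            break
    return chunks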
Efficient tokenization strategies
# Tokenize once and cache token IDs for reuse
Memory optimization
# Enable torch.cuda.amp and gradient checkpointing for efficiency
Training with long sequences
# Use smaller batches and accumulate gradients over steps
Applications for long documents
# Summarize entire books or analyze multi-page contracts
Performance trade-offs
# Evaluate accuracy vs. speed when choosing an attention method
# mBERT handles 104 languages with one model
XLM and XLM-R models
# XLM-R trained on 100+ languages with massive corpora
Cross-lingual transfer learning
# Train on English, infer on related languages zero-shot
Tokenization challenges in multilingual data
# Use SentencePiece or WordPiece tokenizers with multilingual corpora
Dataset preparation for multilingual tasks
# Use parallel datasets like OPUS or multilingual QA sets
Fine-tuning multilingual models
# Load pretrained checkpoints and train with mixed-language batches
Zero-shot cross-lingual classification
# Evaluate on unseen languages using mBERT zero-shot transfer
Evaluating multilingual models
# Evaluate on datasets covering multiple languages and tasks
Applications in global NLP
# Use models for multi-language chatbots or sentiment analysis
Language adaptation techniques
# Insert adapter layers or expand the tokenizer vocab for the target language
# Split image into patches → flatten → linear embedding → transformer encoder
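# A minimal sketch of ViT-style patch embedding via a strided convolution (dimensions follow the common ViT-Base setup; variable names are illustrative):
import torch
import torch.nn as nn

patch_size, embed_dim = 16, 768
patch_embed = nn.Conv2d(3, embed_dim, kernel_size=patch_size, stride=patch_size)

image = torch.randn(1, 3, 224, 224)          # (batch, channels, height, width)
patches = patch_embed(image)                  # (1, 768, 14, 14)
tokens = patches.flatten(2).transpose(1, 2)   # (1, 196, 768): one embedding per patch
print(tokens.shape)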
Image classification with transformers
# Use positional embeddings to encode patch locations
Combining vision and text (CLIP)
# Encode images and captions → compute cosine similarity for retrieval
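# A hedged sketch of image–text similarity with CLIP in transformers (requires Pillow; the image path is a placeholder):
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("photo.jpg")  # placeholder path
texts = ["a photo of a cat", "a photo of a dog"]
inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
outputs = model(**inputs)
probs = outputs.logits_per_image.softmax(dim=-1)  # similarity scores over the candidate captions
print(probs)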
Multimodal transformers overview
# Cross-attention layers combine embeddings from both modalities
Fine-tuning for image captioning
# Train decoder to generate text conditioned on image features
Visual question answering
# Input image and question tokens → output answer classification
Multimodal data preprocessing
# Preprocess images and tokenize text consistently
Evaluation metrics for multimodal models
# Compute metrics on generated captions or answers
Deploying multimodal systems
# Use ONNX Runtime or TensorRT for efficient deployment
Research trends
# Explore foundation models like Flamingo and GPT-4 multimodal
import torch
traced_model = torch.jit.trace(model, example_input)
traced_model.save("model.pt")
import torch.onnx
torch.onnx.export(model, example_input, "model.onnx")
from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

class InputData(BaseModel):  # request schema (assumed; the original snippet did not define it)
    data: str

@app.post("/predict/")
def predict(input: InputData):
    return model(input.data)
# Dockerfile example
FROM python:3.8
COPY . /app
WORKDIR /app
RUN pip install -r requirements.txt
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "80"]
# Use the AWS SageMaker SDK to deploy models programmatically
# Example: use ONNX Runtime with quantized models
# Define deployment with replicas in Kubernetes YAML
# Use model distillation or pruning for edge devices
# Integrate Prometheus and Grafana for metrics
# Use GitHub Actions or Jenkins for CI/CD
from transformers import AutoModel
model = AutoModel.from_pretrained("bert-base-uncased")
# Login and upload
huggingface-cli login
git lfs install
git clone https://huggingface.co/username/modelname
cd modelname
git add .
git commit -m "Add model"
git push
# Use Git tags and branches on the Hub repo
from datasets import load_dataset
dataset = load_dataset("imdb")
# Build and deploy Streamlit or Gradio demos on Spaces
# Invite collaborators on the Hub repo
# Add LICENSE file with MIT, Apache, or other license
# Search and load popular models
from transformers import pipeline
nlp = pipeline("sentiment-analysis")
result = nlp("I love Hugging Face!")
# Include README.md with instructions and citations
from transformers import BertConfig
config = BertConfig.from_pretrained("bert-base-uncased")
config.hidden_size = 512
config.num_attention_heads = 8  # hidden_size must be divisible by num_attention_heads
from tokenizers import ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train(["data.txt"], vocab_size=30000)
from transformers import BertModel
model = BertModel(config)  # architecture choices come from the config; editing layer attributes after construction has no effect
# Implement a custom attention module by subclassing nn.Module
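# A compact single-head self-attention module as a starting point for custom attention (class name and dimensions are illustrative):
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleSelfAttention(nn.Module):
    def __init__(self, hidden_size):
        super().__init__()
        self.query = nn.Linear(hidden_size, hidden_size)
        self.key = nn.Linear(hidden_size, hidden_size)
        self.value = nn.Linear(hidden_size, hidden_size)

    def forward(self, x):
        # x: (batch, seq_len, hidden_size)
        q, k, v = self.query(x), self.key(x), self.value(x)
        scores = q @ k.transpose(-2, -1) / x.size(-1) ** 0.5
        return F.softmax(scores, dim=-1) @ v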
from transformers import BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model = BertModel(config)
model.train()
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(output_dir="./results")
trainer = Trainer(model=model, args=training_args, train_dataset=train_ds)
trainer.train()
# Use torch.autograd.gradcheck for gradient validation
# Upload via huggingface-cli or git
# Custom model applied to medical NLP or edge devices
# arXiv.org and conferences like NeurIPS are primary sources
# Example: use the Linformer library for efficiency
# Implement sparse attention masks in custom models
# Combine transformer embeddings with CNN outputs
# Use the Hugging Face Trainer for MLM tasks
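# A short sketch of the masked-language-modeling pieces used with the Trainer (tokenized_dataset is a placeholder for a dataset already tokenized into input_ids):
from transformers import (AutoModelForMaskedLM, AutoTokenizer,
                          DataCollatorForLanguageModeling, Trainer, TrainingArguments)

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="./mlm_out"),
    train_dataset=tokenized_dataset,  # placeholder dataset
    data_collator=collator,
)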
# Create prompts to guide model responses dynamically
# Use GPT-style zero-shot with proper prompts
# Vision Transformer (ViT) usage in image classification
# Evaluate models for bias and fairness metrics
# Explore explainability techniques for transformers
# Job boards often list transformer-related roles
# Host projects on GitHub and personal sites
# Submit pull requests and issues
# Join challenges involving text classification or generation
# Write articles on Medium or arXiv preprints
# Attend ACL, EMNLP, NeurIPS conferences
# Coursera, edX, and Udacity offer transformer courses
# Join AI groups and discussions
# Use Upwork or Freelancer platforms
# Use Twitter, Arxiv Sanity, and newsletters