# Import a classifier
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
print(model) # Output: LogisticRegression()
# Check scikit-learn version
import sklearn
print(sklearn.__version__) # Example Output: '1.4.0'
# Using pip (command-line)
# pip install scikit-learn
# Using conda (command-line)
# conda install scikit-learn
from sklearn.cluster import KMeans
model = KMeans(n_clusters=3)
print(model) # Output: KMeans(n_clusters=3)
# Feature: Pipelines and preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
pipeline = make_pipeline(StandardScaler(), Ridge())
print(pipeline)
# Scikit-learn is great for small-to-medium datasets
from sklearn.svm import SVC
clf = SVC(kernel='linear')
print(clf)
# The full workflow in brief
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit([[0], [1], [2]], [0, 1, 2])
print(model.predict([[3]])) # Output: [3.]
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
pipeline = Pipeline([
('scaler', StandardScaler()),
('svm', SVC())
])
print(pipeline)
import pandas as pd
from sklearn.linear_model import LogisticRegression
df = pd.DataFrame({'x': [1, 2, 3], 'y': [0, 1, 0]})
model = LogisticRegression()
model.fit(df[['x']], df['y'])
print(model.predict([[2]])) # Output: [0] or [1], depending on the fit
from sklearn.datasets import load_iris
data = load_iris()
print(data.data[:2]) # Show first 2 samples
import numpy as np
from sklearn.impute import SimpleImputer
X = [[1, 2], [np.nan, 3], [7, 6]]
imp = SimpleImputer(strategy='mean')
print(imp.fit_transform(X))
from sklearn.preprocessing import OneHotEncoder
X = [['red'], ['green'], ['blue']]
encoder = OneHotEncoder()
print(encoder.fit_transform(X).toarray())
from sklearn.preprocessing import StandardScaler
X = [[1, 10], [2, 15], [3, 14]]
scaler = StandardScaler()
print(scaler.fit_transform(X))
from sklearn.preprocessing import Binarizer
X = [[1, 5], [3, 2], [4, 0]]
binarizer = Binarizer(threshold=2)
print(binarizer.fit_transform(X))
from sklearn.preprocessing import PolynomialFeatures
X = [[2, 3]]
poly = PolynomialFeatures(degree=2)
print(poly.fit_transform(X))
from sklearn.base import BaseEstimator, TransformerMixin
class AddOneTransformer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
return self
def transform(self, X):
return X + 1
import numpy as np
print(AddOneTransformer().fit_transform(np.array([[1], [2], [3]])))
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression
pipeline = Pipeline([
('scale', MinMaxScaler()),
('model', LinearRegression())
])
print(pipeline)
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
data = pd.DataFrame({
'age': [20, 30, 40],
'gender': ['M', 'F', 'M']
})
ct = ColumnTransformer([
('num', StandardScaler(), ['age']),
('cat', OneHotEncoder(), ['gender'])
])
print(ct.fit_transform(data))
# Good practice: use pipeline + train/test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
import numpy as np
X = np.array([[1], [2], [3], [4]])
y = np.array([1, 3, 3, 4])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)  # keep 2 test samples so R² is defined
pipe = make_pipeline(StandardScaler(), Ridge())
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))
# Example: simple supervised model
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit([[0], [1], [2]], [0, 1, 2])
print(model.predict([[3]])) # Output: [3.]
# Classification example
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit([[0], [1], [2], [3]], [0, 0, 1, 1])
print(model.predict([[1.5]])) # Output: [0] or [1]
# Regression example
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit([[0], [1], [2], [3]], [0, 0, 1, 1])
print(reg.predict([[1.5]])) # Output: float
# Use DecisionTreeClassifier for non-linear patterns
from sklearn.tree import DecisionTreeClassifier
clf = DecisionTreeClassifier()
clf.fit([[0], [1], [2], [3]], [0, 0, 1, 1])
print(clf.predict([[1.5]])) # Output: [0]
from sklearn.metrics import accuracy_score
y_true = [0, 1, 1, 0]
y_pred = [0, 1, 0, 0]
print(accuracy_score(y_true, y_pred)) # Output: 0.75
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
scores = cross_val_score(SVC(), X, y, cv=5)
print(scores) # Output: array of 5 scores
# Overfitting example with high-degree polynomial
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
import numpy as np
X = np.array([[1], [2], [3], [4]])
y = np.array([3, 6, 7, 10])
model = make_pipeline(PolynomialFeatures(5), LinearRegression())
model.fit(X, y)
print(model.predict([[5]])) # May overfit badly
# Use cross-validation to manage bias-variance
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)  # a classification target; the regression y above would break KNeighborsClassifier
scores = cross_val_score(KNeighborsClassifier(n_neighbors=1), X, y, cv=3)
print("High variance model scores:", scores)
from sklearn.model_selection import train_test_split
X = [[1], [2], [3], [4]]
y = [0, 0, 1, 1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
print("Train:", X_train, "Test:", X_test)
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
params = {'C': [0.1, 1, 10]}
grid = GridSearchCV(SVC(), params, cv=2)  # cv=2: the toy set has only 2 samples per class
grid.fit([[1], [2], [3], [4]], [0, 0, 1, 1])
print("Best Params:", grid.best_params_)
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit([[1], [2], [3], [4]], [0, 0, 1, 1])
print("Feature Importances:", model.feature_importances_)
from sklearn.linear_model import LogisticRegression
X = [[0], [1], [2], [3]]
y = [0, 0, 1, 1]
model = LogisticRegression()
model.fit(X, y)
print(model.predict([[1.5]])) # Output: [0]
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=3)
model.fit([[0], [1], [2], [3]], [0, 0, 1, 1])
print(model.predict([[1.6]])) # Output: [0] or [1]
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit([[1], [2], [3], [4]], [0, 0, 1, 1])
print(model.predict([[2.5]])) # Output: [0]
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=10)
model.fit([[1], [2], [3], [4]], [0, 0, 1, 1])
print(model.predict([[3.5]])) # Output: [1]
from sklearn.svm import SVC
model = SVC(kernel='linear')
model.fit([[0], [1], [2], [3]], [0, 0, 1, 1])
print(model.predict([[2.5]])) # Output: [1]
from sklearn.naive_bayes import GaussianNB
X = [[1, 20], [2, 18], [3, 25], [4, 28]]
y = [0, 0, 1, 1]
model = GaussianNB()
model.fit(X, y)
print(model.predict([[2.5, 22]])) # Output: [0] or [1]
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier()
model.fit([[1], [2], [3], [4]], [0, 0, 1, 1])
print(model.predict([[3.2]])) # Output: [1]
from sklearn.linear_model import SGDClassifier
model = SGDClassifier(loss='log_loss', max_iter=1000)
model.fit([[0], [1], [2], [3]], [0, 0, 1, 1])
print(model.predict([[2.2]])) # Output: [1]
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
clf1 = LogisticRegression()
clf2 = DecisionTreeClassifier()
clf3 = SVC(probability=True)
model = VotingClassifier(estimators=[
('lr', clf1), ('dt', clf2), ('svm', clf3)
], voting='soft')
X = [[1], [2], [3], [4]]
y = [0, 0, 1, 1]
model.fit(X, y)
print(model.predict([[2.8]])) # Output: [1]
from sklearn.metrics import classification_report
y_true = [0, 1, 0, 1]
y_pred = [0, 1, 1, 1]
print(classification_report(y_true, y_pred))
from sklearn.linear_model import LinearRegression
X = [[1], [2], [3], [4]]
y = [2, 4, 6, 8]
model = LinearRegression()
model.fit(X, y)
print(model.predict([[5]])) # Output: [10.]
from sklearn.linear_model import Ridge
model = Ridge(alpha=1.0)
model.fit(X, y)
print(model.predict([[5]])) # Output: [~10.]
from sklearn.linear_model import Lasso
model = Lasso(alpha=0.1)
model.fit(X, y)
print(model.predict([[5]])) # Output: [~10.]
from sklearn.linear_model import ElasticNet
model = ElasticNet(alpha=0.1, l1_ratio=0.5)
model.fit(X, y)
print(model.predict([[5]])) # Output: [~10.]
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
model.fit(X, y)
print(model.predict([[5]])) # Output: [10.]
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
model.fit(X, y)
print(model.predict([[5]])) # Output: [8.] (trees don't extrapolate beyond the training range)
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(X, y)
print(model.predict([[5]])) # Output: [~10.]
from sklearn.svm import SVR
model = SVR(kernel='linear')
model.fit(X, y)
print(model.predict([[5]])) # Output: [~10.]
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor()
model.fit(X, y)
print(model.predict([[5]])) # Output: [~10.]
from sklearn.metrics import mean_squared_error, r2_score
y_true = [2, 4, 6, 8]
y_pred = [2.1, 4.1, 6, 7.9]
print("MSE:", mean_squared_error(y_true, y_pred)) # Output: small number
print("R² Score:", r2_score(y_true, y_pred)) # Output: close to 1
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5)
print(scores) # Output: array of scores from 5 folds
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid = GridSearchCV(SVC(), param_grid, cv=3)
grid.fit(X, y)
print("Best Params:", grid.best_params_)
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
import numpy as np
param_dist = {'n_estimators': [10, 50, 100], 'max_depth': [None, 3, 5, 10]}
rand_search = RandomizedSearchCV(RandomForestClassifier(), param_dist, n_iter=4, cv=3)
rand_search.fit(X, y)
print(rand_search.best_params_)
score = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5, scoring='accuracy')
print("Accuracy Scores:", score)
from sklearn.metrics import confusion_matrix
y_true = [0, 1, 0, 1]
y_pred = [0, 1, 1, 1]
print(confusion_matrix(y_true, y_pred))
from sklearn.metrics import precision_score, recall_score, f1_score
print("Precision:", precision_score(y_true, y_pred))
print("Recall:", recall_score(y_true, y_pred))
print("F1 Score:", f1_score(y_true, y_pred))
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_prob = model.predict_proba(X_test)  # all class probabilities, needed for multiclass AUC
print("AUC Score:", roc_auc_score(y_test, y_prob, multi_class='ovr'))
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
print("MSE:", mean_squared_error(y_true, y_pred))
print("MAE:", mean_absolute_error(y_true, y_pred))
print("R²:", r2_score(y_true, y_pred))
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import numpy as np
train_sizes, train_scores, test_scores = learning_curve(
LogisticRegression(max_iter=1000), X, y, cv=5)
plt.plot(train_sizes, np.mean(train_scores, axis=1), label='Train')
plt.plot(train_sizes, np.mean(test_scores, axis=1), label='Validation')
plt.legend()
plt.title("Learning Curve")
plt.xlabel("Training Size")
plt.ylabel("Score")
plt.grid(True)
plt.show()
from sklearn.model_selection import validation_curve
param_range = [0.01, 0.1, 1, 10]
train_scores, test_scores = validation_curve(
LogisticRegression(max_iter=1000), X, y, param_name="C", param_range=param_range, cv=5)
plt.plot(param_range, np.mean(train_scores, axis=1), label="Train")
plt.plot(param_range, np.mean(test_scores, axis=1), label="Validation")
plt.xscale('log')
plt.legend()
plt.title("Validation Curve for C")
plt.xlabel("C")
plt.ylabel("Score")
plt.grid(True)
plt.show()
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
X, y = load_iris(return_X_y=True)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y)
plt.title("PCA - Iris Dataset")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.grid(True)
plt.show()
# t-SNE example
from sklearn.manifold import TSNE
X_tsne = TSNE(n_components=2, perplexity=30, random_state=42).fit_transform(X)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)
plt.title("t-SNE - Iris Dataset")
plt.grid(True)
plt.show()
# UMAP example (requires umap-learn)
# pip install umap-learn
import umap
X_umap = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2).fit_transform(X)
plt.scatter(X_umap[:, 0], X_umap[:, 1], c=y)
plt.title("UMAP - Iris Dataset")
plt.grid(True)
plt.show()
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, adjusted_rand_score
# Clustering on iris
model = KMeans(n_clusters=3, random_state=42)
y_pred = model.fit_predict(X)
print("Silhouette Score:", silhouette_score(X, y_pred))
print("Davies-Bouldin Score:", davies_bouldin_score(X, y_pred))
print("Adjusted Rand Index (with true labels):", adjusted_rand_score(y, y_pred))
# Example: creating new feature from date
import pandas as pd
df = pd.DataFrame({'date': pd.to_datetime(['2020-01-01', '2020-02-01'])})
df['month'] = df['date'].dt.month
print(df)
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
print(X_new[:2])
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(max_iter=1000)
selector = RFE(model, n_features_to_select=2)
X_rfe = selector.fit_transform(X, y)
print(X_rfe[:2])
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X, y)
print(model.feature_importances_)
from sklearn.linear_model import Lasso
from sklearn.datasets import fetch_california_housing
import numpy as np
housing = fetch_california_housing()  # load_boston was removed in scikit-learn 1.2
X, y = housing.data, housing.target
model = Lasso(alpha=0.1)
model.fit(X, y)
print(np.round(model.coef_, 2))
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
print(X_pca[:2])
import seaborn as sns
import matplotlib.pyplot as plt
corr = pd.DataFrame(X).corr()
sns.heatmap(corr, annot=True)
plt.title("Feature Correlation Heatmap")
plt.show()
from sklearn.feature_selection import mutual_info_regression
mi = mutual_info_regression(X, y)  # y is continuous here; use mutual_info_classif for class labels
print(mi)
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()  # regressor, since y here is continuous
model.fit(X, y)
selector = SelectFromModel(model, prefit=True)
X_sel = selector.transform(X)
print(X_sel[:2])
# Example: BMI from weight/height in a medical dataset
df = pd.DataFrame({'weight_kg': [70, 80], 'height_m': [1.75, 1.8]})
df['BMI'] = df['weight_kg'] / df['height_m']**2
print(df)
from sklearn.linear_model import LogisticRegression
import joblib
model = LogisticRegression()
model.fit([[0], [1]], [0, 1])
joblib.dump(model, 'model.pkl') # Save
loaded = joblib.load('model.pkl') # Load
print(loaded.predict([[2]]))
import pickle
with open('model.pkl', 'wb') as f:
pickle.dump(model, f)
with open('model.pkl', 'rb') as f:
loaded_model = pickle.load(f)
print(loaded_model.predict([[3]]))
# Simple API idea (in Flask)
from flask import Flask, request, jsonify
app = Flask(__name__)
@app.route('/predict', methods=['POST'])
def predict():
data = request.json['data']
result = loaded_model.predict([data])
return jsonify({'prediction': result.tolist()})
# Run with: flask run
# Save this as app.py and run it
from flask import Flask, request, jsonify
app = Flask(__name__)
@app.route('/')
def home():
return "ML Model API Running"
@app.route('/predict', methods=['POST'])
def predict():
data = request.json['data']
result = loaded_model.predict([data])
return jsonify(result.tolist())
# Save as app.py, then run with: streamlit run app.py
import streamlit as st
st.title("Simple ML App")
val = st.number_input("Enter a value:")
if st.button("Predict"):
result = loaded_model.predict([[val]])
st.write("Prediction:", result[0])
# Dockerfile Example
FROM python:3.9
COPY . /app
WORKDIR /app
RUN pip install -r requirements.txt
CMD ["python", "app.py"]
# Required files: Procfile, requirements.txt
# Procfile content:
web: python app.py
from fastapi import FastAPI
from pydantic import BaseModel
app = FastAPI()
class Input(BaseModel):
data: list
@app.post("/predict")
def predict(input: Input):
return {"result": loaded_model.predict([input.data]).tolist()}
import logging
logging.basicConfig(filename='predictions.log', level=logging.INFO)
input_val = [2]
pred = loaded_model.predict([input_val])
logging.info(f"Input: {input_val}, Prediction: {pred}")
# Save models with versioning
joblib.dump(model, 'model_v1.pkl')
joblib.dump(model, 'model_v2.pkl')
# Load specific version
v2 = joblib.load('model_v2.pkl')
# Instead of separate fit() calls, use pipelines to reduce errors and code clutter.
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit([[1], [2], [3], [4]], [0, 0, 1, 1])
print(pipe.predict([[1.5]]))
pipe = Pipeline([
('scaler', StandardScaler()),
('clf', LogisticRegression())
])
pipe.fit([[1], [2], [3], [4]], [0, 0, 1, 1])
# FeatureUnion concatenates transformer outputs; use ColumnTransformer when different columns need different pipelines
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
combined = FeatureUnion([("pca", PCA(n_components=2)), ("scale", StandardScaler())])
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(pipe, {'clf__C': [0.1, 1, 10]}, cv=2)  # 'clf' is the step name in the Pipeline above; cv=2 for the tiny toy set
grid.fit([[1], [2], [3], [4]], [0, 0, 1, 1])
import joblib
joblib.dump(pipe, 'pipeline.pkl')
print(pipe.named_steps['clf'])
from sklearn.base import BaseEstimator, TransformerMixin
class MultiplyByTwo(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X * 2
import re
text = "Hello, World!"
cleaned = re.sub(r'[^\w\s]', '', text.lower())
print(cleaned)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(["I love AI", "AI loves me"])
print(X.toarray())
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(["I love AI", "AI loves me"])
print(X.toarray())
cv = CountVectorizer()
tfidf = TfidfVectorizer()
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
pipe = make_pipeline(CountVectorizer(), MultinomialNB())
pipe.fit(["spam spam", "ham email"], [1, 0])
print(pipe.predict(["spam now"]))
model = MultinomialNB()
from sklearn.cluster import KMeans
X = TfidfVectorizer().fit_transform(["dog cat", "apple orange", "cat dog", "fruit banana"])
model = KMeans(n_clusters=2).fit(X)
print(model.labels_)
from sklearn.decomposition import TruncatedSVD
X = TfidfVectorizer().fit_transform(["text one", "text two"])
svd = TruncatedSVD(n_components=2)
X_reduced = svd.fit_transform(X)
print(X_reduced)
vectorizer = CountVectorizer(ngram_range=(1,2), stop_words='english')
texts = ["I love this!", "I hate that."] labels = [1, 0] pipe = make_pipeline(TfidfVectorizer(), MultinomialNB()) pipe.fit(texts, labels) print(pipe.predict(["love it"]))
import numpy as np
y = [0]*90 + [1]*10
print("Class 0:", y.count(0), "Class 1:", y.count(1))
from imblearn.over_sampling import SMOTE
X = [[i] for i in range(100)]
y = [0]*90 + [1]*10
from collections import Counter
X_res, y_res = SMOTE().fit_resample(X, y)  # returns NumPy arrays, so list.count() won't work
print("New class distribution:", Counter(y_res))
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler()
X_res, y_res = rus.fit_resample(X, y)
print("Resampled:", len(y_res))
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(class_weight='balanced')
from sklearn.metrics import classification_report
y_true = [0]*95 + [1]*5
y_pred = [0]*90 + [1]*10
print(classification_report(y_true, y_pred))
from imblearn.pipeline import Pipeline
pipe = Pipeline([('smote', SMOTE()), ('clf', LogisticRegression())])
pipe.fit(X, y)
# See: imblearn.over_sampling.ADASYN
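# A quick ADASYN sketch, reusing the synthetic X, y above (same API shape as SMOTE):
from imblearn.over_sampling import ADASYN
from collections import Counter
X_ada, y_ada = ADASYN().fit_resample(X, y)
print(Counter(y_ada))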
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
model = BalancedBaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10)  # base_estimator was renamed to estimator in imblearn 0.10
# Tune the decision threshold on a fitted classifier (fit model first)
probs = model.predict_proba(X_res)[:, 1]
y_pred = [1 if p > 0.3 else 0 for p in probs]
# Combine SMOTE + LogisticRegression + F1-score on fraud dataset.
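# A hedged sketch of that combination on the synthetic X, y above; substitute a real fraud dataset in practice:
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
fraud_pipe = ImbPipeline([('smote', SMOTE()), ('clf', LogisticRegression())])
print(cross_val_score(fraud_pipe, X, y, cv=3, scoring='f1'))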
# Simple agent example: a chatbot responding to user input
def agent(input_text):
if "hello" in input_text.lower():
return "Hello! How can I help you?"
return "I don't understand."
print(agent("Hello there!"))
# Example: Reflex agent responds to keywords
def reflex_agent(input_text):
if "weather" in input_text:
return "Check the weather app."
return "Sorry, I can't help."
print(reflex_agent("What's the weather?"))
# Email filter example
emails = ["Buy now!", "Meeting at 3pm", "Cheap meds"]
filtered = [e for e in emails if "buy" not in e.lower()]
print(filtered)
# Simplified simulation: two agents competing for a resource
agent1 = {"energy": 5}
agent2 = {"energy": 3}
def compete(a1, a2):
winner = "Agent1" if a1['energy'] > a2['energy'] else "Agent2"
return winner
print(compete(agent1, agent2))
import gym
env = gym.make('CartPole-v1')
obs = env.reset()  # in Gym >= 0.26 / Gymnasium, reset() returns (obs, info)
print("Initial observation:", obs)
env.close()
# Simple scheduler example
def schedule_task(task, time):
print(f"Scheduled '{task}' at {time}")
schedule_task("Backup", "02:00 AM")
# Basic command parser
def parse_command(text):
if "turn on light" in text.lower():
return "Lights turned on"
return "Command not recognized"
print(parse_command("Please turn on light"))
# Pseudo-code for robot navigation
def move_robot(direction):
print(f"Moving {direction}")
move_robot("forward")
# No direct code, but always consider ethical constraints!
# Imagine AI agents that learn from each other and adapt dynamically.
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
X = pd.DataFrame({'feature': [1, 2, 3, 4]})
y = np.array([2, 4, 6, 8])
model = LinearRegression()
model.fit(X, y)
print(model.predict(pd.DataFrame({'feature': [5]})))
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
data = sns.load_dataset('iris')
sns.pairplot(data, hue='species')
plt.show()
from xgboost import XGBClassifier
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
model = XGBClassifier(eval_metric='mlogloss')  # use_label_encoder was removed in XGBoost 2.x
model.fit(X, y)
print(model.predict(X[:2]))
import lightgbm as lgb
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)
model = lgb.LGBMClassifier()
model.fit(X, y)
print(model.predict(X[:2]))
from catboost import CatBoostClassifier
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
model = CatBoostClassifier(verbose=0)
model.fit(X, y)
print(model.predict(X[:2]))
# Example Optuna tuning skeleton:
import optuna
def objective(trial):
param = {'C': trial.suggest_float('C', 1e-3, 10, log=True)}  # suggest_loguniform is deprecated
model = LogisticRegression(**param)
model.fit(X, y)
return model.score(X, y)
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)
print(study.best_params)
import mlflow
import mlflow.sklearn
with mlflow.start_run():
model.fit(X, y)
mlflow.sklearn.log_model(model, "model")
import dask.array as da
X = da.random.random((1000, 10), chunks=(100, 10))
# Train models with dask-ml wrappers
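# A minimal sketch, assuming dask-ml is installed (pip install dask-ml); dask_ml's
# estimators accept dask arrays, so training can stream over chunks:
from dask_ml.linear_model import LogisticRegression
y = (da.random.random(1000, chunks=100) > 0.5).astype(int)  # made-up labels for illustration
clf = LogisticRegression()
clf.fit(X, y)
print(clf.predict(X[:5]).compute())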
# The old tensorflow.keras.wrappers.scikit_learn wrapper was removed from TensorFlow;
# the maintained replacement is the scikeras package (pip install scikeras).
from scikeras.wrappers import KerasClassifier
def create_model():
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense
    model = Sequential([Dense(10, input_shape=(4,), activation='relu'), Dense(3, activation='softmax')])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model
keras_model = KerasClassifier(model=create_model, epochs=5, batch_size=10)
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)  # 4 features, 3 classes, matching the network above
keras_model.fit(X, y)
import skl2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
initial_type = [('float_input', FloatTensorType([None, X.shape[1]]))]
onnx_model = convert_sklearn(model, initial_types=initial_type)
with open("model.onnx", "wb") as f:
f.write(onnx_model.SerializeToString())
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
data = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
print(model.predict([X_test[0]]))
# Use logistic regression on customer data (pseudo-example; runnable sketch below)
# X, y = customer_features, churn_labels
# model = LogisticRegression().fit(X, y)
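# A runnable sketch of the churn idea on made-up data; the features
# (tenure, monthly_spend) are illustrative assumptions, not a real dataset:
import pandas as pd
from sklearn.linear_model import LogisticRegression
customers = pd.DataFrame({'tenure': [1, 24, 3, 36], 'monthly_spend': [80, 20, 95, 15]})
churned = [1, 0, 1, 0]
churn_model = LogisticRegression().fit(customers, churned)
print(churn_model.predict(customers[:1]))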
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
X, y = load_breast_cancer(return_X_y=True)
model = LogisticRegression(max_iter=1000)
model.fit(X, y)
print(model.predict(X[:2]))
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
texts = ["Buy cheap meds", "Meeting at 10", "Cheap meds now", "Lunch tomorrow"]
labels = [1, 0, 1, 0]
pipe = make_pipeline(CountVectorizer(), MultinomialNB())
pipe.fit(texts, labels)
print(pipe.predict(["Cheap meds offer"]))
from sklearn.decomposition import PCA
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
digits = load_digits()
pca = PCA(n_components=30)
X_pca = pca.fit_transform(digits.data)
model = LogisticRegression(max_iter=1000)
model.fit(X_pca, digits.target)
print(model.predict(X_pca[:1]))
from sklearn.cluster import KMeans
import numpy as np
X = np.array([[1, 2], [1, 4], [5, 8], [8, 8]])
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
print(kmeans.labels_)
# Simple example: recommend movies based on user-rating similarity (sketch below)
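# A minimal collaborative-filtering sketch with cosine similarity; the tiny
# ratings matrix is invented for illustration:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
ratings = np.array([[5, 4, 0, 0],   # user 0
                    [4, 5, 0, 1],   # user 1
                    [0, 0, 5, 4]])  # user 2
sim = cosine_similarity(ratings)
print("Most similar user to user 0:", sim[0].argsort()[-2])  # [-1] is user 0 itself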
# Use statsmodels for ARIMA and sklearn for ML models on residuals.
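# A hedged sketch of the hybrid idea (requires statsmodels; the series is toy data):
# fit ARIMA, then model its residuals with scikit-learn.
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.linear_model import LinearRegression
series = np.array([1.0, 2.1, 2.9, 4.2, 5.1, 5.9, 7.2, 8.1])
arima = ARIMA(series, order=(1, 1, 1)).fit()
t = np.arange(len(arima.resid)).reshape(-1, 1)
resid_model = LinearRegression().fit(t, arima.resid)  # illustrative residual model on a time index
print("ARIMA forecast:", arima.forecast(steps=1))
print("Residual correction:", resid_model.predict([[len(t)]]))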
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
# X, y = fraud_data_features, fraud_labels
# X_res, y_res = SMOTE().fit_resample(X, y)
# model = RandomForestClassifier().fit(X_res, y_res)
# Example: stock prices over time vs random tabular data
import pandas as pd
df = pd.DataFrame({'value': [1,2,3,4,5]})
df['lag_1'] = df['value'].shift(1)
df['rolling_mean_2'] = df['value'].rolling(window=2).mean()
print(df)
# Convert series into supervised learning samples with windows
# See example above with lag and rolling_mean
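# A small sketch turning a 1-D series into (window -> next value) supervised
# samples; the window size of 2 is an arbitrary illustration:
import numpy as np
def make_windows(series, window=2):
    X, y = [], []
    for i in range(len(series) - window):
        X.append(series[i:i + window])
        y.append(series[i + window])
    return np.array(X), np.array(y)
X_w, y_w = make_windows([1, 2, 3, 4, 5], window=2)
print(X_w, y_w)  # [[1 2] [2 3] [3 4]] and [3 4 5]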
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(n_splits=3)
for train_index, test_index in tscv.split(df):
print("TRAIN:", train_index, "TEST:", test_index)
# Pipelines can include custom transformers for lag features
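# Sketch of a custom lag-feature transformer usable as a pipeline step; aligning
# the target after the first `lag` rows are dropped is left to the caller:
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
class LagFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, lag=1):
        self.lag = lag
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        x = np.asarray(X).ravel()
        return np.column_stack([x, np.roll(x, self.lag)])[self.lag:]  # each value with its lag
print(LagFeatures(lag=1).fit_transform([1, 2, 3, 4]))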
from sklearn.linear_model import LinearRegression
import numpy as np
X = np.array([[1], [2], [3]])
y = np.array([2, 3, 4])
model = LinearRegression().fit(X, y)
print(model.predict([[4]]))
import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot
autocorrelation_plot(df['value'])
plt.show()
# See TimeSeriesSplit example above
# Practical example would combine all above techniques on sales data
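# A hedged end-to-end sketch on made-up sales numbers, combining the lag
# feature and TimeSeriesSplit ideas above with a linear model:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit
sales = pd.DataFrame({'sales': [10, 12, 13, 15, 16, 18, 20, 21, 23, 24, 26, 28, 29]})
sales['lag_1'] = sales['sales'].shift(1)
sales = sales.dropna()
X_s, y_s = sales[['lag_1']], sales['sales']
for train_idx, test_idx in TimeSeriesSplit(n_splits=3).split(X_s):
    fold_model = LinearRegression().fit(X_s.iloc[train_idx], y_s.iloc[train_idx])
    print("Fold score:", fold_model.score(X_s.iloc[test_idx], y_s.iloc[test_idx]))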
from sklearn.preprocessing import OneHotEncoder
import numpy as np
data = np.array([['red'], ['green'], ['blue']])
encoder = OneHotEncoder(sparse_output=False)  # 'sparse' was renamed to 'sparse_output' in scikit-learn 1.2
print(encoder.fit_transform(data))
from sklearn.preprocessing import OrdinalEncoder
data = np.array([['low'], ['medium'], ['high']])
encoder = OrdinalEncoder(categories=[['low', 'medium', 'high']])
print(encoder.fit_transform(data))
# Use the category_encoders library:
# import category_encoders as ce
# encoder = ce.BinaryEncoder(cols=['feature'])
# Use category_encoders or custom implementation
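# A minimal custom target (mean) encoding sketch with pandas; in practice,
# category_encoders.TargetEncoder or sklearn.preprocessing.TargetEncoder (>= 1.3) add smoothing:
import pandas as pd
df = pd.DataFrame({'city': ['A', 'B', 'A', 'B'], 'target': [1, 0, 1, 1]})
df['city_encoded'] = df['city'].map(df.groupby('city')['target'].mean())
print(df)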
from sklearn.feature_extraction import FeatureHasher
data = [{'cat': 'dog'}, {'cat': 'fish'}, {'cat': 'dog'}]
hasher = FeatureHasher(input_type='dict')
print(hasher.transform(data).toarray())
# Consider grouping rare categories or using hashing
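# Sketch: collapse categories seen fewer than 2 times into 'other' before encoding
# (the threshold is an arbitrary illustration):
import pandas as pd
s = pd.Series(['a', 'a', 'b', 'c', 'a', 'b', 'd'])
counts = s.value_counts()
print(s.where(s.map(counts) >= 2, 'other').tolist())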
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
pipe = Pipeline([
('encoder', OneHotEncoder(handle_unknown='ignore')),
('clf', LogisticRegression())
])
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
preprocessor = ColumnTransformer(
transformers=[
('num', StandardScaler(), ['num_feature']),
('cat', OneHotEncoder(), ['cat_feature'])
])
# Example: OneHotEncoder(handle_unknown='ignore')
import matplotlib.pyplot as plt
import numpy as np
encoded = encoder.fit_transform(data)  # e.g. the encoder and data from above
plt.bar(range(encoded.shape[1]), np.sum(encoded, axis=0))
plt.show()
from sklearn.tree import DecisionTreeClassifier
X = [[0, 0], [1, 1]]
y = [0, 1]
model = DecisionTreeClassifier()
model.fit(X, y)
print(model.predict([[2, 2]]))
# Specify criterion in DecisionTreeClassifier(criterion='gini' or 'entropy')
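# For example, an entropy-based tree (the default criterion is 'gini'):
model = DecisionTreeClassifier(criterion='entropy')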
model = DecisionTreeClassifier(max_depth=3)
model.fit(X, y)
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree
plot_tree(model)
plt.show()
print(model.feature_importances_)
model = DecisionTreeClassifier(max_depth=5, min_samples_split=4)
# Use export_text to print tree rules
from sklearn.tree import export_text
print(export_text(model))
# scikit-learn trees require categorical features to be numerically encoded first
path = model.cost_complexity_pruning_path(X, y)
ccp_alphas = path.ccp_alphas
# Train tree on credit data, interpret splits for risk factors
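# A hedged sketch on synthetic "credit" data; the features (income_k, debt_k)
# and labels are invented purely to show how export_text exposes the splits:
from sklearn.tree import DecisionTreeClassifier, export_text
X_credit = [[50, 5], [20, 15], [80, 2], [25, 20]]
y_credit = [0, 1, 0, 1]  # 1 = default
credit_tree = DecisionTreeClassifier(max_depth=2).fit(X_credit, y_credit)
print(export_text(credit_tree, feature_names=['income_k', 'debt_k']))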
# Interpretability crucial in sensitive fields like healthcare.
# Assume rf is a fitted tree ensemble, e.g. rf = RandomForestClassifier().fit(X, y)
print(model.coef_)  # for linear models
print(rf.feature_importances_)  # for tree ensembles
# plot_partial_dependence was removed in scikit-learn 1.2; use PartialDependenceDisplay
from sklearn.inspection import PartialDependenceDisplay
import matplotlib.pyplot as plt
PartialDependenceDisplay.from_estimator(rf, X, [0, 1])
plt.show()
from sklearn.inspection import permutation_importance
result = permutation_importance(rf, X, y, n_repeats=10)
print(result.importances_mean)
import shap
explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X)
shap.summary_plot(shap_values, X)
# Use lime package: lime.lime_tabular.LimeTabularExplainer
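# A minimal LIME sketch (pip install lime), assuming the fitted rf and NumPy X above:
from lime.lime_tabular import LimeTabularExplainer
explainer = LimeTabularExplainer(X, mode='classification')
exp = explainer.explain_instance(X[0], rf.predict_proba, num_features=2)
print(exp.as_list())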
# For a forest, decision_path returns (indicator, n_nodes_ptr)
indicator, n_nodes_ptr = rf.decision_path(X)
print(indicator.shape)
from sklearn.tree import plot_tree
import matplotlib.pyplot as plt
plot_tree(rf.estimators_[0])
plt.show()
# Use pipeline.named_steps to access individual components
print(pipe.named_steps['clf'].coef_)
# Evaluate bias and fairness metrics during model validation.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
from sklearn.preprocessing import QuantileTransformer
scaler = QuantileTransformer(output_distribution='normal')
X_scaled = scaler.fit_transform(X)
from sklearn.preprocessing import PowerTransformer
scaler = PowerTransformer()
X_scaled = scaler.fit_transform(X)
from sklearn.preprocessing import Normalizer
normalizer = Normalizer()
X_normalized = normalizer.fit_transform(X)
# StandardScaler supports sparse data with with_mean=False
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X_sparse)
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
pipe = Pipeline([
('scaler', StandardScaler()),
('clf', LogisticRegression())
])
pipe.fit(X, y)
from sklearn.base import TransformerMixin
class CustomScaler(TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X / 100
# Always call fit on training set, transform on train/test
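# Illustration of the rule with toy arrays: statistics come from the training
# split only, and the same transform is applied to both splits:
from sklearn.preprocessing import StandardScaler
X_train, X_test = [[1.0], [2.0], [3.0]], [[4.0]]
scaler = StandardScaler().fit(X_train)
print(scaler.transform(X_train), scaler.transform(X_test))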
from sklearn.linear_model import SGDClassifier
model = SGDClassifier()
# stream_batches() is a placeholder for your own batch generator
for X_batch, y_batch in stream_batches():
model.partial_fit(X_batch, y_batch, classes=[0, 1])
from sklearn.linear_model import PassiveAggressiveClassifier
model = PassiveAggressiveClassifier()
model.partial_fit(X_train, y_train, classes=[0, 1])
import pandas as pd
for chunk in pd.read_csv("large_dataset.csv", chunksize=10000):
process(chunk)  # process() is a placeholder for your own chunk handler
from scipy.sparse import csr_matrix
sparse_data = csr_matrix([[0, 0, 3], [4, 0, 0]])
import dask.dataframe as dd
df = dd.read_csv("large_dataset.csv")
print(df.head())
from sklearn.feature_extraction import FeatureHasher
hasher = FeatureHasher(n_features=8, input_type='string')
features = hasher.transform([["dog"], ["cat"], ["fish"]])
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
pipe = Pipeline([
('scaler', StandardScaler(with_mean=False)),
('clf', SGDClassifier())
])
# Feed batches one by one using partial_fit
for chunk in pd.read_csv(\"large.csv\", chunksize=5000):
scaled = scaler.transform(chunk)
model.partial_fit(scaled, labels)
from sklearn.decomposition import PCA
pca = PCA(n_components=20)
X_reduced = pca.fit_transform(X)