# Import a classifier
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
print(model)  # Output: LogisticRegression()

# Check scikit-learn version
import sklearn
print(sklearn.__version__)  # Example output: '1.4.0'

# Using pip (command line)
# pip install scikit-learn

# Using conda (command line)
# conda install scikit-learn

from sklearn.cluster import KMeans

model = KMeans(n_clusters=3)
print(model)  # Output: KMeans(n_clusters=3)

# Feature: pipelines and preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

pipeline = make_pipeline(StandardScaler(), Ridge())
print(pipeline)

# Scikit-learn is well suited to small-to-medium datasets
from sklearn.svm import SVC

clf = SVC(kernel='linear')
print(clf)

# The full workflow in short
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit([[0], [1], [2]], [0, 1, 2])
print(model.predict([[3]]))  # Output: [3.]

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', SVC())
])
print(pipeline)

import pandas as pd
from sklearn.linear_model import LogisticRegression

df = pd.DataFrame({'x': [1, 2, 3], 'y': [0, 1, 0]})
model = LogisticRegression()
model.fit(df[['x']], df['y'])
print(model.predict([[2]]))  # Output: [0] (class 0 dominates this toy data)
from sklearn.datasets import load_iris

data = load_iris()
print(data.data[:2])  # Show first 2 samples

import numpy as np
from sklearn.impute import SimpleImputer

X = [[1, 2], [np.nan, 3], [7, 6]]
imp = SimpleImputer(strategy='mean')
print(imp.fit_transform(X))

from sklearn.preprocessing import OneHotEncoder

X = [['red'], ['green'], ['blue']]
encoder = OneHotEncoder()
print(encoder.fit_transform(X).toarray())

from sklearn.preprocessing import StandardScaler

X = [[1, 10], [2, 15], [3, 14]]
scaler = StandardScaler()
print(scaler.fit_transform(X))

from sklearn.preprocessing import Binarizer

X = [[1, 5], [3, 2], [4, 0]]
binarizer = Binarizer(threshold=2)
print(binarizer.fit_transform(X))

from sklearn.preprocessing import PolynomialFeatures

X = [[2, 3]]
poly = PolynomialFeatures(degree=2)
print(poly.fit_transform(X))

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin

class AddOneTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X + 1

print(AddOneTransformer().fit_transform(np.array([[1], [2], [3]])))

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LinearRegression

pipeline = Pipeline([
    ('scale', MinMaxScaler()),
    ('model', LinearRegression())
])
print(pipeline)

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

data = pd.DataFrame({
    'age': [20, 30, 40],
    'gender': ['M', 'F', 'M']
})
ct = ColumnTransformer([
    ('num', StandardScaler(), ['age']),
    ('cat', OneHotEncoder(), ['gender'])
])
print(ct.fit_transform(data))

# Good practice: use a pipeline + train/test split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
import numpy as np

X = np.array([[1], [2], [3], [4]])
y = np.array([1, 3, 3, 4])
X_train, X_test, y_train, y_test = train_test_split(X, y)
pipe = make_pipeline(StandardScaler(), Ridge())
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))
# Example: a simple supervised model
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit([[0], [1], [2]], [0, 1, 2])
print(model.predict([[3]]))  # Output: [3.]

# Classification example
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit([[0], [1], [2], [3]], [0, 0, 1, 1])
print(model.predict([[1.5]]))  # Output: [0] or [1]

# Regression example
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit([[0], [1], [2], [3]], [0, 0, 1, 1])
print(reg.predict([[1.5]]))  # Output: a float

# Use DecisionTreeClassifier for non-linear patterns
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit([[0], [1], [2], [3]], [0, 0, 1, 1])
print(clf.predict([[1.5]]))  # Output: [0]

from sklearn.metrics import accuracy_score

y_true = [0, 1, 1, 0]
y_pred = [0, 1, 0, 0]
print(accuracy_score(y_true, y_pred))  # Output: 0.75

from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
scores = cross_val_score(SVC(), X, y, cv=5)
print(scores)  # Output: array of 5 scores

# Overfitting example with a high-degree polynomial
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
import numpy as np

X = np.array([[1], [2], [3], [4]])
y = np.array([3, 6, 7, 10])
model = make_pipeline(PolynomialFeatures(5), LinearRegression())
model.fit(X, y)
print(model.predict([[5]]))  # May overfit badly

# Use cross-validation to manage the bias-variance trade-off
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
scores = cross_val_score(KNeighborsClassifier(n_neighbors=1), X, y, cv=3)
print("High-variance model scores:", scores)

from sklearn.model_selection import train_test_split

X = [[1], [2], [3], [4]]
y = [0, 0, 1, 1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)
print("Train:", X_train, "Test:", X_test)

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

params = {'C': [0.1, 1, 10]}
grid = GridSearchCV(SVC(), params, cv=2)  # cv=2: this toy data has only 2 samples per class
grid.fit([[1], [2], [3], [4]], [0, 0, 1, 1])
print("Best Params:", grid.best_params_)

from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit([[1], [2], [3], [4]], [0, 0, 1, 1])
print("Feature Importances:", model.feature_importances_)
from sklearn.linear_model import LogisticRegression

X = [[0], [1], [2], [3]]
y = [0, 0, 1, 1]
model = LogisticRegression()
model.fit(X, y)
print(model.predict([[1.5]]))  # Output: [0] or [1] (1.5 sits on the decision boundary)

from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=3)
model.fit([[0], [1], [2], [3]], [0, 0, 1, 1])
print(model.predict([[1.6]]))  # Output: [0] or [1]

from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit([[1], [2], [3], [4]], [0, 0, 1, 1])
print(model.predict([[2.5]]))  # Output: [0]

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=10)
model.fit([[1], [2], [3], [4]], [0, 0, 1, 1])
print(model.predict([[3.5]]))  # Output: [1]

from sklearn.svm import SVC

model = SVC(kernel='linear')
model.fit([[0], [1], [2], [3]], [0, 0, 1, 1])
print(model.predict([[2.5]]))  # Output: [1]

from sklearn.naive_bayes import GaussianNB

X = [[1, 20], [2, 18], [3, 25], [4, 28]]
y = [0, 0, 1, 1]
model = GaussianNB()
model.fit(X, y)
print(model.predict([[2.5, 22]]))  # Output: [0] or [1]

from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier()
model.fit([[1], [2], [3], [4]], [0, 0, 1, 1])
print(model.predict([[3.2]]))  # Output: [1]

from sklearn.linear_model import SGDClassifier

model = SGDClassifier(loss='log_loss', max_iter=1000)
model.fit([[0], [1], [2], [3]], [0, 0, 1, 1])
print(model.predict([[2.2]]))  # Output: [1]

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

clf1 = LogisticRegression()
clf2 = DecisionTreeClassifier()
clf3 = SVC(probability=True)
model = VotingClassifier(estimators=[
    ('lr', clf1), ('dt', clf2), ('svm', clf3)
], voting='soft')
X = [[1], [2], [3], [4]]
y = [0, 0, 1, 1]
model.fit(X, y)
print(model.predict([[2.8]]))  # Output: [1]

from sklearn.metrics import classification_report

y_true = [0, 1, 0, 1]
y_pred = [0, 1, 1, 1]
print(classification_report(y_true, y_pred))
from sklearn.linear_model import LinearRegression

X = [[1], [2], [3], [4]]
y = [2, 4, 6, 8]
model = LinearRegression()
model.fit(X, y)
print(model.predict([[5]]))  # Output: [10.]

from sklearn.linear_model import Ridge

model = Ridge(alpha=1.0)
model.fit(X, y)
print(model.predict([[5]]))  # Output: [~10.]

from sklearn.linear_model import Lasso

model = Lasso(alpha=0.1)
model.fit(X, y)
print(model.predict([[5]]))  # Output: [~10.]

from sklearn.linear_model import ElasticNet

model = ElasticNet(alpha=0.1, l1_ratio=0.5)
model.fit(X, y)
print(model.predict([[5]]))  # Output: [~10.]

from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
model.fit(X, y)
print(model.predict([[5]]))  # Output: [10.]

from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor()
model.fit(X, y)
print(model.predict([[5]]))  # Output: [8.] (trees cannot extrapolate beyond the training range)

from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X, y)
print(model.predict([[5]]))  # Output: roughly [7.-8.] (again, no extrapolation)

from sklearn.svm import SVR

model = SVR(kernel='linear')
model.fit(X, y)
print(model.predict([[5]]))  # Output: [~10.]

from sklearn.ensemble import GradientBoostingRegressor

model = GradientBoostingRegressor()
model.fit(X, y)
print(model.predict([[5]]))  # Output: [~8.] (boosted trees don't extrapolate either)

from sklearn.metrics import mean_squared_error, r2_score

y_true = [2, 4, 6, 8]
y_pred = [2.1, 4.1, 6, 7.9]
print("MSE:", mean_squared_error(y_true, y_pred))  # Output: a small number
print("R² Score:", r2_score(y_true, y_pred))       # Output: close to 1
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
scores = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5)
print(scores)  # Output: array of scores from 5 folds

from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
grid = GridSearchCV(SVC(), param_grid, cv=3)
grid.fit(X, y)
print("Best Params:", grid.best_params_)

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

param_dist = {'n_estimators': [10, 50, 100], 'max_depth': [None, 3, 5, 10]}
rand_search = RandomizedSearchCV(RandomForestClassifier(), param_dist, n_iter=4, cv=3)
rand_search.fit(X, y)
print(rand_search.best_params_)

score = cross_val_score(LogisticRegression(max_iter=1000), X, y, cv=5, scoring='accuracy')
print("Accuracy Scores:", score)

from sklearn.metrics import confusion_matrix

y_true = [0, 1, 0, 1]
y_pred = [0, 1, 1, 1]
print(confusion_matrix(y_true, y_pred))

from sklearn.metrics import precision_score, recall_score, f1_score

print("Precision:", precision_score(y_true, y_pred))
print("Recall:", recall_score(y_true, y_pred))
print("F1 Score:", f1_score(y_true, y_pred))

from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
# Iris has 3 classes, so pass the full probability matrix and use one-vs-rest averaging
y_prob = model.predict_proba(X_test)
print("AUC Score:", roc_auc_score(y_test, y_prob, multi_class='ovr'))

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
print("MSE:", mean_squared_error(y_true, y_pred))
print("MAE:", mean_absolute_error(y_true, y_pred))
print("R²:", r2_score(y_true, y_pred))

from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt
import numpy as np

train_sizes, train_scores, test_scores = learning_curve(
    LogisticRegression(max_iter=1000), X, y, cv=5)
plt.plot(train_sizes, np.mean(train_scores, axis=1), label='Train')
plt.plot(train_sizes, np.mean(test_scores, axis=1), label='Validation')
plt.legend()
plt.title("Learning Curve")
plt.xlabel("Training Size")
plt.ylabel("Score")
plt.grid(True)
plt.show()

from sklearn.model_selection import validation_curve

param_range = [0.01, 0.1, 1, 10]
train_scores, test_scores = validation_curve(
    LogisticRegression(max_iter=1000), X, y,
    param_name="C", param_range=param_range, cv=5)
plt.plot(param_range, np.mean(train_scores, axis=1), label="Train")
plt.plot(param_range, np.mean(test_scores, axis=1), label="Validation")
plt.xscale('log')
plt.legend()
plt.title("Validation Curve for C")
plt.xlabel("C")
plt.ylabel("Score")
plt.grid(True)
plt.show()
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt

X, y = load_iris(return_X_y=True)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y)
plt.title("PCA - Iris Dataset")
plt.xlabel("PC1")
plt.ylabel("PC2")
plt.grid(True)
plt.show()

# t-SNE example
from sklearn.manifold import TSNE

X_tsne = TSNE(n_components=2, perplexity=30, random_state=42).fit_transform(X)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y)
plt.title("t-SNE - Iris Dataset")
plt.grid(True)
plt.show()

# UMAP example (requires umap-learn)
# pip install umap-learn
import umap

X_umap = umap.UMAP(n_neighbors=15, min_dist=0.1, n_components=2).fit_transform(X)
plt.scatter(X_umap[:, 0], X_umap[:, 1], c=y)
plt.title("UMAP - Iris Dataset")
plt.grid(True)
plt.show()

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score, adjusted_rand_score

# Clustering on iris
model = KMeans(n_clusters=3, random_state=42)
y_pred = model.fit_predict(X)
print("Silhouette Score:", silhouette_score(X, y_pred))
print("Davies-Bouldin Score:", davies_bouldin_score(X, y_pred))
print("Adjusted Rand Index (with true labels):", adjusted_rand_score(y, y_pred))

# Example: creating a new feature from a date
import pandas as pd

df = pd.DataFrame({'date': pd.to_datetime(['2020-01-01', '2020-02-01'])})
df['month'] = df['date'].dt.month
print(df)

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
print(X_new[:2])

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=1000)
selector = RFE(model, n_features_to_select=2)
X_rfe = selector.fit_transform(X, y)
print(X_rfe[:2])

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X, y)
print(model.feature_importances_)

from sklearn.linear_model import Lasso
from sklearn.datasets import fetch_california_housing
import numpy as np

# load_boston was removed from recent scikit-learn releases; California housing is a drop-in example
housing = fetch_california_housing()
X, y = housing.data, housing.target
model = Lasso(alpha=0.1)
model.fit(X, y)
print(np.round(model.coef_, 2))

from sklearn.decomposition import PCA

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
print(X_pca[:2])
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

corr = pd.DataFrame(X).corr()
sns.heatmap(corr, annot=True)
plt.title("Feature Correlation Heatmap")
plt.show()

from sklearn.feature_selection import mutual_info_classif
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
mi = mutual_info_classif(X, y)
print(mi)

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X, y)
selector = SelectFromModel(model, prefit=True)
X_sel = selector.transform(X)
print(X_sel[:2])

# Example: BMI from weight/height in a medical dataset
import pandas as pd

df = pd.DataFrame({'weight_kg': [70, 80], 'height_m': [1.75, 1.8]})
df['BMI'] = df['weight_kg'] / df['height_m'] ** 2
print(df)

from sklearn.linear_model import LogisticRegression
import joblib

model = LogisticRegression()
model.fit([[0], [1]], [0, 1])
joblib.dump(model, 'model.pkl')    # Save
loaded = joblib.load('model.pkl')  # Load
print(loaded.predict([[2]]))

import pickle

with open('model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)

print(loaded_model.predict([[3]]))

# Simple API idea (in Flask)
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    data = request.json['data']
    result = loaded_model.predict([data])
    return jsonify({'prediction': result.tolist()})

# Run with: flask run

# Save this as app.py and run it
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/')
def home():
    return "ML Model API Running"

@app.route('/predict', methods=['POST'])
def predict():
    data = request.json['data']
    result = loaded_model.predict([data])
    return jsonify(result.tolist())

# Save as app.py, then run with: streamlit run app.py
import streamlit as st

st.title("Simple ML App")
val = st.number_input("Enter a value:")
if st.button("Predict"):
    result = loaded_model.predict([[val]])
    st.write("Prediction:", result[0])

# Dockerfile example
FROM python:3.9
COPY . /app
WORKDIR /app
RUN pip install -r requirements.txt
CMD ["python", "app.py"]
# Required files: Procfile, requirements.txt
# Procfile content:
#   web: python app.py

from fastapi import FastAPI
from pydantic import BaseModel

app = FastAPI()

class Input(BaseModel):
    data: list

@app.post("/predict")
def predict(input: Input):
    return {"result": loaded_model.predict([input.data]).tolist()}

import logging

logging.basicConfig(filename='predictions.log', level=logging.INFO)
input_val = [2]
pred = loaded_model.predict([input_val])
logging.info(f"Input: {input_val}, Prediction: {pred}")

# Save models with versioning
joblib.dump(model, 'model_v1.pkl')
joblib.dump(model, 'model_v2.pkl')

# Load a specific version
v2 = joblib.load('model_v2.pkl')

# Instead of separate fit() calls, use pipelines to reduce errors and code clutter.

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit([[1], [2], [3], [4]], [0, 0, 1, 1])
print(pipe.predict([[1.5]]))

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression())
])
pipe.fit([[1], [2], [3], [4]], [0, 0, 1, 1])

# Useful when combining multiple column pipelines using ColumnTransformer

from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

combined = FeatureUnion([("pca", PCA(n_components=2)), ("scale", StandardScaler())])

from sklearn.model_selection import GridSearchCV

# Assumes the make_pipeline version of `pipe` above (step name 'logisticregression');
# cv=2 because this toy dataset has only 2 samples per class
grid = GridSearchCV(pipe, {'logisticregression__C': [0.1, 1, 10]}, cv=2)
grid.fit([[1], [2], [3], [4]], [0, 0, 1, 1])
import joblib

joblib.dump(pipe, 'pipeline.pkl')

print(pipe.named_steps['logisticregression'])

from sklearn.base import BaseEstimator, TransformerMixin

class MultiplyByTwo(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X * 2

import re

text = "Hello, World!"
cleaned = re.sub(r'[^\w\s]', '', text.lower())
print(cleaned)

from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
X = cv.fit_transform(["I love AI", "AI loves me"])
print(X.toarray())

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
X = tfidf.fit_transform(["I love AI", "AI loves me"])
print(X.toarray())

cv = CountVectorizer()
tfidf = TfidfVectorizer()

from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB

pipe = make_pipeline(CountVectorizer(), MultinomialNB())
pipe.fit(["spam spam", "ham email"], [1, 0])
print(pipe.predict(["spam now"]))

model = MultinomialNB()

from sklearn.cluster import KMeans

X = TfidfVectorizer().fit_transform(["dog cat", "apple orange", "cat dog", "fruit banana"])
model = KMeans(n_clusters=2).fit(X)
print(model.labels_)
from sklearn.decomposition import TruncatedSVD

X = TfidfVectorizer().fit_transform(["text one", "text two"])
svd = TruncatedSVD(n_components=2)
X_reduced = svd.fit_transform(X)
print(X_reduced)

vectorizer = CountVectorizer(ngram_range=(1, 2), stop_words='english')

texts = ["I love this!", "I hate that."]
labels = [1, 0]
pipe = make_pipeline(TfidfVectorizer(), MultinomialNB())
pipe.fit(texts, labels)
print(pipe.predict(["love it"]))

import numpy as np

y = [0] * 90 + [1] * 10
print("Class 0:", y.count(0), "Class 1:", y.count(1))

from imblearn.over_sampling import SMOTE
from collections import Counter

X = [[i] for i in range(100)]
y = [0] * 90 + [1] * 10
X_res, y_res = SMOTE().fit_resample(X, y)
print("New class distribution:", Counter(y_res))  # fit_resample returns arrays, so count with Counter

from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler()
X_res, y_res = rus.fit_resample(X, y)
print("Resampled:", len(y_res))

from sklearn.linear_model import LogisticRegression

model = LogisticRegression(class_weight='balanced')

from sklearn.metrics import classification_report

y_true = [0] * 95 + [1] * 5
y_pred = [0] * 90 + [1] * 10
print(classification_report(y_true, y_pred))

from imblearn.pipeline import Pipeline

pipe = Pipeline([('smote', SMOTE()), ('clf', LogisticRegression())])
pipe.fit(X, y)
# See: imblearn.over_sampling.ADASYN
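A minimal ADASYN sketch, mirroring the SMOTE example above (the class ratio is purely illustrative):

from imblearn.over_sampling import ADASYN
from collections import Counter

X = [[i] for i in range(100)]
y = [0] * 90 + [1] * 10
# ADASYN generates synthetic minority samples, focusing on harder-to-learn regions
X_res, y_res = ADASYN().fit_resample(X, y)
print("Resampled distribution:", Counter(y_res))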
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# Recent imblearn releases use `estimator=`; older versions called this `base_estimator=`
model = BalancedBaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=10)

# Adjust the decision threshold on predicted probabilities
model.fit(X_res, y_res)
probs = model.predict_proba(X_res)[:, 1]
y_pred = [1 if p > 0.3 else 0 for p in probs]
# Combine SMOTE + LogisticRegression + F1-score on fraud dataset.
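A hedged sketch of that combination; synthetic data stands in for a real fraud dataset, the imblearn pipeline applies SMOTE only inside each training fold, and F1 is used as the scoring metric:

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.datasets import make_classification

# Synthetic imbalanced data as a stand-in for real fraud records
X, y = make_classification(n_samples=1000, weights=[0.95, 0.05], random_state=42)
pipe = Pipeline([('smote', SMOTE(random_state=42)),
                 ('clf', LogisticRegression(max_iter=1000))])
scores = cross_val_score(pipe, X, y, cv=5, scoring='f1')
print("F1 scores per fold:", scores)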
# Simple agent example: a chatbot responding to user input
def agent(input_text):
    if "hello" in input_text.lower():
        return "Hello! How can I help you?"
    return "I don't understand."

print(agent("Hello there!"))

# Example: a reflex agent responds to keywords
def reflex_agent(input_text):
    if "weather" in input_text:
        return "Check the weather app."
    return "Sorry, I can't help."

print(reflex_agent("What's the weather?"))

# Email filter example
emails = ["Buy now!", "Meeting at 3pm", "Cheap meds"]
filtered = [e for e in emails if "buy" not in e.lower()]
print(filtered)

# Simplified simulation: two agents competing for a resource
agent1 = {"energy": 5}
agent2 = {"energy": 3}

def compete(a1, a2):
    winner = "Agent1" if a1['energy'] > a2['energy'] else "Agent2"
    return winner

print(compete(agent1, agent2))

import gym

env = gym.make('CartPole-v1')
obs = env.reset()  # In gym >= 0.26 (and gymnasium), reset() returns (observation, info)
print("Initial observation:", obs)
env.close()

# Simple scheduler example
def schedule_task(task, time):
    print(f"Scheduled '{task}' at {time}")

schedule_task("Backup", "02:00 AM")

# Basic command parser
def parse_command(text):
    if "turn on light" in text.lower():
        return "Lights turned on"
    return "Command not recognized"

print(parse_command("Please turn on light"))

# Pseudo-code for robot navigation
def move_robot(direction):
    print(f"Moving {direction}")

move_robot("forward")
# No direct code, but always consider ethical constraints!
# Imagine AI agents that learn from each other and adapt dynamically.
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

X = pd.DataFrame({'feature': [1, 2, 3, 4]})
y = np.array([2, 4, 6, 8])
model = LinearRegression()
model.fit(X, y)
print(model.predict(pd.DataFrame({'feature': [5]})))

import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
data = sns.load_dataset('iris')
sns.pairplot(data, hue='species')
plt.show()

from xgboost import XGBClassifier
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
# use_label_encoder=False was only needed on older XGBoost releases
model = XGBClassifier(eval_metric='mlogloss')
model.fit(X, y)
print(model.predict(X[:2]))

import lightgbm as lgb
from sklearn.datasets import load_breast_cancer

X, y = load_breast_cancer(return_X_y=True)
model = lgb.LGBMClassifier()
model.fit(X, y)
print(model.predict(X[:2]))

from catboost import CatBoostClassifier
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
model = CatBoostClassifier(verbose=0)
model.fit(X, y)
print(model.predict(X[:2]))

# Example Optuna tuning skeleton
import optuna
from sklearn.linear_model import LogisticRegression

def objective(trial):
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
    param = {'C': trial.suggest_float('C', 1e-3, 10, log=True)}
    model = LogisticRegression(max_iter=1000, **param)
    model.fit(X, y)
    return model.score(X, y)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)
print(study.best_params)

import mlflow
import mlflow.sklearn

with mlflow.start_run():
    model.fit(X, y)
    mlflow.sklearn.log_model(model, "model")

import dask.array as da

X = da.random.random((1000, 10), chunks=(100, 10))
# Train models with dask-ml wrappers

# Note: this Keras wrapper was removed from recent TensorFlow releases;
# the maintained replacement is the scikeras package (scikeras.wrappers.KerasClassifier)
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

def create_model():
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.layers import Dense
    model = Sequential([
        Dense(10, input_shape=(4,), activation='relu'),
        Dense(3, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

keras_model = KerasClassifier(build_fn=create_model, epochs=5, batch_size=10)
keras_model.fit(X, y)

import skl2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

initial_type = [('float_input', FloatTensorType([None, X.shape[1]]))]
onnx_model = convert_sklearn(model, initial_types=initial_type)
with open("model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

data = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
print(model.predict([X_test[0]]))

# Use logistic regression on customer data (pseudo-example)
# X, y = customer_features, churn_labels
# model = LogisticRegression().fit(X, y)

from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression

X, y = load_breast_cancer(return_X_y=True)
model = LogisticRegression(max_iter=1000)
model.fit(X, y)
print(model.predict(X[:2]))

from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

texts = ["Buy cheap meds", "Meeting at 10", "Cheap meds now", "Lunch tomorrow"]
labels = [1, 0, 1, 0]
pipe = make_pipeline(CountVectorizer(), MultinomialNB())
pipe.fit(texts, labels)
print(pipe.predict(["Cheap meds offer"]))

from sklearn.decomposition import PCA
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression

digits = load_digits()
pca = PCA(n_components=30)
X_pca = pca.fit_transform(digits.data)
model = LogisticRegression(max_iter=1000)
model.fit(X_pca, digits.target)
print(model.predict(X_pca[:1]))

from sklearn.cluster import KMeans
import numpy as np

X = np.array([[1, 2], [1, 4], [5, 8], [8, 8]])
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)
print(kmeans.labels_)
# Simple example: recommend movies based on user ratings similarity (pseudo-code)
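One way to flesh out that idea is to compute cosine similarity between user rating vectors; the ratings matrix below is made up for illustration:

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Rows = users, columns = movies, values = ratings (0 = unrated); toy data
ratings = np.array([
    [5, 4, 0, 1],
    [4, 5, 1, 0],
    [0, 1, 5, 4],
])
user_sim = cosine_similarity(ratings)
print(user_sim)  # Users 0 and 1 come out most similar, so recommend each other's movies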
# Use statsmodels for ARIMA and sklearn for ML models on residuals.
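A rough sketch of that hybrid idea, assuming statsmodels is installed; the toy series and the residual-modelling step are only illustrative:

import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.linear_model import LinearRegression

# Toy series: trend plus noise
series = pd.Series(np.arange(50, dtype=float) + np.random.randn(50))
arima = ARIMA(series, order=(1, 1, 1)).fit()
resid = arima.resid

# Model the residuals with a lag feature using scikit-learn
X_resid = resid.shift(1).dropna().to_frame()
y_resid = resid[1:]
resid_model = LinearRegression().fit(X_resid, y_resid)
print(arima.forecast(1), resid_model.coef_)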
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier

# X, y = fraud_data_features, fraud_labels
# X_res, y_res = SMOTE().fit_resample(X, y)
# model = RandomForestClassifier().fit(X_res, y_res)
# Example: stock prices over time vs random tabular data
import pandas as pd

df = pd.DataFrame({'value': [1, 2, 3, 4, 5]})
df['lag_1'] = df['value'].shift(1)
df['rolling_mean_2'] = df['value'].rolling(window=2).mean()
print(df)
# Convert series into supervised learning samples with windows
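A small sketch of one way to build such windows with pandas shift, turning the previous n values into features and the current value into the target:

import pandas as pd

series = pd.Series([1, 2, 3, 4, 5, 6])
window = 2
frame = pd.DataFrame({f'lag_{i}': series.shift(i) for i in range(1, window + 1)})
frame['target'] = series
frame = frame.dropna()
print(frame)  # Each row: the two previous values as features, the current value as target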
# See example above with lag and rolling_mean
from sklearn.model_selection import TimeSeriesSplit

tscv = TimeSeriesSplit(n_splits=3)
for train_index, test_index in tscv.split(df):
    print("TRAIN:", train_index, "TEST:", test_index)
# Pipelines can include custom transformers for lag features
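A hedged sketch of such a custom transformer (the class name is made up); it assumes a single-column numeric input and appends the lag-1 value as an extra feature:

import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

class LagFeatureAdder(BaseEstimator, TransformerMixin):
    """Append the previous value of the first column as a new feature (illustrative only)."""
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = np.asarray(X, dtype=float)
        lag = np.roll(X[:, 0], 1)
        lag[0] = X[0, 0]  # no previous value for the first row
        return np.column_stack([X, lag])

X = np.array([[1], [2], [3], [4], [5]])
y = np.array([2, 3, 4, 5, 6])
pipe = make_pipeline(LagFeatureAdder(), LinearRegression())
pipe.fit(X, y)
print(pipe.predict([[6]]))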
from sklearn.linear_model import LinearRegression
import numpy as np

X = np.array([[1], [2], [3]])
y = np.array([2, 3, 4])
model = LinearRegression().fit(X, y)
print(model.predict([[4]]))

import matplotlib.pyplot as plt
from pandas.plotting import autocorrelation_plot

autocorrelation_plot(df['value'])
plt.show()

# See the TimeSeriesSplit example above

# A practical example would combine all of the above techniques on sales data

from sklearn.preprocessing import OneHotEncoder
import numpy as np

data = np.array([['red'], ['green'], ['blue']])
encoder = OneHotEncoder(sparse_output=False)  # called 'sparse=False' in scikit-learn < 1.2
print(encoder.fit_transform(data))

from sklearn.preprocessing import OrdinalEncoder

data = np.array([['low'], ['medium'], ['high']])
encoder = OrdinalEncoder(categories=[['low', 'medium', 'high']])
print(encoder.fit_transform(data))

# Use the category_encoders library
# import category_encoders as ce
# encoder = ce.BinaryEncoder(cols=['feature'])
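A fuller sketch of the same idea, assuming the optional category_encoders package is installed:

# pip install category_encoders
import pandas as pd
import category_encoders as ce

df = pd.DataFrame({'feature': ['red', 'green', 'blue', 'green']})
encoder = ce.BinaryEncoder(cols=['feature'])
print(encoder.fit_transform(df))  # Each category becomes a short binary code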
# Use category_encoders or custom implementation
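As one example of a "custom implementation", a simple frequency encoding can be written in plain pandas (the data is illustrative):

import pandas as pd

df = pd.DataFrame({'city': ['NY', 'LA', 'NY', 'SF', 'NY']})
freq = df['city'].value_counts(normalize=True)  # relative frequency of each category
df['city_freq'] = df['city'].map(freq)
print(df)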
from sklearn.feature_extraction import FeatureHasher

data = [{'cat': 'dog'}, {'cat': 'fish'}, {'cat': 'dog'}]
hasher = FeatureHasher(input_type='dict')
print(hasher.transform(data).toarray())
# Consider grouping rare categories or using hashing
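A small sketch of grouping rare categories under a single label before encoding (the threshold is chosen arbitrarily):

import pandas as pd

s = pd.Series(['a', 'a', 'a', 'b', 'b', 'c', 'd'])
counts = s.value_counts()
rare = counts[counts < 2].index                # categories seen fewer than 2 times
s_grouped = s.where(~s.isin(rare), 'other')    # replace rare categories with 'other'
print(s_grouped.value_counts())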
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ('clf', LogisticRegression())
])

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), ['num_feature']),
    ('cat', OneHotEncoder(), ['cat_feature'])
])

# Example: OneHotEncoder(handle_unknown='ignore')

import matplotlib.pyplot as plt
import numpy as np

# 'encoder' and 'data' refer to the one-hot encoding example above
encoded = encoder.fit_transform(data)
plt.bar(range(encoded.shape[1]), np.sum(encoded, axis=0))
plt.show()

from sklearn.tree import DecisionTreeClassifier

X = [[0, 0], [1, 1]]
y = [0, 1]
model = DecisionTreeClassifier()
model.fit(X, y)
print(model.predict([[2, 2]]))
# Specify criterion in DecisionTreeClassifier(criterion='gini' or 'entropy')
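For example (entropy shown; 'gini' is the default):

from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier(criterion='entropy')
model.fit([[0, 0], [1, 1]], [0, 1])
print(model.predict([[2, 2]]))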
model = DecisionTreeClassifier(max_depth=3)
model.fit(X, y)

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

plot_tree(model)
plt.show()
print(model.feature_importances_)
model = DecisionTreeClassifier(max_depth=5, min_samples_split=4)
# Use export_text to print tree rules
from sklearn.tree import export_text

print(export_text(model))
# sklearn requires categorical features encoded
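A minimal sketch: encode string categories with OrdinalEncoder before fitting the tree (the data is illustrative):

from sklearn.preprocessing import OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier

X_cat = [['red'], ['green'], ['red'], ['blue']]
y = [0, 1, 0, 1]
enc = OrdinalEncoder()
X_enc = enc.fit_transform(X_cat)
clf = DecisionTreeClassifier().fit(X_enc, y)
print(clf.predict(enc.transform([['green']])))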
path = model.cost_complexity_pruning_path(X, y)
ccp_alphas = path.ccp_alphas
# Train tree on credit data, interpret splits for risk factors
# Interpretability crucial in sensitive fields like healthcare.
print(model.coef_)              # For linear models
print(rf.feature_importances_)  # For tree ensembles

# plot_partial_dependence was removed in scikit-learn 1.2; use PartialDependenceDisplay instead
from sklearn.inspection import PartialDependenceDisplay
import matplotlib.pyplot as plt

PartialDependenceDisplay.from_estimator(rf, X, [0, 1])
plt.show()

from sklearn.inspection import permutation_importance

result = permutation_importance(rf, X, y, n_repeats=10)
print(result.importances_mean)

import shap

explainer = shap.TreeExplainer(rf)
shap_values = explainer.shap_values(X)
shap.summary_plot(shap_values, X)
# Use lime package: lime.lime_tabular.LimeTabularExplainer
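A hedged sketch with the lime package (install with pip install lime); the model and data mirror the tree-ensemble examples above:

# pip install lime
import lime.lime_tabular
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

X, y = load_iris(return_X_y=True)
rf = RandomForestClassifier().fit(X, y)
explainer = lime.lime_tabular.LimeTabularExplainer(X, mode='classification')
exp = explainer.explain_instance(X[0], rf.predict_proba, num_features=4)
print(exp.as_list())  # Feature contributions for this single prediction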
# For a forest, decision_path returns the indicator matrix plus node pointers
node_indicator, n_nodes_ptr = rf.decision_path(X)
print(node_indicator.shape)

from sklearn.tree import plot_tree
import matplotlib.pyplot as plt

plot_tree(rf.estimators_[0])
plt.show()

# Use pipeline.named_steps to access individual components
print(pipe.named_steps['clf'].coef_)

# Evaluate bias and fairness metrics during model validation.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

from sklearn.preprocessing import QuantileTransformer

scaler = QuantileTransformer(output_distribution='normal')
X_scaled = scaler.fit_transform(X)

from sklearn.preprocessing import PowerTransformer

scaler = PowerTransformer()
X_scaled = scaler.fit_transform(X)

from sklearn.preprocessing import Normalizer

normalizer = Normalizer()
X_normalized = normalizer.fit_transform(X)

# StandardScaler supports sparse data with with_mean=False (X_sparse: a SciPy sparse matrix)
scaler = StandardScaler(with_mean=False)
X_scaled = scaler.fit_transform(X_sparse)

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression())
])
pipe.fit(X, y)

from sklearn.base import TransformerMixin

class CustomScaler(TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X / 100
# Always call fit on training set, transform on train/test
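In code, that rule looks like this: fit the scaler on the training split only, then reuse the fitted scaler on both splits.

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

X = np.arange(10).reshape(-1, 1)
y = np.arange(10)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
scaler = StandardScaler().fit(X_train)     # fit on training data only
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)   # reuse the same fitted scaler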
from sklearn.linear_model import SGDClassifier

model = SGDClassifier()
# stream_batches() is a placeholder for whatever yields (X_batch, y_batch) pairs
for X_batch, y_batch in stream_batches():
    model.partial_fit(X_batch, y_batch, classes=[0, 1])

from sklearn.linear_model import PassiveAggressiveClassifier

model = PassiveAggressiveClassifier()
model.partial_fit(X_train, y_train, classes=[0, 1])

import pandas as pd

for chunk in pd.read_csv("large_dataset.csv", chunksize=10000):
    process(chunk)

from scipy.sparse import csr_matrix

sparse_data = csr_matrix([[0, 0, 3], [4, 0, 0]])

import dask.dataframe as dd

df = dd.read_csv("large_dataset.csv")
print(df.head())

from sklearn.feature_extraction import FeatureHasher

hasher = FeatureHasher(n_features=8, input_type='string')
features = hasher.transform([["dog"], ["cat"], ["fish"]])

from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),
    ('clf', SGDClassifier())
])
# Feed batches one by one using partial_fit

for chunk in pd.read_csv("large.csv", chunksize=5000):
    scaled = scaler.transform(chunk)
    model.partial_fit(scaled, labels)

from sklearn.decomposition import PCA

pca = PCA(n_components=20)
X_reduced = pca.fit_transform(X)