from sklearn.preprocessing import StandardScaler | Import StandardScaler |
scaler = StandardScaler() | Create scaler (mean=0, std=1) |
X_scaled = scaler.fit_transform(X) | Fit and transform |
MinMaxScaler() | Scale to [0, 1] |
RobustScaler() | Robust to outliers |
Normalizer() | L2 normalize rows |
LabelEncoder() | Encode labels to integers |
le.fit_transform(y) | Fit and encode |
OneHotEncoder() | One-hot encoding |
OrdinalEncoder() | Ordinal encoding |
LabelBinarizer() | Binarize labels (one-vs-all indicator columns) |
from sklearn.impute import SimpleImputer | Import imputer |
SimpleImputer(strategy="mean") | Mean imputation |
SimpleImputer(strategy="median") | Median imputation |
SimpleImputer(strategy="most_frequent") | Mode imputation |
KNNImputer(n_neighbors=5) | KNN imputation |
SelectKBest(k=10) | Select K best features |
from sklearn.model_selection import train_test_split | Import split function |
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) | 80/20 split |
train_test_split(X, y, stratify=y) | Stratified split |
train_test_split(X, y, random_state=42) | Reproducible split |
from sklearn.model_selection import cross_val_score | Import CV score |
cross_val_score(model, X, y, cv=5) | 5-fold CV |
cross_val_predict(model, X, y, cv=5) | CV predictions |
KFold(n_splits=5, shuffle=True) | K-Fold splitter |
StratifiedKFold(n_splits=5) | Stratified K-Fold |
LeaveOneOut() | Leave-one-out CV |
from sklearn.model_selection import GridSearchCV | Import grid search |
GridSearchCV(model, param_grid, cv=5, scoring='accuracy') | Create grid search |
grid.fit(X_train, y_train) | Fit grid search |
grid.best_params_ | Best parameters |
grid.best_score_ | Best score |
RandomizedSearchCV(model, param_dist, n_iter=100) | Random search |
from sklearn.linear_model import LogisticRegression | Logistic Regression |
from sklearn.tree import DecisionTreeClassifier | Decision Tree |
from sklearn.ensemble import RandomForestClassifier | Random Forest |
from sklearn.svm import SVC | Support Vector Machine |
from sklearn.neighbors import KNeighborsClassifier | K-Nearest Neighbors |
from sklearn.naive_bayes import GaussianNB | Naive Bayes |
from sklearn.ensemble import GradientBoostingClassifier | Gradient Boosting |
model = RandomForestClassifier(n_estimators=100) | Create model |
model.fit(X_train, y_train) | Train model |
y_pred = model.predict(X_test) | Predict classes |
y_proba = model.predict_proba(X_test) | Predict probabilities |
model.score(X_test, y_test) | Accuracy score |
from sklearn.linear_model import LinearRegression | Linear Regression |
from sklearn.linear_model import Ridge | Ridge Regression (L2) |
from sklearn.linear_model import Lasso | Lasso Regression (L1) |
from sklearn.linear_model import ElasticNet | Elastic Net |
from sklearn.tree import DecisionTreeRegressor | Decision Tree (regression) |
from sklearn.ensemble import RandomForestRegressor | Random Forest (regression) |
from sklearn.svm import SVR | Support Vector Regression |
from sklearn.cluster import KMeans | K-Means |
KMeans(n_clusters=3) | Create KMeans |
labels = kmeans.fit_predict(X) | Fit and get labels |
kmeans.cluster_centers_ | Cluster centers |
DBSCAN(eps=0.5, min_samples=5) | Density-based clustering (DBSCAN) |
AgglomerativeClustering(n_clusters=3) | Hierarchical (agglomerative) clustering |
from sklearn.metrics import accuracy_score | Accuracy |
from sklearn.metrics import precision_score, recall_score, f1_score | Precision, Recall, F1 |
from sklearn.metrics import classification_report | Classification report |
from sklearn.metrics import confusion_matrix | Confusion matrix |
from sklearn.metrics import roc_auc_score | ROC AUC |
from sklearn.metrics import roc_curve | ROC curve |
from sklearn.metrics import mean_squared_error | MSE |
from sklearn.metrics import mean_absolute_error | MAE |
from sklearn.metrics import r2_score | R² score |
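mean_squared_error(y_true, y_pred, squared=False) | RMSE (squared= deprecated in scikit-learn 1.4, removed in 1.6; use root_mean_squared_error(y_true, y_pred) instead) |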
from sklearn.pipeline import Pipeline | Import Pipeline |
Pipeline([('scaler', StandardScaler()), ('clf', SVC())]) | Create pipeline |
from sklearn.pipeline import make_pipeline | Make pipeline helper |
make_pipeline(StandardScaler(), SVC()) | Auto-named pipeline |
pipe.fit(X_train, y_train) | Fit pipeline |
pipe.predict(X_test) | Predict with pipeline |