跳转至

02 - scikit-learn

学习时间: 10-12小时 重要性: ⭐⭐⭐⭐⭐ 传统机器学习的标准工具


🎯 学习目标

  • 掌握完整的数据预处理流程
  • 理解各种机器学习算法的原理和应用场景
  • 学会特征工程和特征选择
  • 掌握模型评估和超参数调优
  • 理解Pipeline的设计和使用
  • 学会模型解释和可视化

📚 内容概览

  1. 数据预处理
  2. 特征工程
  3. 监督学习算法
  4. 无监督学习
  5. 模型评估与选择
  6. Pipeline与模型组合
  7. 模型解释性

1. 数据预处理

1.1 数据加载与探索

Python
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris, fetch_california_housing, make_classification

# Load a built-in toy dataset (iris: 150 samples, 4 features, 3 classes).
iris = load_iris()
X, y = iris.data, iris.target
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Feature names: {iris.feature_names}")
print(f"Target names: {iris.target_names}")

# Generate a synthetic classification dataset.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,   # features carrying real signal
    n_redundant=5,      # linear combinations of the informative ones
    n_classes=2,
    random_state=42
)

# Wrap in a DataFrame for convenient exploration.
df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
df['target'] = y

# Basic exploration: preview, summary statistics, dtypes, missing counts.
print(df.head())
print(df.describe())
print(df.info())
print(df.isnull().sum())

1.2 处理缺失值

Python
from sklearn.impute import SimpleImputer, KNNImputer

# Small matrix with missing entries to demonstrate imputation.
X = np.array([[1, 2, np.nan],
              [4, np.nan, 6],
              [7, 8, 9]])

# Mean imputation.
imputer = SimpleImputer(strategy='mean')  # strategies: mean, median, most_frequent, constant
X_imputed = imputer.fit_transform(X)

# Median imputation.
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

# Mode imputation (suitable for categorical features).
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)

# Constant-value imputation.
imputer = SimpleImputer(strategy='constant', fill_value=0)
X_imputed = imputer.fit_transform(X)

# KNN imputation (fill from the most similar samples).
imputer = KNNImputer(n_neighbors=2)
X_imputed = imputer.fit_transform(X)

# Iterative imputation (model each feature's missing values from the others).
# The experimental enable_* import must precede the IterativeImputer import.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(max_iter=10, random_state=0)
X_imputed = imputer.fit_transform(X)

1.3 特征缩放

Python
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer

X = np.array([[1, -1, 2],
              [2, 0, 0],
              [0, 1, -1]], dtype=float)

# Standardization (Z-score):
# x' = (x - mean) / std
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(f"Mean: {X_scaled.mean(axis=0)}")  # [0, 0, 0]
print(f"Std: {X_scaled.std(axis=0)}")    # [1, 1, 1]

# Min-max normalization (rescale to [0, 1]):
# x' = (x - min) / (max - min)
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
print(f"Min: {X_scaled.min(axis=0)}")    # [0, 0, 0]
print(f"Max: {X_scaled.max(axis=0)}")    # [1, 1, 1]

# Robust scaling (insensitive to outliers):
# uses the median and interquartile range instead of mean/std.
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Normalize each SAMPLE (row) to unit norm.
normalizer = Normalizer(norm='l2')  # 'l1', 'l2', 'max'
X_normalized = normalizer.fit_transform(X)

# Persist and reload a fitted scaler so train/serve use identical statistics.
import joblib
joblib.dump(scaler, 'scaler.pkl')
scaler_loaded = joblib.load('scaler.pkl')

1.4 类别特征编码

Python
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

# LabelEncoder - map class labels to integers (intended for the TARGET, not features).
# Classes are sorted alphabetically before encoding: bird=0, cat=1, dog=2.
le = LabelEncoder()
y = ['cat', 'dog', 'cat', 'bird', 'dog']
y_encoded = le.fit_transform(y)
print(y_encoded)  # [1 2 1 0 2]
print(le.classes_)  # ['bird' 'cat' 'dog']
print(le.inverse_transform([0, 1, 2]))  # ['bird' 'cat' 'dog']

# OneHotEncoder - one-hot encode categorical FEATURES.
X = [['male', 'US', 'Safari'],
     ['female', 'EU', 'Chrome'],
     ['male', 'EU', 'Firefox']]

encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X)
print(X_encoded)
print(encoder.categories_)
print(encoder.get_feature_names_out())

# Ignore categories unseen at fit time instead of raising at transform time.
encoder = OneHotEncoder(handle_unknown='ignore')

# OrdinalEncoder - encode ordered categories with an explicit order.
X = [['low', 'US'],
     ['high', 'EU'],
     ['medium', 'US']]
encoder = OrdinalEncoder(categories=[['low', 'medium', 'high'], ['EU', 'US']])
X_encoded = encoder.fit_transform(X)
print(X_encoded)

# TargetEncoder - encode categories by target statistics (useful for
# HIGH-CARDINALITY categorical features). Built into scikit-learn >= 1.3,
# so the third-party `category_encoders` package is not needed. Note that
# fit_transform applies internal cross-fitting to limit target leakage.
from sklearn.preprocessing import TargetEncoder

X = np.array(['A', 'B', 'A', 'C', 'B']).reshape(-1, 1)
y = np.array([1, 0, 1, 0, 1])
encoder = TargetEncoder()
X_encoded = encoder.fit_transform(X, y)

1.5 异常值检测

Python
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope

# Build data with injected outliers: 100 Gaussian inliers + 20 uniform outliers.
np.random.seed(42)
X_inliers = np.random.randn(100, 2)
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
X = np.vstack([X_inliers, X_outliers])

# Isolation Forest: isolates anomalies with random splits.
clf = IsolationForest(contamination=0.1, random_state=42)
y_pred = clf.fit_predict(X)
# fit_predict returns 1 for inliers, -1 for outliers.
outliers = X[y_pred == -1]

# Local Outlier Factor: compares local density to that of neighbors.
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
y_pred = clf.fit_predict(X)

# Elliptic Envelope (assumes the inliers are Gaussian-distributed).
clf = EllipticEnvelope(contamination=0.1)
y_pred = clf.fit_predict(X)

# Statistical approach: flag rows with any |z-score| > 3.
from scipy import stats
z_scores = np.abs(stats.zscore(X))
outliers = (z_scores > 3).any(axis=1)

2. 特征工程

2.1 多项式特征

Python
from sklearn.preprocessing import PolynomialFeatures

X = np.array([[0, 1],
              [2, 3],
              [4, 5]])

# Expand to degree-2 polynomial features (no constant bias column).
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
print(X_poly)
# Output columns: [x1, x2, x1^2, x1*x2, x2^2]

# Inspect generated feature names.
print(poly.get_feature_names_out())

# Interaction features only (keep cross terms, drop pure powers).
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X)

2.2 特征选择

Python
from sklearn.feature_selection import (
    SelectKBest, f_classif, f_regression,
    RFE, SelectFromModel, VarianceThreshold
)

# Variance threshold: drop near-constant (low-variance) features.
selector = VarianceThreshold(threshold=0.1)
X_selected = selector.fit_transform(X)

# Univariate selection.
# Classification: keep the 10 features with the highest ANOVA F-value.
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
print(selector.scores_)  # per-feature F statistic
print(selector.pvalues_)  # per-feature p-value

# Regression variant.
selector = SelectKBest(score_func=f_regression, k=10)
X_selected = selector.fit_transform(X, y)

# Recursive Feature Elimination (RFE): repeatedly drop the weakest feature.
from sklearn.linear_model import LogisticRegression

estimator = LogisticRegression()
selector = RFE(estimator, n_features_to_select=5, step=1)
X_selected = selector.fit_transform(X, y)
print(selector.support_)  # boolean mask of selected features
print(selector.ranking_)  # rank 1 = selected

# Model-based selection via feature importances.
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier()
selector = SelectFromModel(estimator, max_features=5)
X_selected = selector.fit_transform(X, y)
print(selector.get_support())

2.3 降维

Python
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# PCA (Principal Component Analysis): project onto top-variance directions.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total variance explained: {sum(pca.explained_variance_ratio_):.2%}")
print(f"Components shape: {pca.components_.shape}")

# A float n_components keeps enough components to explain 95% of variance.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)

# LDA (Linear Discriminant Analysis) is supervised — it needs labels y.
lda = LinearDiscriminantAnalysis(n_components=2)
X_lda = lda.fit_transform(X, y)

# TruncatedSVD works on sparse matrices (no centering, unlike PCA).
svd = TruncatedSVD(n_components=50)
X_svd = svd.fit_transform(X)

3. 监督学习算法

3.1 线性模型

Python
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, ElasticNet,
    LogisticRegression, SGDClassifier, SGDRegressor
)
from sklearn.datasets import make_regression, make_classification

# Generate regression data.
X, y = make_regression(n_samples=100, n_features=2, noise=10, random_state=42)

# Ordinary least squares.
model = LinearRegression()
model.fit(X, y)
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")
print(f"R² score: {model.score(X, y)}")

# Predict.
y_pred = model.predict(X)

# Ridge regression (L2 regularization).
model = Ridge(alpha=1.0)
model.fit(X, y)

# Lasso regression (L1 regularization; yields sparse coefficients).
model = Lasso(alpha=0.1)
model.fit(X, y)

# ElasticNet (mix of L1 and L2).
model = ElasticNet(alpha=0.1, l1_ratio=0.5)
model.fit(X, y)

# Logistic regression (classification).
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# NOTE: the former `multi_class` parameter was deprecated in scikit-learn 1.5
# and removed in 1.7 — multinomial handling is now chosen automatically, so
# it must not be passed anymore.
model = LogisticRegression(
    penalty='l2',           # regularization: 'l1', 'l2', 'elasticnet', or None
    C=1.0,                  # inverse of regularization strength
    solver='lbfgs',         # optimizer
    max_iter=1000
)
model.fit(X, y)
print(f"Classes: {model.classes_}")
print(f"Coefficients shape: {model.coef_.shape}")

# Predicted class probabilities.
y_proba = model.predict_proba(X)
print(y_proba[:5])

# SGD-based training (scales to large datasets; log_loss = logistic regression).
model = SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3)
model.fit(X, y)

3.2 树模型

Python
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_text, plot_tree
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor,
    GradientBoostingClassifier, GradientBoostingRegressor,
    AdaBoostClassifier, AdaBoostRegressor,
    ExtraTreesClassifier, ExtraTreesRegressor
)

# Decision tree.
model = DecisionTreeClassifier(
    criterion='gini',       # split quality: 'gini' or 'entropy'
    max_depth=5,            # maximum tree depth
    min_samples_split=2,    # minimum samples required to split a node
    min_samples_leaf=1,     # minimum samples required at a leaf
    max_features='sqrt',    # features considered per split
    random_state=42
)
model.fit(X, y)

# Feature importances (Gini importance).
print(f"Feature importances: {model.feature_importances_}")

# Text rendering of the tree structure.
print(export_text(model, feature_names=[f'f{i}' for i in range(X.shape[1])]))

# Random forest.
model = RandomForestClassifier(
    n_estimators=100,       # number of trees
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=True,         # bootstrap-sample each tree's training set
    oob_score=True,         # evaluate on out-of-bag samples
    n_jobs=-1,              # use all CPU cores
    random_state=42
)
model.fit(X, y)
print(f"OOB score: {model.oob_score_}")

# Gradient boosting.
model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,      # shrinkage per boosting step
    max_depth=3,
    subsample=0.8,          # fraction of samples used per iteration
    random_state=42
)
model.fit(X, y)

# Monitor training accuracy as boosting progresses.
# GradientBoostingClassifier has no `staged_score` method — the staged API is
# staged_predict / staged_predict_proba / staged_decision_function, so score
# each stage's predictions explicitly.
from sklearn.metrics import accuracy_score
for i, y_stage in enumerate(model.staged_predict(X)):
    if i % 10 == 0:
        print(f"Iteration {i}: {accuracy_score(y, y_stage):.4f}")

# AdaBoost over decision stumps.
model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    learning_rate=1.0,
    random_state=42
)
model.fit(X, y)

# Extra Trees (extremely randomized trees: random split thresholds).
model = ExtraTreesClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    random_state=42
)
model.fit(X, y)

3.3 支持向量机

Python
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR

# SVM classification.
model = SVC(
    C=1.0,                  # regularization parameter
    kernel='rbf',           # kernel: 'linear', 'poly', 'rbf', 'sigmoid'
    gamma='scale',          # kernel coefficient
    degree=3,               # degree for the polynomial kernel
    probability=True,       # enable probability estimates (extra CV cost)
    class_weight='balanced' # reweight classes to handle imbalance
)
model.fit(X, y)

# Predictions and probabilities.
y_pred = model.predict(X)
y_proba = model.predict_proba(X)

# Linear SVM (liblinear-based; faster on large datasets).
model = LinearSVC(C=1.0, max_iter=10000)
model.fit(X, y)

# SVM regression.
model = SVR(
    kernel='rbf',
    C=1.0,
    epsilon=0.1           # width of the epsilon-insensitive tube
)
model.fit(X, y)

3.4 K近邻

Python
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

# KNN classification.
model = KNeighborsClassifier(
    n_neighbors=5,          # K
    weights='uniform',      # 'uniform' or 'distance'
    algorithm='auto',       # nearest-neighbor search algorithm
    metric='minkowski',     # distance metric
    p=2                     # Minkowski power (p=2 -> Euclidean)
)
model.fit(X, y)

# Predict.
y_pred = model.predict(X)

# Query neighbors of the first 5 samples (distances and indices).
neighbors = model.kneighbors(X[:5], n_neighbors=3, return_distance=True)
print(neighbors)

# KNN regression.
model = KNeighborsRegressor(n_neighbors=5)
model.fit(X, y)

3.5 朴素贝叶斯

Python
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB

# Gaussian naive Bayes (continuous features).
model = GaussianNB()
model.fit(X, y)

# Predicted probabilities.
y_proba = model.predict_proba(X)

# Inspect class priors and per-class feature means/variances.
print(f"Class priors: {model.class_prior_}")
print(f"Class means: {model.theta_}")
print(f"Class variances: {model.var_}")

# Multinomial naive Bayes (count features, e.g. text bag-of-words).
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['This is the first document.',
          'This document is the second document.',
          'And this is the third one.']
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(corpus)
y_text = [0, 0, 1]

model = MultinomialNB()
model.fit(X_text, y_text)

4. 无监督学习

4.1 K-Means聚类

Python
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score

# Generate clustered data.
from sklearn.datasets import make_blobs
X, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)

# K-Means.
kmeans = KMeans(
    n_clusters=4,
    init='k-means++',       # smarter initial centers
    n_init=10,              # restarts; best result is kept
    max_iter=300,
    random_state=42
)
kmeans.fit(X)

# Cluster assignments and model diagnostics.
labels = kmeans.labels_
print(f"Cluster centers: {kmeans.cluster_centers_}")
print(f"Inertia: {kmeans.inertia_}")  # within-cluster sum of squares

# Assign new points to the learned clusters.
new_data = np.array([[0, 0], [4, 4]])
predictions = kmeans.predict(new_data)

# Choosing K: elbow (inertia) and silhouette curves.
inertias = []
silhouettes = []
K_range = range(2, 10)

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(X, kmeans.labels_))

# MiniBatchKMeans trades a little accuracy for speed on large datasets.
kmeans = MiniBatchKMeans(n_clusters=4, batch_size=100, random_state=42)
kmeans.fit(X)

4.2 层次聚类

Python
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage

# Agglomerative (bottom-up) clustering.
clustering = AgglomerativeClustering(
    n_clusters=4,
    linkage='ward'          # 'ward', 'complete', 'average', 'single'
)
labels = clustering.fit_predict(X)

# Compute the linkage matrix for a dendrogram plot.
linked = linkage(X, 'ward')
# dendrogram(linked)

4.3 DBSCAN

Python
from sklearn.cluster import DBSCAN

# DBSCAN (density-based clustering; no need to pick K in advance).
clustering = DBSCAN(
    eps=0.5,                # neighborhood radius
    min_samples=5,          # min points for a core point
    metric='euclidean'
)
labels = clustering.fit_predict(X)

# Label -1 marks noise points.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
print(f"Estimated number of clusters: {n_clusters}")
print(f"Estimated number of noise points: {n_noise}")

5. 模型评估与选择

5.1 交叉验证

Python
from sklearn.model_selection import (
    cross_val_score, cross_validate, cross_val_predict,
    KFold, StratifiedKFold, LeaveOneOut, TimeSeriesSplit
)

# Plain 5-fold cross-validation with a single metric.
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"Scores: {scores}")
print(f"Mean: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# Multiple metrics in one pass.
scoring = ['accuracy', 'precision', 'recall', 'f1']
scores = cross_validate(model, X, y, cv=5, scoring=scoring)
for metric, values in scores.items():
    if metric.startswith('test_'):
        print(f"{metric}: {values.mean():.3f} (+/- {values.std():.3f})")

# Stratified K-fold preserves class proportions in every fold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv)

# Time-series split: train always precedes test chronologically.
cv = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(model, X, y, cv=cv)

# Leave-one-out (only practical for small datasets).
cv = LeaveOneOut()
scores = cross_val_score(model, X, y, cv=cv)

5.2 超参数调优

Python
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform

# Exhaustive parameter grid.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search with 5-fold CV over every combination.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X, y)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")

# Best model, refit on the full data.
best_model = grid_search.best_estimator_

# Full result table of every tried combination.
results = pd.DataFrame(grid_search.cv_results_)
print(results[['params', 'mean_test_score', 'std_test_score']].head())

# Randomized search (more efficient for large parameter spaces);
# distributions are sampled rather than enumerated.
param_distributions = {
    'n_estimators': randint(50, 500),
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10)
}

random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions,
    n_iter=100,             # number of sampled configurations
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)
random_search.fit(X, y)

5.3 评估指标

Python
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report,
    mean_squared_error, mean_absolute_error, r2_score,
    roc_auc_score, roc_curve, precision_recall_curve,
    log_loss, matthews_corrcoef, cohen_kappa_score
)

# Classification metrics on toy labels/predictions/scores.
y_true = [0, 1, 0, 0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 1, 1, 0, 0, 1]
y_proba = [0.1, 0.9, 0.2, 0.7, 0.8, 0.3, 0.1, 0.95]

print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
print(f"Precision: {precision_score(y_true, y_pred):.4f}")
print(f"Recall: {recall_score(y_true, y_pred):.4f}")
print(f"F1: {f1_score(y_true, y_pred):.4f}")

# Per-class report (works for multiclass too).
print(classification_report(y_true, y_pred, target_names=['class_0', 'class_1']))

# Confusion matrix.
cm = confusion_matrix(y_true, y_pred)
print(cm)

# ROC-AUC uses the predicted scores, not the hard labels.
print(f"ROC-AUC: {roc_auc_score(y_true, y_proba):.4f}")

# Regression metrics.
y_true_reg = [3, -0.5, 2, 7]
y_pred_reg = [2.5, 0.0, 2, 8]

print(f"MSE: {mean_squared_error(y_true_reg, y_pred_reg):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_true_reg, y_pred_reg)):.4f}")
print(f"MAE: {mean_absolute_error(y_true_reg, y_pred_reg):.4f}")
print(f"R²: {r2_score(y_true_reg, y_pred_reg):.4f}")

6. Pipeline与模型组合

6.1 Pipeline构建

Python
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Simple two-step pipeline: scale, then classify.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

# NOTE(review): X_train/y_train/X_test are assumed to come from an earlier
# train_test_split not shown in this listing.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Inspect named steps and their fitted state.
print(pipeline.named_steps)
print(pipeline.named_steps['scaler'].mean_)

# make_pipeline names each step automatically (lowercased class name).
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(StandardScaler(), LogisticRegression())

# Steps are also accessible by index.
scaler = pipeline[0]
classifier = pipeline[1]

# Set a nested parameter via the step__param convention.
pipeline.set_params(classifier__C=10.0)

6.2 ColumnTransformer(不同特征不同处理)

Python
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Example schema: numeric and categorical columns processed differently.
numeric_features = ['age', 'income']
categorical_features = ['gender', 'city']

# Numeric branch: impute with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: impute a sentinel value, then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Full pipeline: preprocessing + model in one estimator.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# NOTE(review): assumes X_train/y_train exist from an earlier split.
clf.fit(X_train, y_train)

6.3 模型集成

Python
from sklearn.ensemble import VotingClassifier, VotingRegressor, StackingClassifier, StackingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Voting classifier (hard voting: majority of predicted labels).
voting_clf = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression()),
        ('rf', RandomForestClassifier()),
        ('svc', SVC())
    ],
    voting='hard'  # 'hard' or 'soft'
)
voting_clf.fit(X_train, y_train)

# Soft voting averages predicted probabilities
# (every base estimator must support predict_proba).
voting_clf = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression()),
        ('rf', RandomForestClassifier()),
        ('nb', GaussianNB())
    ],
    voting='soft',
    weights=[2, 1, 1]  # per-estimator weights
)

# Stacking: a meta-model learns from base models' cross-validated outputs.
stacking_clf = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression()),
        ('rf', RandomForestClassifier()),
        ('svc', SVC(probability=True))
    ],
    final_estimator=LogisticRegression(),
    cv=5,
    stack_method='predict_proba'  # 'auto', 'predict_proba', 'decision_function', 'predict'
)
stacking_clf.fit(X_train, y_train)

7. 模型解释性

7.1 特征重要性

Python
# Impurity-based feature importances from a tree ensemble.
model = RandomForestClassifier()
model.fit(X, y)

importances = model.feature_importances_
indices = np.argsort(importances)[::-1]  # descending order

# Print the top-10 features.
for i in range(10):
    print(f"{i+1}. Feature {indices[i]}: {importances[indices[i]]:.4f}")

# Permutation importance works for ANY fitted model: shuffle one feature
# and measure the score drop.
from sklearn.inspection import permutation_importance

# NOTE(review): assumes X_test/y_test exist from an earlier split.
result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
for i in result.importances_mean.argsort()[::-1]:
    print(f"Feature {i}: {result.importances_mean[i]:.4f} +/- {result.importances_std[i]:.4f}")

7.2 SHAP值

Python
import shap

# Tree-model SHAP explainer.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Global summary plot of feature contributions.
shap.summary_plot(shap_values, X_test)

# Force plot for a single prediction.
# NOTE(review): the list-style indexing (shap_values[1], expected_value[1])
# is the legacy shap API for binary classifiers; newer shap versions return
# a single Explanation object — verify against the installed shap version.
shap.force_plot(explainer.expected_value[1], shap_values[1][0], X_test[0])

# Dependence plot for feature 0.
shap.dependence_plot(0, shap_values[1], X_test)

7.3 部分依赖图

Python
from sklearn.inspection import partial_dependence, PartialDependenceDisplay

# Compute partial dependence for single features and a feature pair.
# (Result is discarded here; shown only to illustrate the call.)
features = [0, 1, (0, 1)]
partial_dependence(model, X_train, features=features)

# Plot one-dimensional partial dependence curves.
PartialDependenceDisplay.from_estimator(model, X_train, features=[0, 1])

📝 练习

练习1: 完整的数据预处理流程

Python
# 1. 加载一个真实数据集(如Titanic)
# 2. 处理缺失值
# 3. 编码类别特征
# 4. 缩放数值特征
# 5. 构建Pipeline

练习2: 模型比较与选择

Python
# 1. 使用交叉验证比较多个模型
# 2. 使用GridSearchCV调优最佳模型
# 3. 评估最终模型
# 4. 分析特征重要性

练习3: 端到端项目

Python
# 1. 选择一个数据集
# 2. 完成数据探索
# 3. 特征工程
# 4. 模型训练和调优
# 5. 模型评估和解释

🎯 自我检查

  • 掌握各种数据预处理方法
  • 理解特征工程的重要性
  • 熟悉常用的机器学习算法
  • 掌握模型评估和选择方法
  • 会使用Pipeline构建完整流程
  • 能解释模型结果

📚 延伸阅读

  • scikit-learn 官方用户指南(User Guide)与 API 参考
  • 《Hands-On Machine Learning with Scikit-Learn, Keras & TensorFlow》第 2-7 章
  • scikit-learn 官方示例库(Examples Gallery)

下一步: 03 - Hugging Face