跳转至

02 - scikit-learn

学习时间: 10-12小时 重要性: ⭐⭐⭐⭐⭐ 传统机器学习的标准工具


🎯 学习目标

  • 掌握完整的数据预处理流程
  • 理解各种机器学习算法的原理和应用场景
  • 学会特征工程和特征选择
  • 掌握模型评估和超参数调优
  • 理解Pipeline的设计和使用
  • 学会模型解释和可视化

📚 内容概览

  1. 数据预处理
  2. 特征工程
  3. 监督学习算法
  4. 无监督学习
  5. 模型评估与选择
  6. Pipeline与模型组合
  7. 模型解释性

1. 数据预处理

1.1 数据加载与探索

Python
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris, fetch_california_housing, make_classification

# Load a built-in toy dataset (iris: 150 samples, 4 features, 3 classes).
iris = load_iris()
X, y = iris.data, iris.target
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Feature names: {iris.feature_names}")
print(f"Target names: {iris.target_names}")

# Generate a synthetic classification dataset.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,   # features carrying real signal
    n_redundant=5,      # linear combinations of the informative ones
    n_classes=2,
    random_state=42
)

# Wrap in a DataFrame for convenient exploration.
df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
df['target'] = y

# Basic exploration: preview, summary statistics, dtypes, missing counts.
print(df.head())
print(df.describe())
print(df.info())
print(df.isnull().sum())

1.2 处理缺失值

Python
from sklearn.impute import SimpleImputer, KNNImputer

# Small matrix with missing entries to demonstrate imputation.
X = np.array([[1, 2, np.nan],
              [4, np.nan, 6],
              [7, 8, 9]])

# Mean imputation.
imputer = SimpleImputer(strategy='mean')  # strategies: mean, median, most_frequent, constant
X_imputed = imputer.fit_transform(X)

# Median imputation.
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)

# Mode imputation (suitable for categorical features).
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)

# Constant-value imputation.
imputer = SimpleImputer(strategy='constant', fill_value=0)
X_imputed = imputer.fit_transform(X)

# KNN imputation (fill from the most similar samples).
imputer = KNNImputer(n_neighbors=2)
X_imputed = imputer.fit_transform(X)

# Iterative imputation (model each feature's missing values from the others).
# The experimental enable_* import must precede the IterativeImputer import.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

imputer = IterativeImputer(max_iter=10, random_state=0)
X_imputed = imputer.fit_transform(X)

1.3 特征缩放

Python
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer

X = np.array([[1, -1, 2],
              [2, 0, 0],
              [0, 1, -1]], dtype=float)

# Standardization (Z-score):
# x' = (x - mean) / std
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(f"Mean: {X_scaled.mean(axis=0)}")  # [0, 0, 0]
print(f"Std: {X_scaled.std(axis=0)}")    # [1, 1, 1]

# Min-max normalization (rescale to [0, 1]):
# x' = (x - min) / (max - min)
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
print(f"Min: {X_scaled.min(axis=0)}")    # [0, 0, 0]
print(f"Max: {X_scaled.max(axis=0)}")    # [1, 1, 1]

# Robust scaling (insensitive to outliers):
# uses the median and interquartile range instead of mean/std.
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)

# Normalize each SAMPLE (row) to unit norm.
normalizer = Normalizer(norm='l2')  # 'l1', 'l2', 'max'
X_normalized = normalizer.fit_transform(X)

# Persist and reload a fitted scaler so train/serve use identical statistics.
import joblib
joblib.dump(scaler, 'scaler.pkl')
scaler_loaded = joblib.load('scaler.pkl')

1.4 类别特征编码

Python
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder

# LabelEncoder - map class labels to integers (intended for the TARGET, not features).
# Classes are sorted alphabetically before encoding: bird=0, cat=1, dog=2.
le = LabelEncoder()
y = ['cat', 'dog', 'cat', 'bird', 'dog']
y_encoded = le.fit_transform(y)
print(y_encoded)  # [1 2 1 0 2]
print(le.classes_)  # ['bird' 'cat' 'dog']
print(le.inverse_transform([0, 1, 2]))  # ['bird' 'cat' 'dog']

# OneHotEncoder - one-hot encode categorical FEATURES.
X = [['male', 'US', 'Safari'],
     ['female', 'EU', 'Chrome'],
     ['male', 'EU', 'Firefox']]

encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X)
print(X_encoded)
print(encoder.categories_)
print(encoder.get_feature_names_out())

# Ignore categories unseen at fit time instead of raising at transform time.
encoder = OneHotEncoder(handle_unknown='ignore')

# OrdinalEncoder - encode ordered categories with an explicit order.
X = [['low', 'US'],
     ['high', 'EU'],
     ['medium', 'US']]
encoder = OrdinalEncoder(categories=[['low', 'medium', 'high'], ['EU', 'US']])
X_encoded = encoder.fit_transform(X)
print(X_encoded)

# TargetEncoder - encode categories by target statistics (useful for
# HIGH-CARDINALITY categorical features). Built into scikit-learn >= 1.3,
# so the third-party `category_encoders` package is not needed. Note that
# fit_transform applies internal cross-fitting to limit target leakage.
from sklearn.preprocessing import TargetEncoder

X = np.array(['A', 'B', 'A', 'C', 'B']).reshape(-1, 1)
y = np.array([1, 0, 1, 0, 1])
encoder = TargetEncoder()
X_encoded = encoder.fit_transform(X, y)

1.5 异常值检测

Python
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope

# Build data with injected outliers: 100 Gaussian inliers + 20 uniform outliers.
np.random.seed(42)
X_inliers = np.random.randn(100, 2)
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
X = np.vstack([X_inliers, X_outliers])

# Isolation Forest: isolates anomalies with random splits.
clf = IsolationForest(contamination=0.1, random_state=42)
y_pred = clf.fit_predict(X)
# fit_predict returns 1 for inliers, -1 for outliers.
outliers = X[y_pred == -1]

# Local Outlier Factor: compares local density to that of neighbors.
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
y_pred = clf.fit_predict(X)

# Elliptic Envelope (assumes the inliers are Gaussian-distributed).
clf = EllipticEnvelope(contamination=0.1)
y_pred = clf.fit_predict(X)

# Statistical approach: flag rows with any |z-score| > 3.
from scipy import stats
z_scores = np.abs(stats.zscore(X))
outliers = (z_scores > 3).any(axis=1)

2. 特征工程

2.1 多项式特征

Python
from sklearn.preprocessing import PolynomialFeatures

X = np.array([[0, 1],
              [2, 3],
              [4, 5]])

# Expand to degree-2 polynomial features (no constant bias column).
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
print(X_poly)
# Output columns: [x1, x2, x1^2, x1*x2, x2^2]

# Inspect generated feature names.
print(poly.get_feature_names_out())

# Interaction features only (keep cross terms, drop pure powers).
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X)

2.2 特征选择

Python
from sklearn.feature_selection import (
    SelectKBest, f_classif, f_regression,
    RFE, SelectFromModel, VarianceThreshold
)

# Variance threshold: drop near-constant (low-variance) features.
selector = VarianceThreshold(threshold=0.1)
X_selected = selector.fit_transform(X)

# Univariate selection.
# Classification: keep the 10 features with the highest ANOVA F-value.
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
print(selector.scores_)  # per-feature F statistic
print(selector.pvalues_)  # per-feature p-value

# Regression variant.
selector = SelectKBest(score_func=f_regression, k=10)
X_selected = selector.fit_transform(X, y)

# Recursive Feature Elimination (RFE): repeatedly drop the weakest feature.
from sklearn.linear_model import LogisticRegression

estimator = LogisticRegression()
selector = RFE(estimator, n_features_to_select=5, step=1)
X_selected = selector.fit_transform(X, y)
print(selector.support_)  # boolean mask of selected features
print(selector.ranking_)  # rank 1 = selected

# Model-based selection via feature importances.
from sklearn.ensemble import RandomForestClassifier

estimator = RandomForestClassifier()
selector = SelectFromModel(estimator, max_features=5)
X_selected = selector.fit_transform(X, y)
print(selector.get_support())

2.3 降维

Python
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# PCA (Principal Component Analysis): project onto top-variance directions.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total variance explained: {sum(pca.explained_variance_ratio_):.2%}")
print(f"Components shape: {pca.components_.shape}")

# A float n_components keeps enough components to explain 95% of variance.
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)

# LDA (Linear Discriminant Analysis) is supervised — it needs labels y.
lda = LinearDiscriminantAnalysis(n_components=2)
X_lda = lda.fit_transform(X, y)

# TruncatedSVD works on sparse matrices (no centering, unlike PCA).
svd = TruncatedSVD(n_components=50)
X_svd = svd.fit_transform(X)

3. 监督学习算法

3.1 线性模型

Python
from sklearn.linear_model import (
    LinearRegression, Ridge, Lasso, ElasticNet,
    LogisticRegression, SGDClassifier, SGDRegressor
)
from sklearn.datasets import make_regression, make_classification

# Generate regression data.
X, y = make_regression(n_samples=100, n_features=2, noise=10, random_state=42)

# Ordinary least squares.
model = LinearRegression()
model.fit(X, y)
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")
print(f"R² score: {model.score(X, y)}")

# Predict.
y_pred = model.predict(X)

# Ridge regression (L2 regularization).
model = Ridge(alpha=1.0)
model.fit(X, y)

# Lasso regression (L1 regularization; yields sparse coefficients).
model = Lasso(alpha=0.1)
model.fit(X, y)

# ElasticNet (mix of L1 and L2).
model = ElasticNet(alpha=0.1, l1_ratio=0.5)
model.fit(X, y)

# Logistic regression (classification).
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)

# NOTE: the former `multi_class` parameter was deprecated in scikit-learn 1.5
# and removed in 1.7 — multinomial handling is now chosen automatically, so
# it must not be passed anymore.
model = LogisticRegression(
    penalty='l2',           # regularization: 'l1', 'l2', 'elasticnet', or None
    C=1.0,                  # inverse of regularization strength
    solver='lbfgs',         # optimizer
    max_iter=1000
)
model.fit(X, y)
print(f"Classes: {model.classes_}")
print(f"Coefficients shape: {model.coef_.shape}")

# Predicted class probabilities.
y_proba = model.predict_proba(X)
print(y_proba[:5])

# SGD-based training (scales to large datasets; log_loss = logistic regression).
model = SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3)
model.fit(X, y)

3.2 树模型

Python
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_text, plot_tree
from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor,
    GradientBoostingClassifier, GradientBoostingRegressor,
    AdaBoostClassifier, AdaBoostRegressor,
    ExtraTreesClassifier, ExtraTreesRegressor
)

# Decision tree.
model = DecisionTreeClassifier(
    criterion='gini',       # split quality: 'gini' or 'entropy'
    max_depth=5,            # maximum tree depth
    min_samples_split=2,    # minimum samples required to split a node
    min_samples_leaf=1,     # minimum samples required at a leaf
    max_features='sqrt',    # features considered per split
    random_state=42
)
model.fit(X, y)

# Feature importances (Gini importance).
print(f"Feature importances: {model.feature_importances_}")

# Text rendering of the tree structure.
print(export_text(model, feature_names=[f'f{i}' for i in range(X.shape[1])]))

# Random forest.
model = RandomForestClassifier(
    n_estimators=100,       # number of trees
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=True,         # bootstrap-sample each tree's training set
    oob_score=True,         # evaluate on out-of-bag samples
    n_jobs=-1,              # use all CPU cores
    random_state=42
)
model.fit(X, y)
print(f"OOB score: {model.oob_score_}")

# Gradient boosting.
model = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,      # shrinkage per boosting step
    max_depth=3,
    subsample=0.8,          # fraction of samples used per iteration
    random_state=42
)
model.fit(X, y)

# Monitor training accuracy as boosting progresses.
# GradientBoostingClassifier has no `staged_score` method — the staged API is
# staged_predict / staged_predict_proba / staged_decision_function, so score
# each stage's predictions explicitly.
from sklearn.metrics import accuracy_score
for i, y_stage in enumerate(model.staged_predict(X)):
    if i % 10 == 0:
        print(f"Iteration {i}: {accuracy_score(y, y_stage):.4f}")

# AdaBoost over decision stumps.
model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    learning_rate=1.0,
    random_state=42
)
model.fit(X, y)

# Extra Trees (extremely randomized trees: random split thresholds).
model = ExtraTreesClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=2,
    random_state=42
)
model.fit(X, y)

3.3 支持向量机

Python
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR

# SVM classification.
model = SVC(
    C=1.0,                  # regularization parameter
    kernel='rbf',           # kernel: 'linear', 'poly', 'rbf', 'sigmoid'
    gamma='scale',          # kernel coefficient
    degree=3,               # degree for the polynomial kernel
    probability=True,       # enable probability estimates (extra CV cost)
    class_weight='balanced' # reweight classes to handle imbalance
)
model.fit(X, y)

# Predictions and probabilities.
y_pred = model.predict(X)
y_proba = model.predict_proba(X)

# Linear SVM (liblinear-based; faster on large datasets).
model = LinearSVC(C=1.0, max_iter=10000)
model.fit(X, y)

# SVM regression.
model = SVR(
    kernel='rbf',
    C=1.0,
    epsilon=0.1           # width of the epsilon-insensitive tube
)
model.fit(X, y)

3.4 K近邻

Python
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

# KNN classification.
model = KNeighborsClassifier(
    n_neighbors=5,          # K
    weights='uniform',      # 'uniform' or 'distance'
    algorithm='auto',       # nearest-neighbor search algorithm
    metric='minkowski',     # distance metric
    p=2                     # Minkowski power (p=2 -> Euclidean)
)
model.fit(X, y)

# Predict.
y_pred = model.predict(X)

# Query neighbors of the first 5 samples (distances and indices).
neighbors = model.kneighbors(X[:5], n_neighbors=3, return_distance=True)
print(neighbors)

# KNN regression.
model = KNeighborsRegressor(n_neighbors=5)
model.fit(X, y)

3.5 朴素贝叶斯

Python
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB

# Gaussian naive Bayes (continuous features).
model = GaussianNB()
model.fit(X, y)

# Predicted probabilities.
y_proba = model.predict_proba(X)

# Inspect class priors and per-class feature means/variances.
print(f"Class priors: {model.class_prior_}")
print(f"Class means: {model.theta_}")
print(f"Class variances: {model.var_}")

# Multinomial naive Bayes (count features, e.g. text bag-of-words).
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['This is the first document.',
          'This document is the second document.',
          'And this is the third one.']
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(corpus)
y_text = [0, 0, 1]

model = MultinomialNB()
model.fit(X_text, y_text)

4. 无监督学习

4.1 K-Means聚类

Python
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score

# Generate clustered data.
from sklearn.datasets import make_blobs
X, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)

# K-Means.
kmeans = KMeans(
    n_clusters=4,
    init='k-means++',       # smarter initial centers
    n_init=10,              # restarts; best result is kept
    max_iter=300,
    random_state=42
)
kmeans.fit(X)

# Cluster assignments and model diagnostics.
labels = kmeans.labels_
print(f"Cluster centers: {kmeans.cluster_centers_}")
print(f"Inertia: {kmeans.inertia_}")  # within-cluster sum of squares

# Assign new points to the learned clusters.
new_data = np.array([[0, 0], [4, 4]])
predictions = kmeans.predict(new_data)

# Choosing K: elbow (inertia) and silhouette curves.
inertias = []
silhouettes = []
K_range = range(2, 10)

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(X, kmeans.labels_))

# MiniBatchKMeans trades a little accuracy for speed on large datasets.
kmeans = MiniBatchKMeans(n_clusters=4, batch_size=100, random_state=42)
kmeans.fit(X)

4.2 层次聚类

Python
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage

# Agglomerative (bottom-up) clustering.
clustering = AgglomerativeClustering(
    n_clusters=4,
    linkage='ward'          # 'ward', 'complete', 'average', 'single'
)
labels = clustering.fit_predict(X)

# Compute the linkage matrix for a dendrogram plot.
linked = linkage(X, 'ward')
# dendrogram(linked)

4.3 DBSCAN

Python
from sklearn.cluster import DBSCAN

# DBSCAN (density-based clustering; no need to pick K in advance).
clustering = DBSCAN(
    eps=0.5,                # neighborhood radius
    min_samples=5,          # min points for a core point
    metric='euclidean'
)
labels = clustering.fit_predict(X)

# Label -1 marks noise points.
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
print(f"Estimated number of clusters: {n_clusters}")
print(f"Estimated number of noise points: {n_noise}")

5. 模型评估与选择

5.1 交叉验证

Python
from sklearn.model_selection import (
    cross_val_score, cross_validate, cross_val_predict,
    KFold, StratifiedKFold, LeaveOneOut, TimeSeriesSplit
)

# Plain 5-fold cross-validation with a single metric.
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"Scores: {scores}")
print(f"Mean: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

# Multiple metrics in one pass.
scoring = ['accuracy', 'precision', 'recall', 'f1']
scores = cross_validate(model, X, y, cv=5, scoring=scoring)
for metric, values in scores.items():
    if metric.startswith('test_'):
        print(f"{metric}: {values.mean():.3f} (+/- {values.std():.3f})")

# Stratified K-fold preserves class proportions in every fold.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv)

# Time-series split: train always precedes test chronologically.
cv = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(model, X, y, cv=cv)

# Leave-one-out (only practical for small datasets).
cv = LeaveOneOut()
scores = cross_val_score(model, X, y, cv=cv)

5.2 超参数调优

Python
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform

# Exhaustive parameter grid.
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid search with 5-fold CV over every combination.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)
grid_search.fit(X, y)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")

# Best model, refit on the full data.
best_model = grid_search.best_estimator_

# Full result table of every tried combination.
results = pd.DataFrame(grid_search.cv_results_)
print(results[['params', 'mean_test_score', 'std_test_score']].head())

# Randomized search (more efficient for large parameter spaces);
# distributions are sampled rather than enumerated.
param_distributions = {
    'n_estimators': randint(50, 500),
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10)
}

random_search = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions,
    n_iter=100,             # number of sampled configurations
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)
random_search.fit(X, y)

5.3 评估指标

Python
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report,
    mean_squared_error, mean_absolute_error, r2_score,
    roc_auc_score, roc_curve, precision_recall_curve,
    log_loss, matthews_corrcoef, cohen_kappa_score
)

# Classification metrics on toy labels/predictions/scores.
y_true = [0, 1, 0, 0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 1, 1, 0, 0, 1]
y_proba = [0.1, 0.9, 0.2, 0.7, 0.8, 0.3, 0.1, 0.95]

print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
print(f"Precision: {precision_score(y_true, y_pred):.4f}")
print(f"Recall: {recall_score(y_true, y_pred):.4f}")
print(f"F1: {f1_score(y_true, y_pred):.4f}")

# Per-class report (works for multiclass too).
print(classification_report(y_true, y_pred, target_names=['class_0', 'class_1']))

# Confusion matrix.
cm = confusion_matrix(y_true, y_pred)
print(cm)

# ROC-AUC uses the predicted scores, not the hard labels.
print(f"ROC-AUC: {roc_auc_score(y_true, y_proba):.4f}")

# Regression metrics.
y_true_reg = [3, -0.5, 2, 7]
y_pred_reg = [2.5, 0.0, 2, 8]

print(f"MSE: {mean_squared_error(y_true_reg, y_pred_reg):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_true_reg, y_pred_reg)):.4f}")
print(f"MAE: {mean_absolute_error(y_true_reg, y_pred_reg):.4f}")
print(f"R²: {r2_score(y_true_reg, y_pred_reg):.4f}")

6. Pipeline与模型组合

6.1 Pipeline构建

Python
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Simple two-step pipeline: scale, then classify.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression())
])

# NOTE(review): X_train/y_train/X_test are assumed to come from an earlier
# train_test_split not shown in this listing.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

# Inspect named steps and their fitted state.
print(pipeline.named_steps)
print(pipeline.named_steps['scaler'].mean_)

# make_pipeline names each step automatically (lowercased class name).
from sklearn.pipeline import make_pipeline

pipeline = make_pipeline(StandardScaler(), LogisticRegression())

# Steps are also accessible by index.
scaler = pipeline[0]
classifier = pipeline[1]

# Set a nested parameter via the step__param convention.
pipeline.set_params(classifier__C=10.0)

6.2 ColumnTransformer(不同特征不同处理)

Python
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Example schema: numeric and categorical columns processed differently.
numeric_features = ['age', 'income']
categorical_features = ['gender', 'city']

# Numeric branch: impute with the median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: impute a sentinel value, then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Full pipeline: preprocessing + model in one estimator.
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# NOTE(review): assumes X_train/y_train exist from an earlier split.
clf.fit(X_train, y_train)

6.3 模型集成

Python
from sklearn.ensemble import VotingClassifier, VotingRegressor, StackingClassifier, StackingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Voting classifier (hard voting: majority of predicted labels).
voting_clf = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression()),
        ('rf', RandomForestClassifier()),
        ('svc', SVC())
    ],
    voting='hard'  # 'hard' or 'soft'
)
voting_clf.fit(X_train, y_train)

# Soft voting averages predicted probabilities
# (every base estimator must support predict_proba).
voting_clf = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression()),
        ('rf', RandomForestClassifier()),
        ('nb', GaussianNB())
    ],
    voting='soft',
    weights=[2, 1, 1]  # per-estimator weights
)

# Stacking: a meta-model learns from base models' cross-validated outputs.
stacking_clf = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression()),
        ('rf', RandomForestClassifier()),
        ('svc', SVC(probability=True))
    ],
    final_estimator=LogisticRegression(),
    cv=5,
    stack_method='predict_proba'  # 'auto', 'predict_proba', 'decision_function', 'predict'
)
stacking_clf.fit(X_train, y_train)

7. 模型解释性

7.1 特征重要性

Python
# Impurity-based feature importances from a tree ensemble.
model = RandomForestClassifier()
model.fit(X, y)

importances = model.feature_importances_
indices = np.argsort(importances)[::-1]  # descending order

# Print the top-10 features.
for i in range(10):
    print(f"{i+1}. Feature {indices[i]}: {importances[indices[i]]:.4f}")

# Permutation importance works for ANY fitted model: shuffle one feature
# and measure the score drop.
from sklearn.inspection import permutation_importance

# NOTE(review): assumes X_test/y_test exist from an earlier split.
result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
for i in result.importances_mean.argsort()[::-1]:
    print(f"Feature {i}: {result.importances_mean[i]:.4f} +/- {result.importances_std[i]:.4f}")

7.2 SHAP值

Python
import shap

# Tree-model SHAP explainer.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)

# Global summary plot of feature contributions.
shap.summary_plot(shap_values, X_test)

# Force plot for a single prediction.
# NOTE(review): the list-style indexing (shap_values[1], expected_value[1])
# is the legacy shap API for binary classifiers; newer shap versions return
# a single Explanation object — verify against the installed shap version.
shap.force_plot(explainer.expected_value[1], shap_values[1][0], X_test[0])

# Dependence plot for feature 0.
shap.dependence_plot(0, shap_values[1], X_test)

7.3 部分依赖图

Python
from sklearn.inspection import partial_dependence, PartialDependenceDisplay

# Compute partial dependence for single features and a feature pair.
# (Result is discarded here; shown only to illustrate the call.)
features = [0, 1, (0, 1)]
partial_dependence(model, X_train, features=features)

# Plot one-dimensional partial dependence curves.
PartialDependenceDisplay.from_estimator(model, X_train, features=[0, 1])

📝 练习

练习1: 完整的数据预处理流程

Python
# 1. 加载一个真实数据集(如Titanic)
# 2. 处理缺失值
# 3. 编码类别特征
# 4. 缩放数值特征
# 5. 构建Pipeline

练习2: 模型比较与选择

Python
# 1. 使用交叉验证比较多个模型
# 2. 使用GridSearchCV调优最佳模型
# 3. 评估最终模型
# 4. 分析特征重要性

练习3: 端到端项目

Python
# 1. 选择一个数据集
# 2. 完成数据探索
# 3. 特征工程
# 4. 模型训练和调优
# 5. 模型评估和解释

🎯 自我检查

  • 掌握各种数据预处理方法
  • 理解特征工程的重要性
  • 熟悉常用的机器学习算法
  • 掌握模型评估和选择方法
  • 会使用Pipeline构建完整流程
  • 能解释模型结果

📚 延伸阅读

  • scikit-learn 官方用户指南(User Guide)与 API 参考
  • 《Hands-On Machine Learning with Scikit-Learn, Keras & TensorFlow》第 2-7 章
  • scikit-learn 官方示例库(Examples Gallery)

下一步: 03 - Hugging Face