02 - scikit-learn¶
学习时间: 10-12小时 重要性: ⭐⭐⭐⭐⭐ 传统机器学习的标准工具
🎯 学习目标¶
- 掌握完整的数据预处理流程
- 理解各种机器学习算法的原理和应用场景
- 学会特征工程和特征选择
- 掌握模型评估和超参数调优
- 理解Pipeline的设计和使用
- 学会模型解释和可视化
📚 内容概览¶
1. 数据预处理¶
1.1 数据加载与探索¶
Python
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris, fetch_california_housing, make_classification
# 加载内置数据集
iris = load_iris()
X, y = iris.data, iris.target
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Feature names: {iris.feature_names}")
print(f"Target names: {iris.target_names}")
# 生成模拟数据
X, y = make_classification(
n_samples=1000,
n_features=20,
n_informative=10,
n_redundant=5,
n_classes=2,
random_state=42
)
# 转换为DataFrame
df = pd.DataFrame(X, columns=[f'feature_{i}' for i in range(X.shape[1])])
df['target'] = y
# 数据探索
print(df.head())
print(df.describe())
print(df.info())
print(df.isnull().sum())
1.2 处理缺失值¶
Python
from sklearn.impute import SimpleImputer, KNNImputer
# 创建含缺失值的数据
X = np.array([[1, 2, np.nan],
[4, np.nan, 6],
[7, 8, 9]])
# 简单填充
imputer = SimpleImputer(strategy='mean') # 策略: mean, median, most_frequent, constant
X_imputed = imputer.fit_transform(X)
# 中位数填充
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
# 众数填充(适用于分类特征)
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = imputer.fit_transform(X)
# 常数填充
imputer = SimpleImputer(strategy='constant', fill_value=0)
X_imputed = imputer.fit_transform(X)
# KNN填充(利用相似样本填充)
imputer = KNNImputer(n_neighbors=2)
X_imputed = imputer.fit_transform(X)
# 迭代填充(利用其他特征预测缺失值)
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
imputer = IterativeImputer(max_iter=10, random_state=0)
X_imputed = imputer.fit_transform(X)
1.3 特征缩放¶
Python
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, Normalizer
X = np.array([[1, -1, 2],
[2, 0, 0],
[0, 1, -1]], dtype=float)
# 标准化 (Z-score标准化)
# x' = (x - mean) / std
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(f"Mean: {X_scaled.mean(axis=0)}") # [0, 0, 0]
print(f"Std: {X_scaled.std(axis=0)}") # [1, 1, 1]
# 归一化 (缩放到[0,1])
# x' = (x - min) / (max - min)
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
print(f"Min: {X_scaled.min(axis=0)}") # [0, 0, 0]
print(f"Max: {X_scaled.max(axis=0)}") # [1, 1, 1]
# 稳健标准化(对异常值不敏感)
# 使用中位数和四分位数
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
# 归一化到单位范数(每个样本)
normalizer = Normalizer(norm='l2') # 'l1', 'l2', 'max'
X_normalized = normalizer.fit_transform(X)
# 保存和加载scaler
import joblib
joblib.dump(scaler, 'scaler.pkl')
scaler_loaded = joblib.load('scaler.pkl')
1.4 类别特征编码¶
Python
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder
# LabelEncoder - 将类别转换为整数(适用于目标变量)
le = LabelEncoder()
y = ['cat', 'dog', 'cat', 'bird', 'dog']
y_encoded = le.fit_transform(y)
print(y_encoded) # [1, 2, 1, 0, 2]
print(le.classes_) # ['bird' 'cat' 'dog']
print(le.inverse_transform([0, 1, 2])) # ['bird' 'cat' 'dog']
# OneHotEncoder - 独热编码(适用于特征)
X = [['male', 'US', 'Safari'],
['female', 'EU', 'Chrome'],
['male', 'EU', 'Firefox']]
encoder = OneHotEncoder(sparse_output=False)
X_encoded = encoder.fit_transform(X)
print(X_encoded)
print(encoder.categories_)
print(encoder.get_feature_names_out())
# 处理未知类别
encoder = OneHotEncoder(handle_unknown='ignore')
# OrdinalEncoder - 有序类别编码
X = [['low', 'US'],
['high', 'EU'],
['medium', 'US']]
encoder = OrdinalEncoder(categories=[['low', 'medium', 'high'], ['EU', 'US']])
X_encoded = encoder.fit_transform(X)
print(X_encoded)
# TargetEncoder - 基于目标变量的编码(高基数类别特征时效果好)
from category_encoders import TargetEncoder
X = np.array(['A', 'B', 'A', 'C', 'B']).reshape(-1, 1)
y = np.array([1, 0, 1, 0, 1])
encoder = TargetEncoder()
X_encoded = encoder.fit_transform(X, y)
1.5 异常值检测¶
Python
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.covariance import EllipticEnvelope
# 生成含异常值的数据
np.random.seed(42)
X_inliers = np.random.randn(100, 2)
X_outliers = np.random.uniform(low=-4, high=4, size=(20, 2))
X = np.vstack([X_inliers, X_outliers])
# Isolation Forest
clf = IsolationForest(contamination=0.1, random_state=42)
y_pred = clf.fit_predict(X)
# 1表示正常,-1表示异常
outliers = X[y_pred == -1]
# Local Outlier Factor
clf = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
y_pred = clf.fit_predict(X)
# Elliptic Envelope(假设数据服从高斯分布)
clf = EllipticEnvelope(contamination=0.1)
y_pred = clf.fit_predict(X)
# 基于统计方法
from scipy import stats
z_scores = np.abs(stats.zscore(X))
outliers = (z_scores > 3).any(axis=1)
2. 特征工程¶
2.1 多项式特征¶
Python
from sklearn.preprocessing import PolynomialFeatures
X = np.array([[0, 1],
[2, 3],
[4, 5]])
# 生成多项式特征
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)
print(X_poly)
# [x1, x2, x1^2, x1*x2, x2^2]
# 查看特征名称
print(poly.get_feature_names_out())
# 交互特征(只保留交叉项)
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X)
2.2 特征选择¶
Python
from sklearn.feature_selection import (
SelectKBest, f_classif, f_regression,
RFE, SelectFromModel, VarianceThreshold
)
# 方差阈值(删除低方差特征)
selector = VarianceThreshold(threshold=0.1)
X_selected = selector.fit_transform(X)
# 单变量特征选择
# 分类问题
selector = SelectKBest(score_func=f_classif, k=10)
X_selected = selector.fit_transform(X, y)
print(selector.scores_) # 每个特征的F值
print(selector.pvalues_) # p值
# 回归问题
selector = SelectKBest(score_func=f_regression, k=10)
X_selected = selector.fit_transform(X, y)
# 递归特征消除 (RFE)
from sklearn.linear_model import LogisticRegression
estimator = LogisticRegression()
selector = RFE(estimator, n_features_to_select=5, step=1)
X_selected = selector.fit_transform(X, y)
print(selector.support_) # 被选中的特征
print(selector.ranking_) # 特征排名
# 基于模型的特征选择
from sklearn.ensemble import RandomForestClassifier
estimator = RandomForestClassifier()
selector = SelectFromModel(estimator, max_features=5)
X_selected = selector.fit_transform(X, y)
print(selector.get_support())
2.3 降维¶
Python
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# PCA (主成分分析)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total variance explained: {sum(pca.explained_variance_ratio_):.2%}")
print(f"Components shape: {pca.components_.shape}")
# 保留95%的方差
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X)
# LDA (线性判别分析,需要标签)
lda = LinearDiscriminantAnalysis(n_components=2)
X_lda = lda.fit_transform(X, y)
# TruncatedSVD (适用于稀疏矩阵)
svd = TruncatedSVD(n_components=50)
X_svd = svd.fit_transform(X)
3. 监督学习算法¶
3.1 线性模型¶
Python
from sklearn.linear_model import (
LinearRegression, Ridge, Lasso, ElasticNet,
LogisticRegression, SGDClassifier, SGDRegressor
)
from sklearn.datasets import make_regression, make_classification
# 生成回归数据
X, y = make_regression(n_samples=100, n_features=2, noise=10, random_state=42)
# 线性回归
model = LinearRegression()
model.fit(X, y)
print(f"Coefficients: {model.coef_}")
print(f"Intercept: {model.intercept_}")
print(f"R² score: {model.score(X, y)}")
# 预测
y_pred = model.predict(X)
# Ridge回归 (L2正则化)
model = Ridge(alpha=1.0)
model.fit(X, y)
# Lasso回归 (L1正则化,产生稀疏解)
model = Lasso(alpha=0.1)
model.fit(X, y)
# ElasticNet (L1 + L2)
model = ElasticNet(alpha=0.1, l1_ratio=0.5)
model.fit(X, y)
# 逻辑回归(分类)
X, y = make_classification(n_samples=1000, n_features=20, random_state=42)
model = LogisticRegression(
penalty='l2', # 正则化类型: 'l1', 'l2', 'elasticnet', None (字符串'none'已在sklearn 1.4中移除)
C=1.0, # 正则化强度的倒数
solver='lbfgs', # 优化算法
max_iter=1000,
multi_class='auto' # 'ovr'或'multinomial'
)
model.fit(X, y)
print(f"Classes: {model.classes_}")
print(f"Coefficients shape: {model.coef_.shape}")
# 预测概率
y_proba = model.predict_proba(X)
print(y_proba[:5])
# SGD优化器(大数据集)
model = SGDClassifier(loss='log_loss', max_iter=1000, tol=1e-3)
model.fit(X, y)
3.2 树模型¶
Python
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_text, plot_tree
from sklearn.ensemble import (
RandomForestClassifier, RandomForestRegressor,
GradientBoostingClassifier, GradientBoostingRegressor,
AdaBoostClassifier, AdaBoostRegressor,
ExtraTreesClassifier, ExtraTreesRegressor
)
# 决策树
model = DecisionTreeClassifier(
criterion='gini', # 'gini'或'entropy'
max_depth=5, # 最大深度
min_samples_split=2, # 分裂所需最小样本数
min_samples_leaf=1, # 叶子节点最小样本数
max_features='sqrt', # 每次分裂考虑的特征数
random_state=42
)
model.fit(X, y)
# 特征重要性
print(f"Feature importances: {model.feature_importances_}")
# 可视化树结构
print(export_text(model, feature_names=[f'f{i}' for i in range(X.shape[1])]))
# 随机森林
model = RandomForestClassifier(
n_estimators=100, # 树的数量
max_depth=10,
min_samples_split=5,
min_samples_leaf=2,
max_features='sqrt',
bootstrap=True, # 是否使用自助采样
oob_score=True, # 是否使用袋外样本评估
n_jobs=-1, # 使用所有CPU核心
random_state=42
)
model.fit(X, y)
print(f"OOB score: {model.oob_score_}")
# 梯度提升
model = GradientBoostingClassifier(
n_estimators=100,
learning_rate=0.1, # 学习率(收缩率)
max_depth=3,
subsample=0.8, # 每次迭代使用的样本比例
random_state=42
)
model.fit(X, y)
# 查看训练过程中每次迭代的得分(GradientBoosting没有staged_score方法,用staged_predict计算)
from sklearn.metrics import accuracy_score
for i, y_stage in enumerate(model.staged_predict(X)):
if i % 10 == 0:
print(f"Iteration {i}: {accuracy_score(y, y_stage):.4f}")
# AdaBoost
model = AdaBoostClassifier(
estimator=DecisionTreeClassifier(max_depth=1),
n_estimators=100,
learning_rate=1.0,
random_state=42
)
model.fit(X, y)
# Extra Trees(极端随机树)
model = ExtraTreesClassifier(
n_estimators=100,
max_depth=None,
min_samples_split=2,
random_state=42
)
model.fit(X, y)
3.3 支持向量机¶
Python
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR
# SVM分类
model = SVC(
C=1.0, # 正则化参数
kernel='rbf', # 核函数: 'linear', 'poly', 'rbf', 'sigmoid'
gamma='scale', # 核系数
degree=3, # 多项式核的次数
probability=True, # 启用概率估计
class_weight='balanced' # 处理类别不平衡
)
model.fit(X, y)
# 预测
y_pred = model.predict(X)
y_proba = model.predict_proba(X)
# 线性SVM(大数据集更快)
model = LinearSVC(C=1.0, max_iter=10000)
model.fit(X, y)
# SVM回归
model = SVR(
kernel='rbf',
C=1.0,
epsilon=0.1 # 不敏感区域的宽度
)
model.fit(X, y)
3.4 K近邻¶
Python
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
# KNN分类
model = KNeighborsClassifier(
n_neighbors=5, # K值
weights='uniform', # 权重: 'uniform'或'distance'
algorithm='auto', # 计算最近邻的算法
metric='minkowski', # 距离度量
p=2 # 闵可夫斯基距离的幂参数
)
model.fit(X, y)
# 预测
y_pred = model.predict(X)
# 获取邻居
neighbors = model.kneighbors(X[:5], n_neighbors=3, return_distance=True)
print(neighbors)
# KNN回归
model = KNeighborsRegressor(n_neighbors=5)
model.fit(X, y)
3.5 朴素贝叶斯¶
Python
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB
# 高斯朴素贝叶斯(连续特征)
model = GaussianNB()
model.fit(X, y)
# 预测概率
y_proba = model.predict_proba(X)
# 查看类别先验和特征均值/方差
print(f"Class priors: {model.class_prior_}")
print(f"Class means: {model.theta_}")
print(f"Class variances: {model.var_}")
# 多项式朴素贝叶斯(离散特征,如文本)
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['This is the first document.',
'This document is the second document.',
'And this is the third one.']
vectorizer = CountVectorizer()
X_text = vectorizer.fit_transform(corpus)
y_text = [0, 0, 1]
model = MultinomialNB()
model.fit(X_text, y_text)
4. 无监督学习¶
4.1 K-Means聚类¶
Python
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score
# 生成数据
from sklearn.datasets import make_blobs
X, _ = make_blobs(n_samples=300, centers=4, cluster_std=0.60, random_state=0)
# K-Means
kmeans = KMeans(
n_clusters=4,
init='k-means++', # 初始化方法
n_init=10, # 运行次数,选最好的结果
max_iter=300,
random_state=42
)
kmeans.fit(X)
# 预测聚类
labels = kmeans.labels_
print(f"Cluster centers: {kmeans.cluster_centers_}")
print(f"Inertia: {kmeans.inertia_}") # 簇内平方和
# 预测新数据
new_data = np.array([[0, 0], [4, 4]])
predictions = kmeans.predict(new_data)
# 选择最佳K值
inertias = []
silhouettes = []
K_range = range(2, 10)
for k in K_range:
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X)
inertias.append(kmeans.inertia_)
silhouettes.append(silhouette_score(X, kmeans.labels_))
# MiniBatchKMeans(大数据集)
kmeans = MiniBatchKMeans(n_clusters=4, batch_size=100, random_state=42)
kmeans.fit(X)
4.2 层次聚类¶
Python
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage
# 凝聚聚类
clustering = AgglomerativeClustering(
n_clusters=4,
linkage='ward' # 'ward', 'complete', 'average', 'single'
)
labels = clustering.fit_predict(X)
# 绘制树状图
linked = linkage(X, 'ward')
# dendrogram(linked)
4.3 DBSCAN¶
Python
from sklearn.cluster import DBSCAN
# DBSCAN(基于密度的聚类)
clustering = DBSCAN(
eps=0.5, # 邻域半径
min_samples=5, # 核心点所需最小样本数
metric='euclidean'
)
labels = clustering.fit_predict(X)
# -1表示噪声点
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = list(labels).count(-1)
print(f"Estimated number of clusters: {n_clusters}")
print(f"Estimated number of noise points: {n_noise}")
5. 模型评估与选择¶
5.1 交叉验证¶
Python
from sklearn.model_selection import (
cross_val_score, cross_validate, cross_val_predict,
KFold, StratifiedKFold, LeaveOneOut, TimeSeriesSplit
)
# 简单交叉验证
scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
print(f"Scores: {scores}")
print(f"Mean: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")
# 多指标交叉验证
scoring = ['accuracy', 'precision', 'recall', 'f1']
scores = cross_validate(model, X, y, cv=5, scoring=scoring)
for metric, values in scores.items():
if metric.startswith('test_'):
print(f"{metric}: {values.mean():.3f} (+/- {values.std():.3f})")
# 分层K折(保持类别比例)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(model, X, y, cv=cv)
# 时间序列分割
cv = TimeSeriesSplit(n_splits=5)
scores = cross_val_score(model, X, y, cv=cv)
# 留一法(小数据集)
cv = LeaveOneOut()
scores = cross_val_score(model, X, y, cv=cv)
5.2 超参数调优¶
Python
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import randint, uniform
# 定义参数网格
param_grid = {
'n_estimators': [50, 100, 200],
'max_depth': [3, 5, 7, None],
'min_samples_split': [2, 5, 10],
'min_samples_leaf': [1, 2, 4]
}
# 网格搜索
grid_search = GridSearchCV(
RandomForestClassifier(random_state=42),
param_grid,
cv=5,
scoring='accuracy',
n_jobs=-1,
verbose=1
)
grid_search.fit(X, y)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.4f}")
# 最佳模型
best_model = grid_search.best_estimator_
# 查看所有结果
results = pd.DataFrame(grid_search.cv_results_)
print(results[['params', 'mean_test_score', 'std_test_score']].head())
# 随机搜索(参数空间大时更高效)
param_distributions = {
'n_estimators': randint(50, 500),
'max_depth': [3, 5, 7, 10, None],
'min_samples_split': randint(2, 20),
'min_samples_leaf': randint(1, 10)
}
random_search = RandomizedSearchCV(
RandomForestClassifier(random_state=42),
param_distributions,
n_iter=100, # 随机采样次数
cv=5,
scoring='accuracy',
n_jobs=-1,
random_state=42
)
random_search.fit(X, y)
5.3 评估指标¶
Python
from sklearn.metrics import (
accuracy_score, precision_score, recall_score, f1_score,
confusion_matrix, classification_report,
mean_squared_error, mean_absolute_error, r2_score,
roc_auc_score, roc_curve, precision_recall_curve,
log_loss, matthews_corrcoef, cohen_kappa_score
)
# 分类指标
y_true = [0, 1, 0, 0, 1, 1, 0, 1]
y_pred = [0, 1, 0, 1, 1, 0, 0, 1]
y_proba = [0.1, 0.9, 0.2, 0.7, 0.8, 0.3, 0.1, 0.95]
print(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}")
print(f"Precision: {precision_score(y_true, y_pred):.4f}")
print(f"Recall: {recall_score(y_true, y_pred):.4f}")
print(f"F1: {f1_score(y_true, y_pred):.4f}")
# 多分类
print(classification_report(y_true, y_pred, target_names=['class_0', 'class_1']))
# 混淆矩阵
cm = confusion_matrix(y_true, y_pred)
print(cm)
# ROC-AUC
print(f"ROC-AUC: {roc_auc_score(y_true, y_proba):.4f}")
# 回归指标
y_true_reg = [3, -0.5, 2, 7]
y_pred_reg = [2.5, 0.0, 2, 8]
print(f"MSE: {mean_squared_error(y_true_reg, y_pred_reg):.4f}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_true_reg, y_pred_reg)):.4f}")
print(f"MAE: {mean_absolute_error(y_true_reg, y_pred_reg):.4f}")
print(f"R²: {r2_score(y_true_reg, y_pred_reg):.4f}")
6. Pipeline与模型组合¶
6.1 Pipeline构建¶
Python
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# 简单Pipeline
pipeline = Pipeline([
('scaler', StandardScaler()),
('classifier', LogisticRegression())
])
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
# 查看Pipeline中的步骤
print(pipeline.named_steps)
print(pipeline.named_steps['scaler'].mean_)
# 使用make_pipeline(自动命名)
from sklearn.pipeline import make_pipeline
pipeline = make_pipeline(StandardScaler(), LogisticRegression())
# 访问步骤
scaler = pipeline[0]
classifier = pipeline[1]
# 使用set_params修改参数
pipeline.set_params(classifier__C=10.0)
6.2 ColumnTransformer(不同特征不同处理)¶
Python
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# 假设有数值特征和类别特征
numeric_features = ['age', 'income']
categorical_features = ['gender', 'city']
# 定义预处理器
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='median')),
('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
transformers=[
('num', numeric_transformer, numeric_features),
('cat', categorical_transformer, categorical_features)
]
)
# 完整Pipeline
clf = Pipeline(steps=[
('preprocessor', preprocessor),
('classifier', RandomForestClassifier())
])
clf.fit(X_train, y_train)
6.3 模型集成¶
Python
from sklearn.ensemble import VotingClassifier, VotingRegressor, StackingClassifier, StackingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
# 投票分类器(硬投票)
voting_clf = VotingClassifier(
estimators=[
('lr', LogisticRegression()),
('rf', RandomForestClassifier()),
('svc', SVC())
],
voting='hard' # 'hard'或'soft'
)
voting_clf.fit(X_train, y_train)
# 软投票(需要基分类器支持predict_proba)
voting_clf = VotingClassifier(
estimators=[
('lr', LogisticRegression()),
('rf', RandomForestClassifier()),
('nb', GaussianNB())
],
voting='soft',
weights=[2, 1, 1] # 权重
)
# Stacking(堆叠)
stacking_clf = StackingClassifier(
estimators=[
('lr', LogisticRegression()),
('rf', RandomForestClassifier()),
('svc', SVC(probability=True))
],
final_estimator=LogisticRegression(),
cv=5,
stack_method='predict_proba' # 'auto', 'predict_proba', 'decision_function', 'predict'
)
stacking_clf.fit(X_train, y_train)
7. 模型解释性¶
7.1 特征重要性¶
Python
# 树模型的特征重要性
model = RandomForestClassifier()
model.fit(X, y)
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]
for i in range(10):
print(f"{i+1}. Feature {indices[i]}: {importances[indices[i]]:.4f}")
# 置换重要性(适用于任何模型)
from sklearn.inspection import permutation_importance
result = permutation_importance(model, X_test, y_test, n_repeats=10, random_state=42)
for i in result.importances_mean.argsort()[::-1]:
print(f"Feature {i}: {result.importances_mean[i]:.4f} +/- {result.importances_std[i]:.4f}")
7.2 SHAP值¶
Python
import shap
# 创建解释器
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_test)
# 汇总图
shap.summary_plot(shap_values, X_test)
# 力图(单个预测)
shap.force_plot(explainer.expected_value[1], shap_values[1][0], X_test[0])
# 依赖图
shap.dependence_plot(0, shap_values[1], X_test)
7.3 部分依赖图¶
Python
from sklearn.inspection import partial_dependence, PartialDependenceDisplay
# 计算部分依赖
features = [0, 1, (0, 1)]
partial_dependence(model, X_train, features=features)
# 绘制
PartialDependenceDisplay.from_estimator(model, X_train, features=[0, 1])
📝 练习¶
练习1: 完整的数据预处理流程¶
练习2: 模型比较与选择¶
练习3: 端到端项目¶
🎯 自我检查¶
- 掌握各种数据预处理方法
- 理解特征工程的重要性
- 熟悉常用的机器学习算法
- 掌握模型评估和选择方法
- 会使用Pipeline构建完整流程
- 能解释模型结果
📚 延伸阅读¶
下一步: 03 - Hugging Face