01 - From Calling Libraries to Understanding the Principles¶
Goal: not just calling APIs, but genuinely understanding how AI/ML works
Time: 4-6 weeks
Core principle: understand the theory → implement it by hand → apply it flexibly
🎯 Where You Are Now¶
A Typical Scenario¶
Python
# You, today
from transformers import pipeline

classifier = pipeline("sentiment-analysis")
result = classifier("I love this product!")
print(result)  # [{'label': 'POSITIVE', 'score': 0.9998}]

# Problems:
# - You don't know what pipeline does internally
# - You don't know how the model was trained
# - You can't modify the model architecture
# - When something breaks, you can't locate the cause
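For reference, here is roughly what that pipeline call expands to: tokenize the text, run the model, softmax the logits, pick the best label. A minimal sketch, assuming the commonly used English sentiment checkpoint (the actual default model may differ):

Python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "distilbert-base-uncased-finetuned-sst-2-english"  # assumed checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# 1. Tokenization: text -> token IDs + attention mask
inputs = tokenizer("I love this product!", return_tensors="pt")
# 2. Forward pass: token IDs -> logits
with torch.no_grad():
    logits = model(**inputs).logits
# 3. Softmax: logits -> probabilities, then pick the highest-scoring label
probs = torch.softmax(logits, dim=-1)
pred_id = probs.argmax(dim=-1).item()
print(model.config.id2label[pred_id], probs[0, pred_id].item())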
Target State¶
You can explain everything that call hides: what the tokenizer produces, how the model computes its output, how it was trained, and how to modify or debug it.
📚 Learning Path¶
Stage 1: Deep Learning Fundamentals Review (1 week)¶
Stage 2: Core Algorithms (2 weeks)¶
Stage 3: Large Model Principles (2-3 weeks)¶
🧠 Core Concepts in Depth¶
1. Neural Network Basics¶
Python
import torch
import torch.nn as nn

# Understand what each layer does
class NeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(NeuralNetwork, self).__init__()
        # Fully connected layer: linear transformation y = xW + b
        self.layer1 = nn.Linear(input_size, hidden_size)
        # Activation function: introduces non-linearity
        self.relu = nn.ReLU()
        # Dropout: reduces overfitting
        self.dropout = nn.Dropout(0.2)
        # Output layer
        self.layer2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Forward pass
        out = self.layer1(x)
        out = self.relu(out)
        out = self.dropout(out)
        out = self.layer2(out)
        return out

# Understand backpropagation
model = NeuralNetwork(784, 256, 10)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop (num_epochs and the (images, labels) batches come from your own DataLoader)
num_epochs = 10
for epoch in range(num_epochs):
    # Forward pass
    outputs = model(images)
    loss = criterion(outputs, labels)
    # Backward pass
    optimizer.zero_grad()  # clear old gradients
    loss.backward()        # compute gradients
    optimizer.step()       # update parameters
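What does loss.backward() actually compute? A one-parameter example where the gradient can be checked by hand:

Python
import torch

w = torch.tensor([[2.0]], requires_grad=True)
x = torch.tensor([[3.0]])
y_true = torch.tensor([[10.0]])

y_pred = x @ w                            # forward: y = x * w = 6
loss = ((y_pred - y_true) ** 2).mean()    # MSE loss = (6 - 10)^2 = 16

loss.backward()                           # backward: dloss/dw = 2 * (y_pred - y_true) * x = -24
print(w.grad)                             # tensor([[-24.]])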
2. The Attention Mechanism¶
Python
import torch
import torch.nn as nn
import math
class SelfAttention(nn.Module):
    """
    The core idea of self-attention:
    1. Project the input into Query, Key, and Value
    2. Measure the similarity between Query and Key (dot product)
    3. Turn the similarities into attention weights with softmax
    4. Take the weighted sum of the Values using those weights
    """
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads
        # Linear projections for Q, K, V
        self.values = nn.Linear(embed_size, embed_size)
        self.keys = nn.Linear(embed_size, embed_size)
        self.queries = nn.Linear(embed_size, embed_size)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, values, keys, query, mask=None):
        N = query.shape[0]  # batch size
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]
        # Linear projections
        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(query)
        # Split into heads
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = queries.reshape(N, query_len, self.heads, self.head_dim)
        # Attention scores: Q @ K^T
        energy = torch.einsum("nqhd,nkhd->nhqk", [queries, keys])
        # Scale
        energy = energy / math.sqrt(self.head_dim)
        # Apply the mask (e.g. the causal mask for autoregressive decoding)
        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))
        # Softmax gives the attention weights
        attention = torch.softmax(energy, dim=3)
        # Weighted sum: attention @ V
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values])
        # Concatenate the heads
        out = out.reshape(N, query_len, self.embed_size)
        # Final linear projection
        out = self.fc_out(out)
        return out

# An intuitive reading of attention
"""
In a translation task:
    "I love you" → "我爱你"
When generating "爱":
    - Query: the representation of "爱"
    - Key:   the representations of "I", "love", "you"
    - Value: the representations of "I", "love", "you"
The attention weights might look like:
    - "I"    → 0.1
    - "love" → 0.8  (high weight, because it is the word "爱" corresponds to)
    - "you"  → 0.1
So "爱" mostly attends to "love".
"""
3. The Transformer Architecture¶
Python
class TransformerBlock(nn.Module):
    """
    The core building block of the Transformer
    """
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        # Multi-head self-attention
        self.attention = SelfAttention(embed_size, heads)
        # Layer normalization
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        # Feed-forward network
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion * embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion * embed_size, embed_size)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        # Self-attention + residual connection + layer norm
        attention = self.attention(value, key, query, mask)
        x = self.dropout(self.norm1(attention + query))
        # Feed-forward network + residual connection + layer norm
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out

# Why the Transformer wins
"""
1. Parallel computation: no sequential dependency like an RNN
2. Long-range dependencies: attention connects any two positions directly
3. Interpretability: the attention weights show where the model is looking
"""
🎯 From Calling Libraries to Understanding¶
A Side-by-Side Comparison¶
Python
# ========== The library-calling way ==========
from transformers import BertTokenizer, BertForSequenceClassification

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForSequenceClassification.from_pretrained('bert-base-chinese')
inputs = tokenizer("这是一个测试", return_tensors="pt")
outputs = model(**inputs)

# Problems:
# - You don't know what the tokenizer did
# - You don't know the model's internal structure
# - You can't modify the model

# ========== After understanding the principles ==========
# 1. Understand the tokenizer
"""
What the tokenizer does:
- text → tokens → integer IDs
- "这是一个测试" → ["这", "是", "一", "个", "测", "试"] → [ID, ID, ...] (IDs from the vocabulary)
- Special tokens are added: [CLS] 这 是 一 个 测 试 [SEP]
"""
# 2. Understand the model structure
"""
BertForSequenceClassification =
    BERT encoder (12 Transformer layers) +
    a classification head (the [CLS] output → linear layer → softmax)
"""

# 3. Be able to modify the model
class CustomBert(nn.Module):
    def __init__(self, bert_model, num_classes):
        super().__init__()  # call the parent nn.Module constructor
        self.bert = bert_model
        # Add custom layers
        self.custom_layer = nn.Linear(768, 256)
        self.classifier = nn.Linear(256, num_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask)
        pooled = outputs.pooler_output
        x = self.custom_layer(pooled)
        x = torch.relu(x)
        logits = self.classifier(x)
        return logits
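A minimal usage sketch for CustomBert (the number of classes and the input sentence are placeholders):

Python
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
bert = BertModel.from_pretrained('bert-base-chinese')
model = CustomBert(bert, num_classes=3)

enc = tokenizer("这是一个测试", return_tensors="pt")
logits = model(enc["input_ids"], enc["attention_mask"])
print(logits.shape)  # torch.Size([1, 3])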
📝 Exercises¶
Exercise 1: A Neural Network by Hand¶
Python
# Implement backpropagation by hand, without PyTorch's autograd
import numpy as np

class ManualNN:
    """A neural network implemented manually"""
    def __init__(self, layers):
        self.weights = []
        self.biases = []
        for i in range(len(layers) - 1):
            # He initialization (scale sqrt(2/fan_in), a good fit for ReLU)
            w = np.random.randn(layers[i], layers[i+1]) * np.sqrt(2 / layers[i])
            b = np.zeros((1, layers[i+1]))
            self.weights.append(w)
            self.biases.append(b)

    def relu(self, x):
        return np.maximum(0, x)

    def relu_derivative(self, x):
        return (x > 0).astype(float)

    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(self, X):
        """Forward pass"""
        self.activations = [X]
        self.z_values = []
        for i in range(len(self.weights)):
            # activations[-1] is the output of the previous layer
            z = np.dot(self.activations[-1], self.weights[i]) + self.biases[i]
            self.z_values.append(z)
            if i < len(self.weights) - 1:
                a = self.relu(z)
            else:
                a = self.softmax(z)
            self.activations.append(a)
        return self.activations[-1]

    def backward(self, X, y, learning_rate):
        """Backward pass (y must be one-hot encoded)"""
        m = X.shape[0]
        # Gradient at the output layer (softmax + cross-entropy)
        dz = self.activations[-1] - y
        for i in range(len(self.weights) - 1, -1, -1):
            # Gradients for this layer's parameters
            dw = np.dot(self.activations[i].T, dz) / m
            db = np.sum(dz, axis=0, keepdims=True) / m
            # Propagate the gradient to the previous layer BEFORE updating,
            # so the original weights are used
            if i > 0:
                da = np.dot(dz, self.weights[i].T)
                dz = da * self.relu_derivative(self.z_values[i-1])
            # Update parameters
            self.weights[i] -= learning_rate * dw
            self.biases[i] -= learning_rate * db
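A quick way to sanity-check ManualNN is to train it on a small random dataset and watch the cross-entropy loss fall. The data here is pure noise, so this only verifies the mechanics, not generalization:

Python
import numpy as np

np.random.seed(0)
X = np.random.randn(100, 4)
labels = np.random.randint(0, 3, size=100)
y = np.eye(3)[labels]                    # one-hot targets, as backward() expects

nn_model = ManualNN([4, 16, 3])
for epoch in range(200):
    probs = nn_model.forward(X)
    loss = -np.mean(np.sum(y * np.log(probs + 1e-9), axis=1))  # cross-entropy
    nn_model.backward(X, y, learning_rate=0.1)
    if epoch % 50 == 0:
        print(f"epoch {epoch}: loss {loss:.4f}")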
Exercise 2: Understanding a Pre-trained Model¶
Python
# Inspect BERT's structure
from transformers import BertModel
import torch

model = BertModel.from_pretrained('bert-base-chinese')

# Print the model structure
print(model)

# Count the parameters
total_params = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_params:,}")  # roughly 100M

# Look at every named parameter
for name, param in model.named_parameters():
    print(f"{name}: {param.shape}")

# Understand the pre-trained weights
"""
Pre-training stage:
1. MLM (Masked Language Model): predict the masked-out tokens
2. NSP (Next Sentence Prediction): predict whether two sentences are consecutive

Fine-tuning stage:
- Continue training on a specific task
- You can freeze the lower layers and train only the top ones
"""

# Freeze the parameters of the first 6 encoder layers
for param in model.encoder.layer[:6].parameters():
    param.requires_grad = False
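To make the MLM objective concrete, here is a small sketch that asks bert-base-chinese to fill in a [MASK] token (the input sentence is just an example):

Python
import torch
from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
mlm_model = BertForMaskedLM.from_pretrained('bert-base-chinese')

inputs = tokenizer("今天天气很[MASK]", return_tensors="pt")
with torch.no_grad():
    logits = mlm_model(**inputs).logits

# Find the [MASK] position and look at the model's top-5 guesses for it
mask_pos = (inputs["input_ids"][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
top_ids = logits[0, mask_pos].topk(5, dim=-1).indices[0]
print(tokenizer.convert_ids_to_tokens(top_ids.tolist()))  # likely weather-related characters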
Exercise 3: Model Deployment¶
Python
# Export the model to ONNX format
import torch
import torch.onnx
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-base-chinese')
model.eval()

dummy_input = torch.randint(0, 1000, (1, 128))
torch.onnx.export(
    model,
    dummy_input,
    "bert_model.onnx",
    input_names=['input_ids'],
    output_names=['output'],
    dynamic_axes={'input_ids': {0: 'batch_size', 1: 'sequence'}}
)

# The exported ONNX model can be accelerated further, e.g. with TensorRT
# import tensorrt as trt

# Use quantization to shrink the model:
# dynamic quantization of the Linear layers to int8
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)
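As a sanity check, you can load the exported file with onnxruntime (assumed to be installed) and compare its output against the PyTorch model from the block above; this sketch assumes the export succeeded as written:

Python
import numpy as np
import torch
import onnxruntime as ort

session = ort.InferenceSession("bert_model.onnx")
input_ids = np.random.randint(0, 1000, (1, 128)).astype(np.int64)

# Run the ONNX model and the original PyTorch model on the same input
onnx_logits = session.run(None, {"input_ids": input_ids})[0]
with torch.no_grad():
    torch_logits = model(torch.from_numpy(input_ids)).logits.numpy()

print(np.abs(onnx_logits - torch_logits).max())  # should be close to 0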
✅ Checkpoints¶
After Stage 1:¶
- Understand forward and backward propagation in a neural network
- Can hand-write a simple neural network
- Understand what the common activation functions do
After Stage 2:¶
- Understand how the attention mechanism works
- Can hand-write multi-head attention
- Understand the complete Transformer architecture
After Stage 3:¶
- Understand the differences between BERT and GPT
- Can fine-tune a model
- Can deploy a model to production
📚 Recommended Resources¶
Courses¶
- CS231n (Stanford deep learning course)
- Hung-yi Lee's machine learning course (李宏毅机器学习)
- Attention Is All You Need (the original Transformer paper)
Books¶
- Deep Learning (Goodfellow et al.)
- Dive into Deep Learning (《动手学深度学习》)
- 《自然语言处理入门》 (Introduction to Natural Language Processing)
Practice¶
- The Hugging Face tutorials
- The official PyTorch tutorials
- Your own 扩散模型学习/资料 notes
🚀 Next Step¶
Move on to 02-大模型应用开发.md to learn large-model application development!
Remember: the only thing between an API caller and an expert is a hand-written implementation! 🧠