03 - Hugging Face¶
Study time: 8-10 hours | Importance: ⭐⭐⭐⭐⭐ | The standard toolkit for NLP and Transformer models
🎯 Learning Objectives¶
- Understand the principles of the Transformer architecture
- Master how to use tokenizers
- Learn to use pretrained models
- Master model fine-tuning
- Understand how to implement different NLP tasks
- Learn model optimization and deployment
📚 Content Overview¶
1. Transformer Architecture Fundamentals¶
1.1 Self-Attention Mechanism¶
Python
import torch
import torch.nn as nn
import math

class SelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()  # super() calls the parent class initializer, as usual in inheritance
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"  # assert raises AssertionError if the condition is false

        self.qkv = nn.Linear(embed_dim, embed_dim * 3)
        self.proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x, mask=None):
        batch_size, seq_len, embed_dim = x.shape

        # Project to Q, K, V
        qkv = self.qkv(x).reshape(batch_size, seq_len, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, batch, heads, seq, head_dim)
        q, k, v = qkv[0], qkv[1], qkv[2]

        # Attention scores
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Apply the mask (used for causal attention or padding)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        # Softmax
        attn_weights = torch.softmax(scores, dim=-1)

        # Weighted sum of values
        attn_output = torch.matmul(attn_weights, v)

        # Merge the heads
        attn_output = attn_output.transpose(1, 2).reshape(batch_size, seq_len, embed_dim)
        return self.proj(attn_output)

# Quick test
x = torch.randn(2, 10, 512)  # (batch, seq_len, embed_dim)
attn = SelfAttention(embed_dim=512, num_heads=8)
output = attn(x)
print(output.shape)  # (2, 10, 512)
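The `mask` argument makes this module usable for causal (decoder-style) attention as well. A minimal sketch, reusing the `attn` module and input `x` from the test above: a lower-triangular matrix marks which positions each token may attend to.

Python
# Causal (autoregressive) mask: each position sees only itself and earlier positions.
# The (seq, seq) matrix broadcasts over the batch and head dimensions.
seq_len = 10
causal_mask = torch.tril(torch.ones(seq_len, seq_len))  # 1 = visible, 0 = masked
masked_output = attn(x, mask=causal_mask)
print(masked_output.shape)  # (2, 10, 512)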
1.2 Transformer Encoder¶
Python
class TransformerEncoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.self_attn = SelfAttention(embed_dim, num_heads)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim, embed_dim)
        )
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Self-attention sub-layer
        attn_output = self.self_attn(x, mask)
        x = self.norm1(x + self.dropout(attn_output))

        # Feed-forward sub-layer
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))
        return x

class TransformerEncoder(nn.Module):
    def __init__(self, num_layers, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(embed_dim, num_heads, ff_dim, dropout)
            for _ in range(num_layers)
        ])

    def forward(self, x, mask=None):
        for layer in self.layers:
            x = layer(x, mask)
        return x
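A quick usage sketch of the stacked encoder (the hyperparameters are illustrative, matching the scale of the earlier example):

Python
# Build a small encoder stack and run a random batch through it
encoder = TransformerEncoder(num_layers=6, embed_dim=512, num_heads=8, ff_dim=2048)
x = torch.randn(2, 10, 512)  # (batch, seq_len, embed_dim)
encoded = encoder(x)
print(encoded.shape)  # (2, 10, 512) -- every layer preserves the shape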
1.3 Transformer Variants¶
Python
# BERT (bidirectional encoder)
# - Uses the Transformer encoder
# - Bidirectional attention (each token can see the whole sequence)
# - Pretraining tasks: MLM (Masked Language Modeling) + NSP (Next Sentence Prediction)

# GPT (generative pretraining)
# - Uses the Transformer decoder
# - Causal attention (each token can only see the tokens before it)
# - Pretraining task: autoregressive language modeling

# T5 (Text-to-Text Transfer Transformer)
# - Encoder-decoder architecture
# - Every task is cast into a text-to-text format

# Load the different models from Hugging Face
from transformers import (
BertModel, GPT2Model, T5Model,
BertTokenizer, GPT2Tokenizer, T5Tokenizer
)
# BERT
bert = BertModel.from_pretrained('bert-base-uncased')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# GPT-2
gpt2 = GPT2Model.from_pretrained('gpt2')
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# T5
t5 = T5Model.from_pretrained('t5-small')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')
2. Tokenizers in Depth¶
2.1 Tokenization Algorithms¶
Python
# BPE (Byte Pair Encoding)
# WordPiece
# SentencePiece
# Unigram
from transformers import (
BertTokenizer, # WordPiece
GPT2Tokenizer, # BPE
T5Tokenizer, # SentencePiece
XLNetTokenizer, # SentencePiece
AutoTokenizer
)
# Automatically selects the matching tokenizer for a checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
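To see how the algorithms differ in practice, the same word can be tokenized with a WordPiece tokenizer (BERT) and a BPE tokenizer (GPT-2); the exact splits shown in the comments depend on the vocabulary and are only indicative.

Python
from transformers import AutoTokenizer

wp_tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')   # WordPiece
bpe_tokenizer = AutoTokenizer.from_pretrained('gpt2')               # BPE

word = "tokenization"
print(wp_tokenizer.tokenize(word))   # e.g. ['token', '##ization'] -- '##' marks a continuation piece
print(bpe_tokenizer.tokenize(word))  # e.g. ['token', 'ization']   -- BPE merges, no '##' prefix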
2.2 Using a Tokenizer¶
Python
from transformers import AutoTokenizer

# Load a tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

# Basic usage
text = "你好,世界!"
tokens = tokenizer.tokenize(text)
print(tokens)  # ['你', '好', ',', '世', '界', '!']

# Encode (convert to IDs)
encoded = tokenizer.encode(text)
print(encoded)  # [101, 872, 1962, 8024, 686, 4518, 511, 102]

# Full encoding (returns a dict with more information)
encoded = tokenizer(
    text,
    padding=True,        # pad
    truncation=True,     # truncate
    max_length=512,      # maximum length
    return_tensors='pt'  # return PyTorch tensors
)
print(encoded)
# {'input_ids': tensor([[...]]), 'attention_mask': tensor([[...]])}

# Batch encoding
texts = ["第一句话", "第二句话", "第三句话"]
encoded = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt'
)
print(encoded['input_ids'].shape)  # (3, seq_len)

# Decode
ids = [101, 872, 1962, 102]
decoded = tokenizer.decode(ids)
print(decoded)  # [CLS] 你好 [SEP]

# Skip special tokens
decoded = tokenizer.decode(ids, skip_special_tokens=True)
print(decoded)  # 你好
2.3 Special Tokens¶
Python
# Inspect the special tokens
print(tokenizer.special_tokens_map)
# {
#     'cls_token': '[CLS]',
#     'sep_token': '[SEP]',
#     'pad_token': '[PAD]',
#     'unk_token': '[UNK]',
#     'mask_token': '[MASK]'
# }

# Get the IDs of the special tokens
cls_token_id = tokenizer.cls_token_id
sep_token_id = tokenizer.sep_token_id
pad_token_id = tokenizer.pad_token_id

# Add new tokens
new_tokens = ['[NEW_TOKEN_1]', '[NEW_TOKEN_2]']
tokenizer.add_tokens(new_tokens)

# Add special tokens
special_tokens = {'cls_token': '[MY_CLS]', 'sep_token': '[MY_SEP]'}
tokenizer.add_special_tokens(special_tokens)

# Resize the model's embedding layer after adding tokens (model loaded elsewhere)
model.resize_token_embeddings(len(tokenizer))
2.4 Handling Sentence Pairs¶
Python
# Encode two sentences together (used for NSP and similar tasks)
sentence1 = "今天天气很好"
sentence2 = "适合出去散步"

encoded = tokenizer(
    sentence1,
    sentence2,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt'
)

# The output includes token type ids (distinguishing the two sentences)
print(encoded['input_ids'])
print(encoded['token_type_ids'])  # 0 marks the first sentence, 1 the second

# Building the pair manually as one string (note: this does NOT produce the proper token_type_ids)
encoded = tokenizer(
    f"{sentence1} [SEP] {sentence2}",
    padding=True,
    truncation=True,
    return_tensors='pt'
)
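Sentence pairs can also be encoded in batches by passing two parallel lists; each pair is packed as [CLS] s1 [SEP] s2 [SEP] with matching token_type_ids. A minimal sketch, reusing the bert-base-chinese tokenizer from above:

Python
first = ["今天天气很好", "这家餐厅不错"]
second = ["适合出去散步", "值得再来一次"]

batch = tokenizer(first, second, padding=True, truncation=True, return_tensors='pt')
print(batch['input_ids'].shape)    # (2, seq_len)
print(batch['token_type_ids'][0])  # 0s for the first sentence, 1s for the second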
3. Using Models¶
3.1 The AutoModel Family¶
Python
from transformers import (
AutoModel, AutoModelForSequenceClassification,
AutoModelForTokenClassification, AutoModelForQuestionAnswering,
AutoModelForCausalLM, AutoModelForSeq2SeqLM
)
# Base model (outputs hidden states only)
model = AutoModel.from_pretrained('bert-base-chinese')

# For sequence classification (adds a classification head)
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-chinese',
    num_labels=2  # number of classes
)

# For token-level classification (e.g. NER)
model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-chinese',
    num_labels=9  # number of BIO labels
)

# For question answering
model = AutoModelForQuestionAnswering.from_pretrained('bert-base-chinese')

# For text generation (causal language model)
model = AutoModelForCausalLM.from_pretrained('gpt2')

# For sequence-to-sequence tasks (e.g. translation, summarization)
model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')
3.2 Model Inference¶
Python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-chinese')
# Note: the classification head is randomly initialized here, so the predictions
# are only meaningful after fine-tuning.

# Prepare the input
text = "这是一个很好的产品"
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

# Inference
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Logits
logits = outputs.logits
print(logits.shape)  # (batch_size, num_labels)

# Predictions
predictions = torch.argmax(logits, dim=-1)
print(predictions)

# Probabilities
probs = torch.softmax(logits, dim=-1)
print(probs)

# Hidden states
outputs = model(**inputs, output_hidden_states=True)
hidden_states = outputs.hidden_states  # 13 layers (embeddings + 12 transformer layers)
print(len(hidden_states))
print(hidden_states[-1].shape)  # (batch, seq_len, hidden_size)

# Attention weights
outputs = model(**inputs, output_attentions=True)
attentions = outputs.attentions  # attention weights for all 12 layers
print(len(attentions))
print(attentions[0].shape)  # (batch, num_heads, seq_len, seq_len)
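One common use of the hidden states is to build a fixed-size sentence embedding. A minimal sketch, reusing `hidden_states` and `inputs` from above: mean-pool the last layer over the non-padding tokens.

Python
last_hidden = hidden_states[-1]                        # (batch, seq_len, hidden_size)
mask = inputs['attention_mask'].unsqueeze(-1).float()  # (batch, seq_len, 1)

# Average only over real tokens, ignoring padding positions
sentence_embedding = (last_hidden * mask).sum(dim=1) / mask.sum(dim=1)
print(sentence_embedding.shape)  # (batch, hidden_size)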
3.3 Text Generation¶
Python
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load a generative model
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')

# Prepare the prompt
prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors='pt')

# Generate
outputs = model.generate(
    **inputs,
    max_length=100,           # maximum total length
    num_return_sequences=3,   # return 3 candidates
    temperature=0.7,          # temperature (controls randomness)
    top_k=50,                 # top-k sampling
    top_p=0.95,               # nucleus sampling
    do_sample=True,           # sample instead of greedy decoding
    repetition_penalty=1.2,   # repetition penalty
    pad_token_id=tokenizer.eos_token_id
)

# Decode
generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
for i, text in enumerate(generated_texts):
    print(f"Generated {i+1}:\n{text}\n")
4. The Complete Fine-Tuning Workflow¶
4.1 Data Preparation¶
Python
from datasets import load_dataset, Dataset
import pandas as pd

# Load a dataset
dataset = load_dataset('imdb')  # sentiment analysis dataset
print(dataset)
# DatasetDict({
#     train: Dataset({
#         features: ['text', 'label'],
#         num_rows: 25000
#     })
#     test: Dataset({
#         features: ['text', 'label'],
#         num_rows: 25000
#     })
# })

# Create a dataset from pandas
df = pd.DataFrame({
    'text': ['很好', '不好', '一般'],
    'label': [1, 0, 2]
})
dataset = Dataset.from_pandas(df)

# Load from a local file
dataset = load_dataset('csv', data_files='data.csv')

# Preprocessing
def preprocess_function(examples):
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=512
    )

encoded_dataset = dataset.map(preprocess_function, batched=True)
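If a dataset has no validation split, one can be carved out of the training data. A sketch, assuming `encoded_dataset` has a 'train' split as above:

Python
split = encoded_dataset['train'].train_test_split(test_size=0.1, seed=42)
train_ds, eval_ds = split['train'], split['test']
print(len(train_ds), len(eval_ds))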
4.2 Using the Trainer API¶
Python
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import numpy as np
from datasets import load_dataset
import evaluate

# Load the data
dataset = load_dataset('imdb')
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Preprocess
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True)

encoded_dataset = dataset.map(preprocess_function, batched=True)

# Data collator (dynamic padding)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2
)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    push_to_hub=False,
    logging_dir='./logs',
    logging_steps=10,
)

# Evaluation metric
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'].shuffle(seed=42).select(range(1000)),
    eval_dataset=encoded_dataset['test'].shuffle(seed=42).select(range(500)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

# Evaluate
trainer.evaluate()

# Save the model
trainer.save_model('./my_model')
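After training, the saved directory can be reloaded for inference, for example through a pipeline. A sketch, assuming the tokenizer files were saved alongside the model (Trainer does this when a tokenizer was passed in):

Python
from transformers import pipeline

clf = pipeline('text-classification', model='./my_model', tokenizer='./my_model')
print(clf("This movie was surprisingly good."))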
4.3 Custom Training Loop¶
Python
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

# Prepare the data (drop the raw-text column so the collator only receives tensor fields)
train_dataloader = DataLoader(
    encoded_dataset['train'].remove_columns(['text']),
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator
)

# Optimizer and learning-rate schedule
optimizer = AdamW(model.parameters(), lr=2e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch+1}')
    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': loss.item()})

    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1}, Average Loss: {avg_loss:.4f}')
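A matching evaluation loop can be sketched the same way, computing accuracy on the test split with the same collator (names reused from the training code above):

Python
eval_dataloader = DataLoader(
    encoded_dataset['test'].remove_columns(['text']),
    batch_size=16,
    collate_fn=data_collator
)

model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        preds = torch.argmax(outputs.logits, dim=-1)
        correct += (preds == batch['labels']).sum().item()
        total += batch['labels'].size(0)

print(f'Accuracy: {correct / total:.4f}')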
5. Different NLP Tasks¶
5.1 Text Classification¶
Python
from transformers import pipeline

# Quick implementation with a pipeline
classifier = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english'
)

result = classifier("I love this product!")
print(result)
# [{'label': 'POSITIVE', 'score': 0.9998}]

# Zero-shot classification (candidate labels are supplied at inference time)
classifier = pipeline(
    'zero-shot-classification',
    model='facebook/bart-large-mnli'
)

result = classifier(
    "I have a problem with my iphone that needs to be resolved asap!",
    candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"]
)
print(result)
5.2 Named Entity Recognition (NER)¶
Python
# Using a pipeline
ner_pipeline = pipeline(
    'ner',
    model='dslim/bert-base-NER',
    aggregation_strategy='simple'
)

text = "Apple is looking at buying U.K. startup for $1 billion"
entities = ner_pipeline(text)
for entity in entities:
    print(f"{entity['word']}: {entity['entity_group']} ({entity['score']:.4f})")

# Fine-tuning an NER model
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-cased',
    num_labels=9  # BIO labels: O, B-PER, I-PER, B-ORG, I-ORG, B-LOC, I-LOC, B-MISC, I-MISC
)
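Fine-tuning for NER also requires aligning word-level labels with the subword tokens the tokenizer produces. A minimal sketch with hypothetical label ids (a fast tokenizer is needed for `word_ids()`):

Python
from transformers import AutoTokenizer

words = ["Apple", "is", "buying", "a", "U.K.", "startup"]
labels = [3, 0, 0, 0, 5, 0]  # hypothetical ids, e.g. 3 = B-ORG, 5 = B-LOC

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
encoding = tokenizer(words, is_split_into_words=True, truncation=True)

aligned_labels = []
for word_id in encoding.word_ids():
    if word_id is None:
        aligned_labels.append(-100)         # special tokens: ignored by the loss
    else:
        aligned_labels.append(labels[word_id])

print(encoding.tokens())
print(aligned_labels)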
5.3 Question Answering¶
Python
# Extractive question answering
qa_pipeline = pipeline(
    'question-answering',
    model='distilbert-base-cased-distilled-squad'
)

context = """
The Transformers library provides state-of-the-art general-purpose architectures
for Natural Language Understanding (NLU) and Natural Language Generation (NLG).
"""
question = "What does the Transformers library provide?"

result = qa_pipeline(question=question, context=context)
print(result)
# e.g. {'score': 0.99, 'start': 34, 'end': 95, 'answer': 'state-of-the-art general-purpose architectures'}

# Generative question answering (using T5)
generator = pipeline('text2text-generation', model='google/t5-small-ssm-nq')
5.4 Text Summarization¶
Python
# Using a pipeline
summarizer = pipeline('summarization', model='facebook/bart-large-cnn')

text = """
Your long text here... (at least 100 words)
"""

summary = summarizer(
    text,
    max_length=130,
    min_length=30,
    do_sample=False
)
print(summary[0]['summary_text'])

# Using T5
t5_summarizer = pipeline(
    'text2text-generation',
    model='t5-small'
)

result = t5_summarizer(
    "summarize: " + text,
    max_length=100,
    num_return_sequences=1
)
5.5 Machine Translation¶
Python
# Using a pipeline
translator = pipeline(
    'translation_en_to_de',
    model='t5-small'
)

result = translator("Hello, how are you?")
print(result)

# Using MarianMT
from transformers import MarianMTModel, MarianTokenizer

model_name = 'Helsinki-NLP/opus-mt-en-de'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

text = "Hello, how are you?"
inputs = tokenizer(text, return_tensors='pt', padding=True)
translated = model.generate(**inputs)
result = tokenizer.decode(translated[0], skip_special_tokens=True)
print(result)
5.6 Text Generation¶
Python
# Generation with GPT-2
generator = pipeline('text-generation', model='gpt2')

prompt = "The future of AI is"
result = generator(
    prompt,
    max_length=100,
    num_return_sequences=3,
    do_sample=True,   # sampling is required when asking for multiple sequences
    temperature=0.8
)

for i, res in enumerate(result):
    print(f"Generation {i+1}:\n{res['generated_text']}\n")
6. Model Optimization and Deployment¶
6.1 Model Quantization¶
Python
from transformers import AutoModelForSequenceClassification
import torch

# Dynamic quantization
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')

quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},
    dtype=torch.qint8
)

# Save the quantized model
torch.save(quantized_model.state_dict(), 'quantized_model.pt')

# Optimization with Optimum
from optimum.onnxruntime import ORTModelForSequenceClassification

model = ORTModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    export=True
)
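A rough way to see the benefit is to compare checkpoint sizes. This sketch assumes it runs right after `quantize_dynamic`, while `model` still refers to the original FP32 model:

Python
import os

# Save both checkpoints and compare their file sizes on disk
torch.save(model.state_dict(), 'model_fp32.pt')
torch.save(quantized_model.state_dict(), 'model_int8.pt')
print(f"FP32: {os.path.getsize('model_fp32.pt') / 1e6:.1f} MB")
print(f"INT8: {os.path.getsize('model_int8.pt') / 1e6:.1f} MB")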
6.2 ONNX Export¶
Python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')

# Prepare an example input
text = "This is a sample text"
inputs = tokenizer(text, return_tensors='pt')

# Export to ONNX
torch.onnx.export(
    model,
    (inputs['input_ids'], inputs['attention_mask']),
    'model.onnx',
    input_names=['input_ids', 'attention_mask'],
    output_names=['output'],
    dynamic_axes={
        'input_ids': {0: 'batch_size', 1: 'sequence'},
        'attention_mask': {0: 'batch_size', 1: 'sequence'},
        'output': {0: 'batch_size'}
    },
    opset_version=11
)
6.3 Inference Optimization¶
Python
# Using TorchScript
from transformers import AutoModel

# torchscript=True makes the model return tuples, which jit.trace requires
model = AutoModel.from_pretrained('bert-base-uncased', torchscript=True)
model.eval()

# Trace the model
example_inputs = tokenizer("Example text", return_tensors='pt')
traced_model = torch.jit.trace(
    model,
    (example_inputs['input_ids'], example_inputs['attention_mask'])
)

# Save
traced_model.save('traced_model.pt')

# Using ONNX Runtime
import onnxruntime as ort

session = ort.InferenceSession('model.onnx')
inputs = {
    'input_ids': inputs['input_ids'].numpy(),
    'attention_mask': inputs['attention_mask'].numpy()
}
outputs = session.run(None, inputs)
6.4 Deploying as an API¶
Python
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline

app = FastAPI()

# Load the model
classifier = pipeline('sentiment-analysis')

class TextInput(BaseModel):
    text: str

class PredictionOutput(BaseModel):
    label: str
    score: float

@app.post('/predict', response_model=PredictionOutput)
def predict(input: TextInput):
    result = classifier(input.text)[0]
    return PredictionOutput(label=result['label'], score=result['score'])

@app.get('/health')
def health():
    return {'status': 'healthy'}

# Run with: uvicorn api:app --reload
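Once the server is running, it can be called from any HTTP client. A sketch using `requests`, assuming the default local address http://localhost:8000:

Python
import requests

response = requests.post(
    'http://localhost:8000/predict',
    json={'text': 'I love this product!'}
)
print(response.json())  # e.g. {'label': 'POSITIVE', 'score': 0.99...}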
📝 Exercises¶
Exercise 1: Tokenizer Practice¶
Exercise 2: Model Inference¶
Exercise 3: Model Fine-Tuning¶
Exercise 4: A Complete NLP Project¶
🎯 Self-Check¶
- Understand the Transformer architecture and the self-attention mechanism
- Know how to use tokenizers
- Can load and use pretrained models
- Understand the model fine-tuning workflow
- Can implement different NLP tasks
- Understand model optimization and deployment options
📚 Further Reading¶
Next up: 04 - Hands-On Projects