
03 - Hugging Face

Study time: 8-10 hours | Importance: ⭐⭐⭐⭐⭐ | The standard toolkit for NLP and Transformer models


🎯 Learning Goals

  • Understand the Transformer architecture
  • Master tokenizer usage
  • Learn to use pretrained models
  • Master model fine-tuning
  • Understand how common NLP tasks are implemented
  • Learn model optimization and deployment

📚 Contents

  1. Transformer Architecture
  2. Tokenizers in Depth
  3. Using Models
  4. The Complete Fine-Tuning Workflow
  5. Common NLP Tasks
  6. Optimization and Deployment

1. Transformer Architecture

1.1 Self-Attention

Python
import torch
import torch.nn as nn
import math

class SelfAttention(nn.Module):
    def __init__(self, embed_dim, num_heads):
        super().__init__()  # super() invokes the parent class initializer
        assert embed_dim % num_heads == 0, "embed_dim must be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads

        self.qkv = nn.Linear(embed_dim, embed_dim * 3)
        self.proj = nn.Linear(embed_dim, embed_dim)

    def forward(self, x, mask=None):
        batch_size, seq_len, embed_dim = x.shape

        # Project the input to Q, K, V with a single linear layer
        qkv = self.qkv(x).reshape(batch_size, seq_len, 3, self.num_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, batch, heads, seq, head_dim)
        q, k, v = qkv[0], qkv[1], qkv[2]

        # Scaled dot-product attention scores
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)

        # Apply mask (for causal attention or padding)
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))

        # Softmax
        attn_weights = torch.softmax(scores, dim=-1)

        # Attention-weighted sum of the values
        attn_output = torch.matmul(attn_weights, v)

        # Merge the heads back into embed_dim
        attn_output = attn_output.transpose(1, 2).reshape(batch_size, seq_len, embed_dim)

        return self.proj(attn_output)

# Quick test
x = torch.randn(2, 10, 512)  # (batch, seq_len, embed_dim)
attn = SelfAttention(embed_dim=512, num_heads=8)
output = attn(x)
print(output.shape)  # (2, 10, 512)
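
The mask argument is what turns this into causal (GPT-style) attention. A minimal sketch reusing attn and x from above; positions where the mask is 0 cannot be attended to:

Python
causal_mask = torch.tril(torch.ones(10, 10))  # (seq_len, seq_len) lower triangle of ones
output = attn(x, mask=causal_mask)            # broadcasts across batch and heads
print(output.shape)  # (2, 10, 512)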

1.2 Transformer Encoder

Python
class TransformerEncoderLayer(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.self_attn = SelfAttention(embed_dim, num_heads)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(ff_dim, embed_dim)
        )
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        # Self-attention sub-layer (residual + LayerNorm)
        attn_output = self.self_attn(x, mask)
        x = self.norm1(x + self.dropout(attn_output))

        # Feed-forward sub-layer (residual + LayerNorm)
        ff_output = self.feed_forward(x)
        x = self.norm2(x + self.dropout(ff_output))

        return x

class TransformerEncoder(nn.Module):
    def __init__(self, num_layers, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        self.layers = nn.ModuleList([
            TransformerEncoderLayer(embed_dim, num_heads, ff_dim, dropout)
            for _ in range(num_layers)
        ])

    def forward(self, x, mask=None):
        for layer in self.layers:
            x = layer(x, mask)
        return x
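
A quick shape check of the encoder stack. Note that a real model would add positional encodings to x first, since self-attention by itself is permutation-invariant:

Python
encoder = TransformerEncoder(num_layers=6, embed_dim=512, num_heads=8, ff_dim=2048)
x = torch.randn(2, 10, 512)
print(encoder(x).shape)  # (2, 10, 512)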

1.3 Transformer Variants

Python
# BERT (bidirectional encoder)
# - Uses the Transformer encoder
# - Bidirectional attention (every token sees the whole sequence)
# - Pretraining tasks: MLM (Masked Language Modeling) + NSP (Next Sentence Prediction)

# GPT (generative pretraining)
# - Uses the Transformer decoder
# - Causal attention (each token only sees earlier tokens)
# - Pretraining task: autoregressive language modeling

# T5 (Text-to-Text Transfer Transformer)
# - Encoder-decoder architecture
# - Casts every task as text-to-text

# Load the different models from Hugging Face
from transformers import (
    BertModel, GPT2Model, T5Model,
    BertTokenizer, GPT2Tokenizer, T5Tokenizer
)

# BERT
bert = BertModel.from_pretrained('bert-base-uncased')
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# GPT-2
gpt2 = GPT2Model.from_pretrained('gpt2')
gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# T5
t5 = T5Model.from_pretrained('t5-small')
t5_tokenizer = T5Tokenizer.from_pretrained('t5-small')

2. Tokenizers in Depth

2.1 Tokenization Algorithms

Python
# BPE (Byte Pair Encoding)
# WordPiece
# SentencePiece
# Unigram

from transformers import (
    BertTokenizer,      # WordPiece
    GPT2Tokenizer,      # BPE
    T5Tokenizer,        # SentencePiece
    XLNetTokenizer,     # SentencePiece
    AutoTokenizer
)

# AutoTokenizer picks the matching tokenizer class for the checkpoint
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
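
To see why the algorithm matters, compare how the same word splits under WordPiece and byte-level BPE (the exact outputs below are illustrative):

Python
bert_tok = AutoTokenizer.from_pretrained('bert-base-uncased')  # WordPiece
gpt2_tok = AutoTokenizer.from_pretrained('gpt2')               # byte-level BPE

print(bert_tok.tokenize('tokenization'))  # e.g. ['token', '##ization'] - '##' marks a continuation
print(gpt2_tok.tokenize('tokenization'))  # e.g. ['token', 'ization']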

2.2 Using a Tokenizer

Python
from transformers import AutoTokenizer

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')

# Basic usage
text = "你好,世界!"
tokens = tokenizer.tokenize(text)
print(tokens)  # ['你', '好', ',', '世', '界', '!']

# Encode (convert to IDs)
encoded = tokenizer.encode(text)
print(encoded)  # [101, 872, 1962, 8024, 686, 4518, 511, 102]

# Full encoding (returns a dict with extra fields)
encoded = tokenizer(
    text,
    padding=True,           # pad to the longest sequence
    truncation=True,        # truncate to max_length
    max_length=512,         # maximum length
    return_tensors='pt'     # return PyTorch tensors
)
print(encoded)
# {'input_ids': tensor([[...]]), 'attention_mask': tensor([[...]])}

# Batch encoding
texts = ["第一句话", "第二句话", "第三句话"]
encoded = tokenizer(
    texts,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt'
)
print(encoded['input_ids'].shape)  # (3, seq_len)

# Decoding
ids = [101, 872, 1962, 102]
decoded = tokenizer.decode(ids)
print(decoded)  # [CLS] 你好 [SEP]

# Skip the special tokens
decoded = tokenizer.decode(ids, skip_special_tokens=True)
print(decoded)  # 你好

2.3 Special Tokens

Python
# Inspect the special tokens
print(tokenizer.special_tokens_map)
# {
#     'cls_token': '[CLS]',
#     'sep_token': '[SEP]',
#     'pad_token': '[PAD]',
#     'unk_token': '[UNK]',
#     'mask_token': '[MASK]'
# }

# IDs of the special tokens
cls_token_id = tokenizer.cls_token_id
sep_token_id = tokenizer.sep_token_id
pad_token_id = tokenizer.pad_token_id

# Add new tokens
new_tokens = ['[NEW_TOKEN_1]', '[NEW_TOKEN_2]']
tokenizer.add_tokens(new_tokens)

# Add or override special tokens
special_tokens = {'cls_token': '[MY_CLS]', 'sep_token': '[MY_SEP]'}
tokenizer.add_special_tokens(special_tokens)

# Resize the model's embedding matrix (required after adding tokens)
model.resize_token_embeddings(len(tokenizer))

2.4 Sentence Pairs

Python
# Encode two sentences (for NSP-style tasks)
sentence1 = "今天天气很好"
sentence2 = "适合出去散步"

encoded = tokenizer(
    sentence1,
    sentence2,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors='pt'
)

# The output includes token_type_ids (to distinguish the two sentences)
print(encoded['input_ids'])
print(encoded['token_type_ids'])  # 0 = first sentence, 1 = second sentence

# Building the pair manually as one string also works, but then
# token_type_ids are all 0 - prefer passing two arguments as above
encoded = tokenizer(
    f"{sentence1} [SEP] {sentence2}",
    padding=True,
    truncation=True,
    return_tensors='pt'
)

3. Using Models

3.1 The AutoModel Family

Python
from transformers import (
    AutoModel, AutoModelForSequenceClassification,
    AutoModelForTokenClassification, AutoModelForQuestionAnswering,
    AutoModelForCausalLM, AutoModelForSeq2SeqLM
)

# Base model (outputs hidden states only)
model = AutoModel.from_pretrained('bert-base-chinese')

# Sequence classification (adds a classification head)
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-chinese',
    num_labels=2  # number of classes
)

# Token-level classification (e.g. NER)
model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-chinese',
    num_labels=9  # number of BIO labels
)

# Extractive question answering
model = AutoModelForQuestionAnswering.from_pretrained('bert-base-chinese')

# Text generation (causal language model)
model = AutoModelForCausalLM.from_pretrained('gpt2')

# Sequence-to-sequence tasks (e.g. translation, summarization)
model = AutoModelForSeq2SeqLM.from_pretrained('t5-small')

3.2 Model Inference

Python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load model and tokenizer (note: the classification head on top of
# 'bert-base-chinese' is randomly initialized until fine-tuned,
# so the predictions below are for illustration only)
tokenizer = AutoTokenizer.from_pretrained('bert-base-chinese')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-chinese')

# Prepare the input
text = "这是一个很好的产品"
inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

# Inference
model.eval()
with torch.no_grad():
    outputs = model(**inputs)

# Logits
logits = outputs.logits
print(logits.shape)  # (batch_size, num_labels)

# Predicted class
predictions = torch.argmax(logits, dim=-1)
print(predictions)

# Class probabilities
probs = torch.softmax(logits, dim=-1)
print(probs)

# Hidden states
outputs = model(**inputs, output_hidden_states=True)
hidden_states = outputs.hidden_states  # 13 tensors (embeddings + 12 transformer layers)
print(len(hidden_states))
print(hidden_states[-1].shape)  # (batch, seq_len, hidden_size)

# Attention weights
outputs = model(**inputs, output_attentions=True)
attentions = outputs.attentions  # attention weights for all 12 layers
print(len(attentions))
print(attentions[0].shape)  # (batch, num_heads, seq_len, seq_len)
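
A minimal sketch for inspecting one attention head as a heatmap, assuming matplotlib is installed and reusing inputs and attentions from above:

Python
import matplotlib.pyplot as plt

tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
head = attentions[0][0, 0].detach().numpy()  # layer 0, batch 0, head 0: (seq_len, seq_len)

plt.imshow(head, cmap='viridis')
plt.xticks(range(len(tokens)), tokens, rotation=90)
plt.yticks(range(len(tokens)), tokens)
plt.colorbar()
plt.title('Layer 0, head 0')
plt.show()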

3.3 Text Generation

Python
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load a generative model
tokenizer = AutoTokenizer.from_pretrained('gpt2')
model = AutoModelForCausalLM.from_pretrained('gpt2')

# Generate text
prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors='pt')

# Generate
outputs = model.generate(
    **inputs,
    max_length=100,           # maximum total length
    num_return_sequences=3,   # return 3 candidates
    temperature=0.7,          # temperature (controls randomness)
    top_k=50,                 # top-k sampling
    top_p=0.95,               # nucleus sampling
    do_sample=True,           # sample instead of greedy decoding
    repetition_penalty=1.2,   # penalize repetition
    pad_token_id=tokenizer.eos_token_id
)

# Decode
generated_texts = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
for i, text in enumerate(generated_texts):
    print(f"Generated {i+1}:\n{text}\n")

4. The Complete Fine-Tuning Workflow

4.1 Data Preparation

Python
from datasets import load_dataset, Dataset
import pandas as pd

# Load a dataset
dataset = load_dataset('imdb')  # sentiment analysis dataset
print(dataset)
# DatasetDict({
#     train: Dataset({
#         features: ['text', 'label'],
#         num_rows: 25000
#     })
#     test: Dataset({
#         features: ['text', 'label'],
#         num_rows: 25000
#     })
# })

# Create a dataset from pandas
df = pd.DataFrame({
    'text': ['很好', '不好', '一般'],
    'label': [1, 0, 2]
})
dataset = Dataset.from_pandas(df)

# Load from a local file
dataset = load_dataset('csv', data_files='data.csv')

# Preprocessing
def preprocess_function(examples):
    return tokenizer(
        examples['text'],
        truncation=True,
        padding='max_length',
        max_length=512
    )

encoded_dataset = dataset.map(preprocess_function, batched=True)
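
If a dataset ships without a test split (like the CSV example above), one can be carved out; a minimal sketch:

Python
split = dataset['train'].train_test_split(test_size=0.2, seed=42)
train_ds, test_ds = split['train'], split['test']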

4.2 The Trainer API

Python
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import numpy as np
from datasets import load_dataset
import evaluate

# Load the data
dataset = load_dataset('imdb')
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Preprocess
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True)

encoded_dataset = dataset.map(preprocess_function, batched=True)

# Data collator (dynamic padding)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2
)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    push_to_hub=False,
    logging_dir='./logs',
    logging_steps=10,
)

# Evaluation metric
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Build the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset['train'].shuffle(seed=42).select(range(1000)),
    eval_dataset=encoded_dataset['test'].shuffle(seed=42).select(range(500)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train
trainer.train()

# Evaluate
trainer.evaluate()

# Save the model
trainer.save_model('./my_model')
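
Reloading the fine-tuned model later is a one-liner; save_model also stores the tokenizer when one was passed to the Trainer:

Python
model = AutoModelForSequenceClassification.from_pretrained('./my_model')
tokenizer = AutoTokenizer.from_pretrained('./my_model')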

4.3 A Custom Training Loop

Python
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import get_linear_schedule_with_warmup
from tqdm import tqdm

# Prepare the data: drop the raw 'text' column (the collator cannot pad strings)
# and rename 'label' to 'labels', the argument name transformers models expect
train_data = encoded_dataset['train'].remove_columns(['text']).rename_column('label', 'labels')
train_data.set_format('torch')

train_dataloader = DataLoader(
    train_data,
    batch_size=16,
    shuffle=True,
    collate_fn=data_collator
)

# Optimizer and learning-rate schedule
optimizer = AdamW(model.parameters(), lr=2e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# Training loop
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

model.train()
for epoch in range(num_epochs):
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch+1}')

    for batch in progress_bar:
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': loss.item()})

    avg_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch+1}, Average Loss: {avg_loss:.4f}')
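
The loop above only trains; a minimal evaluation sketch, assuming an eval_dataloader built the same way as train_dataloader:

Python
model.eval()
correct = total = 0
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        preds = torch.argmax(model(**batch).logits, dim=-1)
        correct += (preds == batch['labels']).sum().item()
        total += batch['labels'].size(0)
print(f'Accuracy: {correct / total:.4f}')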

5. Common NLP Tasks

5.1 Text Classification

Python
from transformers import pipeline

# Quick implementation with pipeline
classifier = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english'
)

result = classifier("I love this product!")
print(result)
# [{'label': 'POSITIVE', 'score': 0.9998}]

# Zero-shot classification: score arbitrary candidate labels
# (candidate_labels is only accepted by the zero-shot pipeline)
classifier = pipeline(
    'zero-shot-classification',
    model='facebook/bart-large-mnli'
)

result = classifier(
    "I have a problem with my iphone that needs to be resolved asap!",
    candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"]
)
print(result)

5.2 Named Entity Recognition (NER)

Python
# Using a pipeline
ner_pipeline = pipeline(
    'ner',
    model='dslim/bert-base-NER',
    aggregation_strategy='simple'
)

text = "Apple is looking at buying U.K. startup for $1 billion"
entities = ner_pipeline(text)
for entity in entities:
    print(f"{entity['word']}: {entity['entity_group']} ({entity['score']:.4f})")

# Fine-tuning an NER model
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    'bert-base-cased',
    num_labels=9  # BIO tags: O, B-PER, I-PER, B-ORG, I-ORG, B-LOC, I-LOC, B-MISC, I-MISC
)
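
The tricky part of fine-tuning for NER is aligning word-level BIO labels with subword tokens. A minimal sketch, assuming a dataset with 'tokens' and 'ner_tags' columns (as in CoNLL-2003) and a fast tokenizer (needed for word_ids):

Python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')

def tokenize_and_align_labels(examples):
    tokenized = tokenizer(examples['tokens'], truncation=True, is_split_into_words=True)
    all_labels = []
    for i, labels in enumerate(examples['ner_tags']):
        aligned = []
        for word_id in tokenized.word_ids(batch_index=i):
            # -100 is ignored by the loss; every subword inherits its word's tag
            aligned.append(-100 if word_id is None else labels[word_id])
        all_labels.append(aligned)
    tokenized['labels'] = all_labels
    return tokenized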

5.3 Question Answering

Python
# Extractive QA
qa_pipeline = pipeline(
    'question-answering',
    model='distilbert-base-cased-distilled-squad'
)

context = """
The Transformers library provides state-of-the-art general-purpose architectures
for Natural Language Understanding (NLU) and Natural Language Generation (NLG).
"""
question = "What does the Transformers library provide?"

result = qa_pipeline(question=question, context=context)
print(result)
# {'score': 0.99, 'start': 34, 'end': 95, 'answer': 'state-of-the-art general-purpose architectures'}

# Generative (closed-book) QA with T5
generator = pipeline('text2text-generation', model='google/t5-small-ssm-nq')
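
Closed-book models like this one answer from their parameters, with no context passage; a sketch (the output shown is illustrative):

Python
result = generator("When was the Eiffel Tower built?")
print(result)  # e.g. [{'generated_text': '1889'}]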

5.4 Summarization

Python
# Using a pipeline
summarizer = pipeline('summarization', model='facebook/bart-large-cnn')

text = """
Your long text here... (at least 100 words)
"""

summary = summarizer(
    text,
    max_length=130,
    min_length=30,
    do_sample=False
)
print(summary[0]['summary_text'])

# Using T5
t5_summarizer = pipeline(
    'text2text-generation',
    model='t5-small'
)

result = t5_summarizer(
    "summarize: " + text,
    max_length=100,
    num_return_sequences=1
)

5.5 Machine Translation

Python
# Using a pipeline
translator = pipeline(
    'translation_en_to_de',
    model='t5-small'
)

result = translator("Hello, how are you?")
print(result)

# Using MarianMT
from transformers import MarianMTModel, MarianTokenizer

model_name = 'Helsinki-NLP/opus-mt-en-de'
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

text = "Hello, how are you?"
inputs = tokenizer(text, return_tensors='pt', padding=True)
translated = model.generate(**inputs)
result = tokenizer.decode(translated[0], skip_special_tokens=True)
print(result)

5.6 Text Generation

Python
# Generation with GPT-2
generator = pipeline('text-generation', model='gpt2')

prompt = "The future of AI is"
result = generator(
    prompt,
    max_length=100,
    num_return_sequences=3,
    temperature=0.8
)

for i, res in enumerate(result):
    print(f"Generation {i+1}:\n{res['generated_text']}\n")

6. Optimization and Deployment

6.1 Quantization

Python
from transformers import AutoModelForSequenceClassification
import torch

# Dynamic quantization (Linear layers to int8)
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')
quantized_model = torch.quantization.quantize_dynamic(
    model,
    {torch.nn.Linear},
    dtype=torch.qint8
)

# Save the quantized weights
torch.save(quantized_model.state_dict(), 'quantized_model.pt')

# Optimization with Optimum (ONNX Runtime backend)
from optimum.onnxruntime import ORTModelForSequenceClassification

model = ORTModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    export=True
)
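
An ORTModel drops into the regular pipeline API; a sketch (in practice you would load a fine-tuned checkpoint, since this classification head is untrained):

Python
from transformers import AutoTokenizer, pipeline

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
onnx_classifier = pipeline('text-classification', model=model, tokenizer=tokenizer)
print(onnx_classifier('This runs on ONNX Runtime'))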

6.2 Exporting to ONNX

Python
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load the model (return_dict=False gives tuple outputs, which trace and export more cleanly)
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', return_dict=False)

# Prepare an example input
text = "This is a sample text"
inputs = tokenizer(text, return_tensors='pt')

# Export to ONNX
torch.onnx.export(
    model,
    (inputs['input_ids'], inputs['attention_mask']),
    'model.onnx',
    input_names=['input_ids', 'attention_mask'],
    output_names=['output'],
    dynamic_axes={
        'input_ids': {0: 'batch_size', 1: 'sequence'},
        'attention_mask': {0: 'batch_size', 1: 'sequence'},
        'output': {0: 'batch_size'}
    },
    opset_version=11
)

6.3 Inference Optimization

Python
# Using TorchScript
from transformers import AutoModel

# torchscript=True makes the model return tuples, as torch.jit.trace requires
model = AutoModel.from_pretrained('bert-base-uncased', torchscript=True)
model.eval()

# Trace the model
example_inputs = tokenizer("Example text", return_tensors='pt')
traced_model = torch.jit.trace(
    model,
    (example_inputs['input_ids'], example_inputs['attention_mask'])
)

# Save
traced_model.save('traced_model.pt')

# Inference with ONNX Runtime
import onnxruntime as ort

session = ort.InferenceSession('model.onnx')
ort_inputs = {  # renamed to avoid shadowing the tokenizer output above
    'input_ids': inputs['input_ids'].numpy(),
    'attention_mask': inputs['attention_mask'].numpy()
}
outputs = session.run(None, ort_inputs)

6.4 Serving as an API

Python
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import pipeline

app = FastAPI()

# Load the model once at startup
classifier = pipeline('sentiment-analysis')

class TextInput(BaseModel):
    text: str

class PredictionOutput(BaseModel):
    label: str
    score: float

@app.post('/predict', response_model=PredictionOutput)
def predict(payload: TextInput):  # avoid shadowing the built-in input()
    result = classifier(payload.text)[0]
    return PredictionOutput(label=result['label'], score=result['score'])

@app.get('/health')
def health():
    return {'status': 'healthy'}

# Run with: uvicorn api:app --reload
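
A sketch of a client call, assuming the server is running locally on port 8000:

Python
import requests

resp = requests.post('http://127.0.0.1:8000/predict', json={'text': 'I love this product!'})
print(resp.json())  # e.g. {'label': 'POSITIVE', 'score': 0.9998}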

📝 Exercises

Exercise 1: Tokenizer Practice

Python
# 1. Load the tokenizer of a Chinese BERT model
# 2. Tokenize a passage of Chinese text
# 3. Inspect the special tokens and their IDs
# 4. Add custom tokens and resize the model embeddings

Exercise 2: Model Inference

Python
# 1. Load a pretrained text-classification model
# 2. Run batched inference on several texts
# 3. Extract prediction probabilities and hidden states
# 4. Visualize the attention weights

Exercise 3: Fine-Tuning

Python
# 1. Pick a text-classification dataset
# 2. Fine-tune a BERT model with the Trainer API
# 3. Evaluate the model's performance
# 4. Save and reload the fine-tuned model

Exercise 4: End-to-End NLP Project

Python
# 1. Pick an NLP task (e.g. sentiment analysis, NER)
# 2. Prepare the dataset
# 3. Fine-tune a pretrained model
# 4. Evaluate and optimize
# 5. Deploy it as an API service

🎯 Self-Check

  • Understand the Transformer architecture and self-attention
  • Master tokenizer usage
  • Can load and use pretrained models
  • Master the fine-tuning workflow
  • Can implement different NLP tasks
  • Understand model optimization and deployment



Next: 04 - Hands-On Projects