📖 Chapter 7: Machine Translation
Study time: 8 hours | Difficulty: ⭐⭐⭐⭐ | Prerequisites: Seq2Seq, attention mechanisms, Transformer | Learning goals: master the evolution from statistical to neural machine translation and understand how the Transformer is applied to translation
📋 Contents
- 1. Machine Translation Overview
- 2. Statistical Machine Translation
- 3. Neural Machine Translation Basics
- 4. Attention Mechanisms in Detail
- 5. Transformer Translation Models
- 6. The BLEU Metric
- 7. Translation Techniques and Improvements
- 8. Hands-On: A Chinese-English Translation System
- 9. Interview Essentials
- 10. Exercises
1. Machine Translation Overview
1.1 A Brief History
mt_history = {
    "1950s-1980s": {
        "method": "Rule-based machine translation (RBMT)",
        "core": "Linguists hand-write transfer rules",
        "notes": "Labor-intensive, limited coverage",
    },
    "1990s-2010s": {
        "method": "Statistical machine translation (SMT)",
        "core": "Statistical models learned from large parallel corpora",
        "representatives": "IBM models, phrase-based MT, Moses",
    },
    "2014-2017": {
        "method": "Neural machine translation (NMT)",
        "core": "End-to-end Seq2Seq + attention",
        "representatives": "Google NMT (GNMT)",
    },
    "2017-present": {
        "method": "Transformer-based translation",
        "core": "Self-attention, pretrained models",
        "representatives": "Transformer, mBART, NLLB",
    },
}
print("Machine translation timeline:")
for era, info in mt_history.items():
    print(f"\n📅 {era}: {info['method']}")
    print(f"   Core idea: {info['core']}")
1.2 Why Translation Is Hard
Translation difficulties:
1. Word-order differences:
   Chinese: 我 吃 苹果 (SVO: "I eat apples")
   Japanese: 私は りんごを 食べる (SOV)
2. Polysemy:
   "bank" → financial bank / riverbank / repository (e.g., data bank)
3. Ellipsis and implicit content:
   Chinese: (你)吃了吗? (the subject is routinely dropped)
   English: Have you eaten?
4. Long-distance dependencies:
   "The man who I told you about yesterday came."
5. Culture-specific expressions:
   加油 → "keep it up / good luck" (not literally "add oil")
2. Statistical Machine Translation
2.1 The Noisy Channel Model
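Before the code, it helps to state the decision rule this section is named after. The noisy channel model factors the translation posterior by Bayes' rule into a translation model and a language model:

$$\hat{e} = \arg\max_e P(e \mid f) = \arg\max_e P(f \mid e)\,P(e)$$

where $f$ is the source (foreign) sentence and $e$ a candidate translation. The simplified class below learns only a word-level translation table and skips the language-model term.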
from collections import defaultdict

class SimpleStatisticalMT:
    """A much-simplified statistical MT system"""
    def __init__(self):
        # Direct word-translation probabilities P(target | source).
        # (A true noisy channel would model P(f|e) and decode with P(f|e)·P(e).)
        self.translation_table = {}
        self.language_model = {}  # P(e); placeholder, unused in this simplified version

    def train_translation_model(self, parallel_corpus):
        """Learn translation probabilities from a parallel corpus (simplified IBM Model 1)"""
        # Co-occurrence counts; defaultdict returns a default value for missing keys
        cooccurrence = defaultdict(lambda: defaultdict(int))
        target_count = defaultdict(int)
        for src_sent, tgt_sent in parallel_corpus:
            src_words = src_sent.split()
            tgt_words = tgt_sent.split()
            for sw in src_words:
                for tw in tgt_words:
                    cooccurrence[sw][tw] += 1
                    target_count[tw] += 1
        # Normalize counts into translation probabilities
        for sw in cooccurrence:
            total = sum(cooccurrence[sw].values())
            self.translation_table[sw] = {
                tw: count / total
                for tw, count in cooccurrence[sw].items()
            }

    def translate_word(self, word):
        """Translate a single word"""
        if word in self.translation_table:
            translations = self.translation_table[word]
            return max(translations, key=translations.get)
        return word  # pass unknown words through unchanged

    def translate(self, sentence):
        """Word-by-word translation (bare minimum)"""
        words = sentence.split()
        translated = [self.translate_word(w) for w in words]
        return ' '.join(translated)

# Training data (a deliberately tiny, word-aligned toy corpus)
parallel_data = [
    ("我 爱 你", "I love you"),
    ("你 好", "hello"),
    ("谢谢 你", "thank you"),
    ("我 是 学生", "I am student"),
    ("他 爱 她", "he love her"),
    ("你 是 老师", "you are teacher"),
]
smt = SimpleStatisticalMT()
smt.train_translation_model(parallel_data)

# Translate
test_sentences = ["我 爱 她", "你 是 学生"]
for sent in test_sentences:
    result = smt.translate(sent)
    print(f"'{sent}' → '{result}'")

print("\nTranslation probability table (excerpt):")
for word in ['我', '爱', '你']:
    if word in smt.translation_table:
        # lambda: anonymous sort key; [:3] slices out the top-3 entries
        top3 = sorted(smt.translation_table[word].items(), key=lambda x: -x[1])[:3]
        print(f"  '{word}': {top3}")
3. Neural Machine Translation Basics
3.1 Seq2Seq + Attention for Translation
import torch
import torch.nn as nn
import torch.nn.functional as F
import random

class NMTEncoder(nn.Module):  # subclass nn.Module to define a network
    """NMT encoder"""
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=2, dropout=0.3):
        super().__init__()  # super() calls the parent-class initializer
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.GRU(embed_dim, hidden_dim, num_layers=num_layers,
                          bidirectional=True, batch_first=True,
                          dropout=dropout if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, hidden = self.rnn(embedded)
        # Merge the top layer's final forward/backward hidden states
        # ([-1] indexes the last element; torch.cat concatenates along an existing dim)
        hidden = torch.cat([hidden[-2], hidden[-1]], dim=1)
        hidden = torch.tanh(self.fc(hidden)).unsqueeze(0)  # unsqueeze adds a dimension
        return outputs, hidden

class NMTAttention(nn.Module):
    """Attention layer"""
    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim * 3, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, decoder_hidden, encoder_outputs):
        src_len = encoder_outputs.size(1)
        # squeeze drops a size-1 dim; repeat tiles the query across source positions
        hidden = decoder_hidden.squeeze(0).unsqueeze(1).repeat(1, src_len, 1)
        energy = torch.tanh(self.attn(torch.cat([hidden, encoder_outputs], dim=2)))
        attention = self.v(energy).squeeze(2)
        return F.softmax(attention, dim=1)  # F.* is PyTorch's functional API

class NMTDecoder(nn.Module):
    """NMT decoder (with attention)"""
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_layers=1, dropout=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.attention = NMTAttention(hidden_dim)
        self.rnn = nn.GRU(embed_dim + hidden_dim * 2, hidden_dim,
                          num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 3 + embed_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, trg_token, hidden, encoder_outputs):
        embedded = self.dropout(self.embedding(trg_token))
        # Attention over the encoder outputs
        attn_weights = self.attention(hidden, encoder_outputs)
        context = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)
        # Concatenate embedding and context as the RNN input
        rnn_input = torch.cat([embedded, context], dim=2)
        output, hidden = self.rnn(rnn_input, hidden)
        # Predict the next token
        prediction = self.fc(torch.cat([output, context, embedded], dim=2))
        return prediction, hidden, attn_weights

class NMTModel(nn.Module):
    """Full NMT model"""
    def __init__(self, encoder, decoder, device='cpu'):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        trg_len = trg.size(1)
        trg_vocab_size = self.decoder.fc.out_features
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        encoder_outputs, hidden = self.encoder(src)
        dec_input = trg[:, 0:1]  # <BOS>
        for t in range(1, trg_len):
            output, hidden, _ = self.decoder(dec_input, hidden, encoder_outputs)
            outputs[:, t:t+1, :] = output
            # Teacher forcing: sometimes feed the gold token, sometimes the model's prediction
            if random.random() < teacher_forcing_ratio:
                dec_input = trg[:, t:t+1]
            else:
                dec_input = output.argmax(dim=-1)
        return outputs

    def translate(self, src, max_len=50, sos_idx=1, eos_idx=2):
        """Translate a single sentence (greedy decoding)"""
        self.eval()  # switch to evaluation mode
        with torch.no_grad():  # disable gradient tracking to save memory
            encoder_outputs, hidden = self.encoder(src)
            dec_input = torch.LongTensor([[sos_idx]]).to(self.device)
            result = []
            for _ in range(max_len):
                output, hidden, attn = self.decoder(dec_input, hidden, encoder_outputs)
                pred_token = output.argmax(dim=-1)
                if pred_token.item() == eos_idx:  # .item() converts a 1-element tensor to a Python number
                    break
                result.append(pred_token.item())
                dec_input = pred_token
        return result

# Build the model
src_vocab = 5000
trg_vocab = 5000
embed_dim = 128
hidden_dim = 256
encoder = NMTEncoder(src_vocab, embed_dim, hidden_dim)
decoder = NMTDecoder(trg_vocab, embed_dim, hidden_dim)
nmt = NMTModel(encoder, decoder)
total_params = sum(p.numel() for p in nmt.parameters())
print(f"NMT model parameter count: {total_params:,}")
4. Attention Mechanisms in Detail
4.1 Three Ways to Compute Attention
class AttentionComparison:
    """Comparing three attention mechanisms"""
    @staticmethod  # @staticmethod: callable without an instance
    def dot_attention(Q, K, V):
        """Scaled dot-product attention"""
        d_k = Q.size(-1)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (d_k ** 0.5)
        weights = F.softmax(scores, dim=-1)
        return torch.matmul(weights, V), weights

    @staticmethod
    def additive_attention(Q, K, V, W_q, W_k, v):
        """Additive attention (Bahdanau)"""
        scores = v(torch.tanh(W_q(Q) + W_k(K)))
        weights = F.softmax(scores, dim=1)
        return (weights * V).sum(dim=1, keepdim=True), weights

    @staticmethod
    def multi_head_attention(Q, K, V, num_heads=8):
        """Multi-head attention"""
        d_model = Q.size(-1)
        d_k = d_model // num_heads
        batch_size = Q.size(0)
        # Split into heads (view reshapes the tensor)
        Q = Q.view(batch_size, -1, num_heads, d_k).transpose(1, 2)
        K = K.view(batch_size, -1, num_heads, d_k).transpose(1, 2)
        V = V.view(batch_size, -1, num_heads, d_k).transpose(1, 2)
        # Per-head scaled dot-product attention
        scores = torch.matmul(Q, K.transpose(-2, -1)) / (d_k ** 0.5)
        weights = F.softmax(scores, dim=-1)
        context = torch.matmul(weights, V)
        # Concatenate the heads back together
        context = context.transpose(1, 2).contiguous().view(batch_size, -1, d_model)
        return context, weights

# Tests
batch, seq_len, d_model = 2, 10, 64
Q = torch.randn(batch, seq_len, d_model)
K = torch.randn(batch, seq_len, d_model)
V = torch.randn(batch, seq_len, d_model)
attn_comp = AttentionComparison()
out, w = attn_comp.dot_attention(Q, K, V)
print(f"Dot-product attention - output: {out.shape}, weights: {w.shape}")
out, w = attn_comp.multi_head_attention(Q, K, V, num_heads=8)
print(f"Multi-head attention - output: {out.shape}, weights: {w.shape}")
5. Transformer Translation Models
5.1 The Full Transformer Architecture
Transformer (Encoder-Decoder):
   Source input               Target input
        │                          │
        ▼                          ▼
  ┌───────────┐              ┌───────────┐
  │ Embedding │              │ Embedding │
  │ + PosEnc  │              │ + PosEnc  │
  └───────────┘              └───────────┘
        │                          │
        ▼                          ▼
  ┌───────────┐              ┌───────────┐
  │  Encoder  │   ──────→    │  Decoder  │
  │    × N    │    K, V      │    × N    │
  │           │              │           │
  │ Self-Attn │              │Masked Attn│
  │    FFN    │              │Cross-Attn │
  │           │              │    FFN    │
  └───────────┘              └───────────┘
                                   │
                                   ▼
                             ┌───────────┐
                             │  Linear   │
                             │  Softmax  │
                             └───────────┘
                                   │
                                   ▼
                            Translated output
5.2 A Transformer Translator in PyTorch
import math

class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding"""
    def __init__(self, d_model, max_len=5000, dropout=0.1):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1).float()
        div_term = torch.exp(torch.arange(0, d_model, 2).float() *
                             -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)  # saved with the model, but not a trainable parameter

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

class TranslationTransformer(nn.Module):
    """Transformer-based translation model"""
    def __init__(self, src_vocab_size, trg_vocab_size, d_model=256,
                 num_heads=8, num_layers=3, d_ff=512, max_len=100, dropout=0.1):
        super().__init__()
        # Source/target embeddings
        self.src_embedding = nn.Embedding(src_vocab_size, d_model, padding_idx=0)
        self.trg_embedding = nn.Embedding(trg_vocab_size, d_model, padding_idx=0)
        self.pos_encoding = PositionalEncoding(d_model, max_len, dropout)
        # Transformer
        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=d_ff,
            dropout=dropout,
            batch_first=True,
        )
        # Output projection
        self.fc_out = nn.Linear(d_model, trg_vocab_size)
        self.d_model = d_model

    def forward(self, src, trg, src_mask=None, trg_mask=None,
                src_padding_mask=None, trg_padding_mask=None):
        # Scale embeddings by sqrt(d_model), as in the original paper
        src_emb = self.pos_encoding(self.src_embedding(src) * (self.d_model ** 0.5))
        trg_emb = self.pos_encoding(self.trg_embedding(trg) * (self.d_model ** 0.5))
        output = self.transformer(
            src_emb, trg_emb,
            src_mask=src_mask,
            tgt_mask=trg_mask,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=trg_padding_mask,
        )
        return self.fc_out(output)

# Build the translation model
trans_model = TranslationTransformer(
    src_vocab_size=5000,
    trg_vocab_size=5000,
    d_model=256,
    num_heads=8,
    num_layers=3,
)
total_params = sum(p.numel() for p in trans_model.parameters())
print(f"Transformer translation model parameter count: {total_params:,}")
6. The BLEU Metric
6.1 Computing BLEU
from collections import Counter
import math

def compute_bleu(reference, candidate, max_n=4):
    """Compute a sentence-level BLEU score"""
    def get_ngrams(tokens, n):
        return [tuple(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]

    def count_ngram_matches(ref_tokens, cand_tokens, n):
        ref_ngrams = Counter(get_ngrams(ref_tokens, n))  # Counter tallies occurrences
        cand_ngrams = Counter(get_ngrams(cand_tokens, n))
        # Clipped counts: an n-gram is credited at most as often as it appears in the reference
        matches = 0
        for ngram, count in cand_ngrams.items():
            matches += min(count, ref_ngrams.get(ngram, 0))
        total = sum(cand_ngrams.values())
        return matches, total

    if isinstance(reference, str):  # isinstance checks the type
        ref_tokens = reference.split()
    else:
        ref_tokens = reference
    if isinstance(candidate, str):
        cand_tokens = candidate.split()
    else:
        cand_tokens = candidate

    # 1. n-gram precisions
    precisions = []
    for n in range(1, max_n + 1):
        matches, total = count_ngram_matches(ref_tokens, cand_tokens, n)
        if total == 0:
            precision = 0
        else:
            precision = matches / total
        precisions.append(precision)

    # 2. Brevity penalty (BP)
    c = len(cand_tokens)
    r = len(ref_tokens)
    if c > r:
        bp = 1
    elif c == 0:
        bp = 0
    else:
        bp = math.exp(1 - r / c)

    # 3. BLEU = BP · exp(mean of the log precisions)
    if any(p == 0 for p in precisions):  # any() is True if any element is True
        bleu = 0
    else:
        log_avg = sum(math.log(p) for p in precisions) / max_n
        bleu = bp * math.exp(log_avg)

    return {
        'bleu': bleu,
        'precisions': precisions,
        'bp': bp,
        'ratio': c / r if r > 0 else 0,
    }

# BLEU tests
reference = "the cat is on the mat"
candidate1 = "the cat is on the mat"    # perfect translation
candidate2 = "the the the the the the"  # degenerate translation
candidate3 = "the cat sat on the mat"   # near-miss translation
print("BLEU evaluation:")
for i, cand in enumerate([candidate1, candidate2, candidate3], 1):  # enumerate yields index and element
    result = compute_bleu(reference, cand)
    print(f"\n  Candidate {i}: '{cand}'")
    print(f"    BLEU: {result['bleu']:.4f}")
    print(f"    1-gram precision: {result['precisions'][0]:.4f}")
    print(f"    2-gram precision: {result['precisions'][1]:.4f}")
    print(f"    BP: {result['bp']:.4f}")
    print(f"    Length ratio: {result['ratio']:.4f}")

# For real evaluations, use sacrebleu (the de facto standard):
print("\nRecommended: the sacrebleu library:")
print("  pip install sacrebleu")
print("  import sacrebleu")
print("  bleu = sacrebleu.corpus_bleu([candidate], [[reference]])")
7. Translation Techniques and Improvements
translation_techniques = {
    "Subword segmentation": {
        "method": "BPE (Byte Pair Encoding)",
        "benefit": "Handles OOV words; balances vocabulary size against sequence length",
        "tools": "sentencepiece, tokenizers",
    },
    "Back-translation": {
        "method": "Translate target-language monolingual data with a target→source model to create pseudo-parallel data",
        "benefit": "A cheap way to enlarge the training set",
    },
    "Ensembling": {
        "method": "Combine the outputs of several models",
        "benefit": "Higher translation quality",
    },
    "Label smoothing": {
        "method": "Assign the true label probability 1 - ε (e.g., 0.9) and spread ε over the rest",
        "benefit": "Reduces overfitting, improves generalization",
    },
    "Length penalty": {
        "method": "Length-normalize hypothesis scores during beam search (see the sketch after this list)",
        "benefit": "Avoids overly short translations",
    },
}
for tech, info in translation_techniques.items():
    print(f"📌 {tech}: {info['benefit']}")
8. Hands-On: A Chinese-English Translation System
"""
Hands-on project: a simple Chinese-English translation system
using Hugging Face Transformers
"""
# Option 1: a pretrained translation model
def translate_with_hf(text, model_name="Helsinki-NLP/opus-mt-zh-en"):
    """Translate with a pretrained Hugging Face model"""
    try:  # try/except catches exceptions (e.g., a missing package or model files)
        from transformers import MarianMTModel, MarianTokenizer
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
        inputs = tokenizer(text, return_tensors="pt", padding=True)
        translated = model.generate(**inputs)
        result = tokenizer.decode(translated[0], skip_special_tokens=True)
        return result
    except Exception as e:
        return f"Requires transformers and a downloaded model: {e}"

# Option 2: an LLM API (OpenAI-style pseudocode)
def translate_with_llm(text, target_lang="English"):
    """Translate via a large-model API (pseudocode)"""
    # Chinese prompt, meaning: "Translate the following Chinese into {target_lang}; output only the translation:"
    prompt = f"请将以下中文翻译成{target_lang},只输出翻译结果:\n{text}"
    # response = openai.chat.completions.create(...)
    return prompt

# Tests (the Chinese inputs are kept as-is: they are the data to be translated)
test_texts = [
    "自然语言处理是人工智能的重要分支",
    "机器翻译是NLP最早的应用之一",
    "深度学习极大地推动了翻译质量的提升",
]
print("Translation test:")
for text in test_texts:
    print(f"  Chinese: {text}")
    print(f"  Prompt: {translate_with_llm(text)}")
    print()
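`model.generate(**inputs)` above decodes greedily by default; beam search usually improves translation quality. A variant of the same call using standard `generate` arguments:

```python
translated = model.generate(**inputs, num_beams=4, length_penalty=0.6, max_new_tokens=128)
```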
9. Interview Essentials
🔑 High-frequency interview topics
Topic 1: What advantages does the Transformer have over RNNs for translation?
✅ Key points for a strong answer:
1. Parallelism: self-attention computes all positions at once; an RNN must step through tokens sequentially
2. Long-distance dependencies: attention connects any two positions directly; an RNN relays information over many steps
3. Information bottleneck: an RNN squeezes the whole input into a fixed-length vector; the Transformer has no such constraint
4. Interpretability: attention weights can be visualized as soft alignments
5. Drawback: self-attention is O(n²) in sequence length, so long sequences are memory-hungry
Topic 2: Strengths and weaknesses of BLEU?
✅ Key points for a strong answer:
Strengths:
- Automatic and cheap to compute
- Correlates reasonably well with human judgments
- The industry standard, so results are comparable across papers
Weaknesses:
- Pure n-gram overlap; ignores meaning
- Cannot assess fluency
- Gives no credit to synonymous phrasings
- Unstable on short texts
Alternative/complementary metrics:
- METEOR (synonym-aware)
- TER (minimum edit distance)
- COMET (evaluation based on pretrained models)
- BERTScore (semantic similarity)
Topic 3: How do you handle low-resource languages in machine translation?
✅ Key points for a strong answer:
1. Back-translation: exploit monolingual data
2. Multilingual pretraining: mBART, mT5, NLLB
3. Transfer learning: from high-resource to low-resource pairs
4. Pivot translation: A → high-resource language → B
5. Data augmentation: synonym substitution, word-order perturbation
6. LLM few-shot prompting: exploit large models' multilingual ability
10. Exercises
📝 Basics
- Explain the information-bottleneck problem in the Encoder-Decoder architecture and how attention solves it.
Answer: The bottleneck: in vanilla Seq2Seq, the encoder compresses the entire input sequence into a single fixed-length context vector c, and the decoder generates everything from c alone. For long inputs, a fixed-dimensional c cannot encode all the information, so translation quality degrades with sentence length. How attention solves it: at each decoding step, the decoder computes attention weights over all encoder hidden states and forms a weighted sum, yielding a context vector specific to that step. Each step can thus "look at" different parts of the input, and information is no longer squeezed through one vector.
- Work through a BLEU calculation by hand.
Answer: Reference: "the cat is on the mat"; candidate: "the the the the". (1) Clipped precision: "the" appears 4 times in the candidate but at most 2 times in the reference, so precision_1 = 2/4 = 0.5; the bigram "the the" never occurs in the reference, so precision_2 = 0. (2) Brevity penalty: c = 4, r = 6, BP = exp(1 - 6/4) ≈ 0.607. (3) Since p2 = 0, BLEU-2 = 0. This shows BLEU punishing repetitive, meaningless output. A reasonable translation such as "the cat sat on the mat" scores well under BLEU-2 (p1 = 5/6, p2 = 3/5, BLEU-2 ≈ 0.707), though its unsmoothed BLEU-4 is still 0 because no 4-gram matches, as the demo in Section 6 shows.
💻 Programming
- Implement a complete BLEU computation function.
- Implement a Seq2Seq translation model with attention in PyTorch.
- Complete a Chinese-English translation task with a pretrained Hugging Face model.
🔬 Discussion
- In the era of large language models, are dedicated translation models still necessary?
Answer: Yes, they still matter. (1) Efficiency/cost: dedicated models (e.g., NLLB) are smaller, faster, and cheaper to run, which suits high-volume translation; (2) Controllability: it is easier to enforce terminology consistency, style, and domain adaptation; (3) Offline/privacy: they can be deployed on-premises where data privacy matters; (4) Low-resource languages: dedicated models still hold an edge on low-resource pairs. That said, LLM translation quality already approaches or exceeds dedicated models on general text; the likely trend is LLMs covering general-purpose translation while dedicated models serve specialized domains (legal, medical) and real-time scenarios.
✅ Self-Check List
□ I understand the evolution from SMT to NMT
□ I can implement a Seq2Seq + Attention translation model
□ I understand how the Transformer is applied to translation
□ I can compute a BLEU score by hand
□ I know the common techniques for improving translation
□ I have completed at least 3 exercises
📚 Further Reading
- Attention Is All You Need (the original Transformer paper)
- Neural Machine Translation by Jointly Learning to Align and Translate (attention for NMT)
- No Language Left Behind (NLLB): Meta's massively multilingual translation
- SacreBLEU: A Call for Clarity in Reporting BLEU Scores
Next chapter: 08 Information Extraction: relation extraction and knowledge graph construction