04 - Model Evaluation and Benchmarking (Comprehensive Edition)¶
⚠️ Timeliness note: this chapter touches on frontier models, pricing, leaderboards, and similar fast-moving information; defer to the original papers, official release pages, and API documentation.
Learning objectives: master the metric system for LLM evaluation, the mainstream benchmarks, and how to use evaluation frameworks.
1. Evaluation Overview¶
1.1 Why Evaluation Matters¶
Text Only
Why LLM evaluation matters
1. Mapping capability boundaries
├── Know what the model is good at
├── Know what it is bad at
└── Guide the direction of improvement
2. Grounds for model selection
├── Different tasks call for different models
├── Trade off performance against cost
└── Avoid over-reliance on a single metric
3. Identifying safety risks
├── Bias and fairness
├── Harmful content generation
└── Hallucination detection
4. Guiding iterative optimization
├── Monitoring during training
├── Validating ablation experiments
└── Comparing model versions
1.2 Evaluation Dimensions¶
Text Only
LLM evaluation dimensions
═══════════════════════════════════════════════════════════════════
Dimension               What is evaluated                         Common metrics
─────────────────────────────────────────────────────────────────
Language ability        Grammar, semantics, fluency               Perplexity, BLEU
Knowledge               Factual knowledge, common sense           Accuracy, EM
Reasoning               Logical and mathematical reasoning        Accuracy, Pass@k
Code                    Code generation and understanding         Pass@k, HumanEval
Instruction following   Understanding and executing instructions  Instruction Following
Safety                  Harmful content, bias                     Safety Score
Efficiency              Speed, memory, cost                       Tokens/s, FLOPs
═══════════════════════════════════════════════════════════════════
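Most rows in this table are covered by dedicated benchmarks later in the chapter; the efficiency row is usually measured directly. As a minimal sketch (the model name, prompt, and generation settings here are illustrative, not a standard protocol), decoding throughput in tokens per second can be estimated like this:

Python
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

def measure_throughput(model_name="gpt2", prompt="The quick brown fox", max_new_tokens=128):
    """Rough tokens/s estimate; real benchmarks should warm up, pin the device, and average runs."""
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    inputs = tokenizer(prompt, return_tensors="pt")
    start = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
    elapsed = time.perf_counter() - start
    new_tokens = outputs.shape[1] - inputs["input_ids"].shape[1]
    return new_tokens / elapsed

# print(f"{measure_throughput():.1f} tokens/s")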
2. Language Modeling Metrics¶
2.1 Perplexity¶
Python
import torch
import torch.nn.functional as F
class PerplexityEvaluator:
    """
    Perplexity evaluator.

    Perplexity = exp(-1/N * Σ log P(x_i | x_<i))

    Lower is better: it means the model predicts the text more confidently.
    """
    @staticmethod  # @staticmethod: callable without an instance
    def compute_perplexity(model, tokenizer, texts, batch_size=8, device='cuda'):
        """
        Compute perplexity.

        Args:
            model: language model
            tokenizer: tokenizer
            texts: list of texts
            batch_size: batch size
            device: compute device

        Returns:
            perplexity: average perplexity
            details: per-text perplexity
        """
        model.eval()
        model.to(device)  # .to(device) moves data to GPU/CPU

        total_loss = 0
        total_tokens = 0
        details = []

        with torch.no_grad():  # disable gradients to save memory at inference time
            for i in range(0, len(texts), batch_size):
                batch_texts = texts[i:i+batch_size]

                # Encode
                encodings = tokenizer(
                    batch_texts,
                    return_tensors='pt',
                    padding=True,
                    truncation=True,
                    max_length=2048
                )
                input_ids = encodings['input_ids'].to(device)
                attention_mask = encodings['attention_mask'].to(device)

                # Forward pass
                outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
                loss = outputs.loss

                # Count non-padding tokens
                num_tokens = attention_mask.sum().item()

                # Accumulate
                total_loss += loss.item() * num_tokens
                total_tokens += num_tokens

                # Record a perplexity for each text
                for j, text in enumerate(batch_texts):  # enumerate yields index and element together
                    text_tokens = attention_mask[j].sum().item()
                    if text_tokens > 0:
                        # Simplification: reuse the batch-average loss as a proxy for this text's loss
                        text_ppl = torch.exp(loss).item()
                        details.append({
                            'text': text[:100] + '...' if len(text) > 100 else text,
                            'perplexity': text_ppl,
                            'tokens': text_tokens
                        })

        # Average perplexity
        avg_loss = total_loss / total_tokens if total_tokens > 0 else float('inf')
        perplexity = torch.exp(torch.tensor(avg_loss)).item()

        return perplexity, details
    @staticmethod
    def compute_perplexity_by_token(model, tokenizer, text, device='cuda'):
        """
        Compute perplexity token by token (useful for analysis).
        """
        model.eval()
        model.to(device)

        # Encode
        input_ids = tokenizer.encode(text, return_tensors='pt').to(device)

        perplexities = []
        tokens = []

        with torch.no_grad():
            for i in range(1, input_ids.size(1)):
                # Context: the first i tokens
                context = input_ids[:, :i]
                target = input_ids[:, i]

                # Predict
                outputs = model(context)
                logits = outputs.logits[:, -1, :]

                # Probability of the target token
                probs = F.softmax(logits, dim=-1)
                target_prob = probs[0, target].item()

                # Per-token perplexity
                token_ppl = 1 / target_prob if target_prob > 0 else float('inf')
                perplexities.append(token_ppl)
                tokens.append(tokenizer.decode(target))

        return tokens, perplexities
# Usage example
def example_perplexity():
from transformers import GPT2LMHeadModel, GPT2Tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
texts = [
"The cat sat on the mat.",
"Machine learning is a subset of artificial intelligence.",
"Quantum computing uses quantum bits or qubits."
]
evaluator = PerplexityEvaluator()
ppl, details = evaluator.compute_perplexity(model, tokenizer, texts)
print(f"Average Perplexity: {ppl:.2f}")
print("\nDetails:")
for d in details:
print(f" PPL: {d['perplexity']:.2f} | Tokens: {d['tokens']} | Text: {d['text'][:50]}")
# example_perplexity()
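The batch implementation above truncates anything longer than max_length, which biases perplexity on long documents. A common remedy is strided (sliding-window) evaluation, in the spirit of the Hugging Face perplexity guide; the sketch below assumes the same model/tokenizer objects as above and treats token counting as approximate:

Python
import torch

def strided_perplexity(model, tokenizer, text, max_length=1024, stride=512, device="cuda"):
    """Sliding-window perplexity for texts longer than the model's context window."""
    model.eval()
    model.to(device)
    input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
    seq_len = input_ids.size(1)
    nll_sum, token_count, prev_end = 0.0, 0, 0
    for begin in range(0, seq_len, stride):
        end = min(begin + max_length, seq_len)
        trg_len = end - prev_end  # only score tokens not already scored
        ids = input_ids[:, begin:end]
        labels = ids.clone()
        labels[:, :-trg_len] = -100  # mask the overlapping context (loss ignores -100)
        with torch.no_grad():
            loss = model(ids, labels=labels).loss
        nll_sum += loss.item() * trg_len  # approximate: ignores the one-token label shift
        token_count += trg_len
        prev_end = end
        if end == seq_len:
            break
    return torch.exp(torch.tensor(nll_sum / token_count)).item()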
2.2 Bits Per Character (BPC)¶
Python
class BPCEvaluator:
    """
    Bits-per-character evaluator.

    BPC = log2(PPL) / avg_chars_per_token

    Useful for comparing models that use different tokenizers.
    """
    @staticmethod
    def compute_bpc(model, tokenizer, text, device='cuda'):
        """
        Compute BPC.
        """
        model.eval()
        model.to(device)

        # Encode
        input_ids = tokenizer.encode(text, return_tensors='pt').to(device)

        with torch.no_grad():
            outputs = model(input_ids, labels=input_ids)
            loss = outputs.loss

        # Perplexity
        ppl = torch.exp(loss).item()

        # Average characters per token
        num_tokens = input_ids.size(1)
        num_chars = len(text)
        avg_chars_per_token = num_chars / num_tokens if num_tokens > 0 else 0

        # BPC
        bpc = torch.log2(torch.tensor(ppl)).item() / avg_chars_per_token if avg_chars_per_token > 0 else float('inf')

        return {
            'bpc': bpc,
            'perplexity': ppl,
            'num_tokens': num_tokens,
            'num_chars': num_chars,
            'avg_chars_per_token': avg_chars_per_token
        }
3. Generation Quality Metrics¶
3.1 BLEU (Bilingual Evaluation Understudy)¶
Python
from collections import Counter
import math
class BLEUEvaluator:
    """
    BLEU evaluator.

    Measures the similarity between generated text and reference text.
    Range: 0-1, higher is better.
    """
    @staticmethod
    def compute_bleu(reference, candidate, max_n=4):
        """
        Compute the BLEU score.

        Args:
            reference: reference text (string or list of strings)
            candidate: generated text
            max_n: maximum n-gram order
        """
        if isinstance(reference, str):  # isinstance checks the type
            reference = [reference]

        # Tokenize (naive whitespace split; use a proper tokenizer in practice)
        candidate_tokens = candidate.split()
        reference_tokens_list = [ref.split() for ref in reference]

        # n-gram precisions
        precisions = []
        for n in range(1, max_n + 1):
            precision = BLEUEvaluator._compute_ngram_precision(
                candidate_tokens, reference_tokens_list, n
            )
            precisions.append(precision)

        # Geometric mean
        if all(p > 0 for p in precisions):  # all() is True only if every element is True
            geo_mean = math.exp(sum(math.log(p) for p in precisions) / len(precisions))
        else:
            geo_mean = 0

        # Brevity penalty (shortest-reference variant; standard BLEU uses the
        # reference length closest to the candidate length)
        candidate_len = len(candidate_tokens)
        reference_len = min(len(ref) for ref in reference_tokens_list)

        if candidate_len > reference_len:
            bp = 1
        else:
            bp = math.exp(1 - reference_len / candidate_len) if candidate_len > 0 else 0

        # Final BLEU score
        bleu = bp * geo_mean

        return {
            'bleu': bleu,
            'bleu_1': precisions[0],
            'bleu_2': precisions[1] if len(precisions) > 1 else 0,
            'bleu_3': precisions[2] if len(precisions) > 2 else 0,
            'bleu_4': precisions[3] if len(precisions) > 3 else 0,
            'brevity_penalty': bp
        }

    @staticmethod
    def _compute_ngram_precision(candidate, references, n):
        """
        Compute n-gram precision.
        """
        # n-grams of the candidate
        candidate_ngrams = Counter()  # Counter tallies element occurrences
        for i in range(len(candidate) - n + 1):
            ngram = tuple(candidate[i:i+n])
            candidate_ngrams[ngram] += 1

        # n-grams of the references (take the max count per n-gram)
        reference_ngrams = Counter()
        for ref in references:
            ref_ngrams = Counter()
            for i in range(len(ref) - n + 1):
                ngram = tuple(ref[i:i+n])
                ref_ngrams[ngram] += 1
            # Keep the maximum count for each n-gram
            for ngram, count in ref_ngrams.items():
                reference_ngrams[ngram] = max(reference_ngrams[ngram], count)

        # Clipped counts
        clipped_count = 0
        total_count = 0
        for ngram, count in candidate_ngrams.items():
            clipped_count += min(count, reference_ngrams.get(ngram, 0))
            total_count += count

        # Precision
        precision = clipped_count / total_count if total_count > 0 else 0
        return precision
# Usage example
def example_bleu():
evaluator = BLEUEvaluator()
reference = "The cat is sitting on the mat"
candidate = "The cat sat on the mat"
result = evaluator.compute_bleu(reference, candidate)
print(f"BLEU: {result['bleu']:.4f}")
print(f"BLEU-1: {result['bleu_1']:.4f}")
print(f"BLEU-4: {result['bleu_4']:.4f}")
# example_bleu()
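The class above is for pedagogy. For numbers you intend to publish, tokenization must be standardized, which is why reported BLEU typically comes from the sacrebleu package (pip install sacrebleu). A minimal sketch of its corpus-level API:

Python
import sacrebleu

hypotheses = ["The cat sat on the mat"]
references = [["The cat is sitting on the mat"]]  # one reference stream, parallel to hypotheses

bleu = sacrebleu.corpus_bleu(hypotheses, references)
print(bleu.score)  # note: sacrebleu reports on a 0-100 scale, unlike the 0-1 scale above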
3.2 ROUGE (Recall-Oriented Understudy for Gisting Evaluation)¶
Python
class ROUGEEvaluator:
    """
    ROUGE evaluator.

    Mainly used for summarization quality; emphasizes recall.
    """
    @staticmethod
    def compute_rouge(reference, candidate, n=2):
        """
        Compute the ROUGE-N score.

        Args:
            reference: reference text
            candidate: generated text
            n: n-gram order
        """
        # Tokenize
        ref_tokens = reference.split()
        cand_tokens = candidate.split()

        # Collect n-grams (Counters so repeated n-grams are counted)
        from collections import Counter
        ref_ngrams = Counter()
        for i in range(len(ref_tokens) - n + 1):
            ref_ngrams[tuple(ref_tokens[i:i+n])] += 1

        cand_ngrams = Counter()
        for i in range(len(cand_tokens) - n + 1):
            cand_ngrams[tuple(cand_tokens[i:i+n])] += 1

        # Overlap (Counter intersection takes the min count per n-gram)
        overlap = sum((ref_ngrams & cand_ngrams).values())

        # Recall
        ref_total = sum(ref_ngrams.values())
        recall = overlap / ref_total if ref_total > 0 else 0

        # Precision
        cand_total = sum(cand_ngrams.values())
        precision = overlap / cand_total if cand_total > 0 else 0

        # F1
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        return {
            f'rouge_{n}': {
                'recall': recall,
                'precision': precision,
                'f1': f1
            }
        }

    @staticmethod
    def compute_rouge_l(reference, candidate):
        """
        Compute ROUGE-L (longest common subsequence).
        """
        ref_tokens = reference.split()
        cand_tokens = candidate.split()

        # LCS length
        lcs_length = ROUGEEvaluator._lcs_length(ref_tokens, cand_tokens)

        # Recall
        recall = lcs_length / len(ref_tokens) if len(ref_tokens) > 0 else 0

        # Precision
        precision = lcs_length / len(cand_tokens) if len(cand_tokens) > 0 else 0

        # F1
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

        return {
            'rouge_l': {
                'recall': recall,
                'precision': precision,
                'f1': f1
            }
        }

    @staticmethod
    def _lcs_length(seq1, seq2):
        """
        Longest-common-subsequence length via dynamic programming.
        """
        m, n = len(seq1), len(seq2)
        dp = [[0] * (n + 1) for _ in range(m + 1)]

        for i in range(1, m + 1):
            for j in range(1, n + 1):
                if seq1[i-1] == seq2[j-1]:
                    dp[i][j] = dp[i-1][j-1] + 1
                else:
                    dp[i][j] = max(dp[i-1][j], dp[i][j-1])

        return dp[m][n]
# Usage example
def example_rouge():
evaluator = ROUGEEvaluator()
reference = "The cat is sitting on the mat and looking outside"
candidate = "The cat sat on the mat"
rouge_1 = evaluator.compute_rouge(reference, candidate, n=1)
rouge_2 = evaluator.compute_rouge(reference, candidate, n=2)
rouge_l = evaluator.compute_rouge_l(reference, candidate)
print(f"ROUGE-1 F1: {rouge_1['rouge_1']['f1']:.4f}")
print(f"ROUGE-2 F1: {rouge_2['rouge_2']['f1']:.4f}")
print(f"ROUGE-L F1: {rouge_l['rouge_l']['f1']:.4f}")
# example_rouge()
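As with BLEU, reported ROUGE numbers usually come from a standard implementation such as Google's rouge-score package (pip install rouge-score), which adds stemming and consistent tokenization. A sketch:

Python
from rouge_score import rouge_scorer

scorer = rouge_scorer.RougeScorer(["rouge1", "rouge2", "rougeL"], use_stemmer=True)
scores = scorer.score(
    "The cat is sitting on the mat and looking outside",  # reference (target)
    "The cat sat on the mat",                             # candidate (prediction)
)
for name, s in scores.items():
    print(f"{name}: P={s.precision:.3f} R={s.recall:.3f} F1={s.fmeasure:.3f}")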
4. Mainstream Benchmarks¶
4.1 MMLU (Massive Multitask Language Understanding)¶
Python
class MMLUEvaluator:
    """
    MMLU evaluator.

    Tests a model on multiple-choice questions across 57 subjects,
    including mathematics, history, computer science, law, and more.
    """
SUBJECTS = [
'abstract_algebra', 'anatomy', 'astronomy', 'business_ethics',
'clinical_knowledge', 'college_biology', 'college_chemistry',
'college_computer_science', 'college_mathematics', 'college_medicine',
'college_physics', 'computer_security', 'conceptual_physics',
'econometrics', 'electrical_engineering', 'elementary_mathematics',
'formal_logic', 'global_facts', 'high_school_biology',
'high_school_chemistry', 'high_school_computer_science',
'high_school_european_history', 'high_school_geography',
'high_school_government_and_politics', 'high_school_macroeconomics',
'high_school_mathematics', 'high_school_microeconomics',
'high_school_physics', 'high_school_psychology', 'high_school_statistics',
'high_school_us_history', 'high_school_world_history', 'human_aging',
'human_sexuality', 'international_law', 'jurisprudence',
'logical_fallacies', 'machine_learning', 'management',
'marketing', 'medical_genetics', 'miscellaneous',
'moral_disputes', 'moral_scenarios', 'nutrition',
'philosophy', 'prehistory', 'professional_accounting',
'professional_law', 'professional_medicine', 'professional_psychology',
'public_relations', 'security_studies', 'sociology',
'us_foreign_policy', 'virology', 'world_religions'
]
def __init__(self, model, tokenizer):
self.model = model
self.tokenizer = tokenizer
    def evaluate_subject(self, subject, test_data):
        """
        Evaluate a single subject.

        test_data format:
        [
            {
                'question': 'What is the capital of France?',
                'choices': ['London', 'Paris', 'Berlin', 'Madrid'],
                'answer': 1  # index of the correct choice
            },
            ...
        ]
        """
        correct = 0
        total = len(test_data)

        for item in test_data:
            # Build the prompt
            prompt = self._build_prompt(item)

            # Get the model's prediction
            predicted_answer = self._predict_answer(prompt, item['choices'])

            # Check correctness
            if predicted_answer == item['answer']:
                correct += 1

        accuracy = correct / total if total > 0 else 0

        return {
            'subject': subject,
            'accuracy': accuracy,
            'correct': correct,
            'total': total
        }
    def _build_prompt(self, item):
        """
        Build an MMLU-style prompt.
        """
        question = item['question']
        choices = item['choices']

        # Render the choices
        choice_labels = ['A', 'B', 'C', 'D', 'E']
        choice_str = '\n'.join([
            f"{label}. {choice}"
            for label, choice in zip(choice_labels[:len(choices)], choices)  # zip pairs iterables positionally
        ])

        prompt = f"{question}\n{choice_str}\nAnswer:"
        return prompt
    def _predict_answer(self, prompt, choices):
        """
        Predict the answer.

        Method: compute the log-probability of each choice label and pick the highest.
        """
        choice_labels = ['A', 'B', 'C', 'D', 'E'][:len(choices)]

        # Encode the prompt
        prompt_ids = self.tokenizer.encode(prompt, return_tensors='pt')

        # Log-probability of each choice
        choice_logprobs = []
        for label in choice_labels:
            # Encode the label.
            # ⚠️ Note: some tokenizers encode " A" as multiple tokens, in which case
            # label_ids[0] only covers the first token's logprob and may be inaccurate.
            # A more robust approach is to sum the logprobs over all label tokens,
            # or ensure single letters map to single tokens.
            label_ids = self.tokenizer.encode(f" {label}", add_special_tokens=False)

            # Conditional probability
            with torch.no_grad():
                # Concatenate prompt and label
                input_ids = torch.cat([prompt_ids, torch.tensor([label_ids])], dim=1)

                # Forward pass
                outputs = self.model(input_ids)
                logits = outputs.logits

                # Log-probability of the label token
                log_probs = F.log_softmax(logits[0, -2, :], dim=-1)
                label_logprob = log_probs[label_ids[0]].item()

            choice_logprobs.append(label_logprob)

        # Pick the highest-probability choice
        predicted_idx = choice_logprobs.index(max(choice_logprobs))
        return predicted_idx
    def evaluate_all(self, test_data_by_subject):
        """
        Evaluate all subjects.
        """
        results = {}
        all_correct = 0
        all_total = 0

        for subject in self.SUBJECTS:
            if subject in test_data_by_subject:
                result = self.evaluate_subject(subject, test_data_by_subject[subject])
                results[subject] = result
                all_correct += result['correct']
                all_total += result['total']

        # Overall accuracy
        overall_accuracy = all_correct / all_total if all_total > 0 else 0

        # Group by category
        stem_accuracy = self._compute_category_accuracy(results, self.STEM_SUBJECTS)
        humanities_accuracy = self._compute_category_accuracy(results, self.HUMANITIES_SUBJECTS)
        social_sciences_accuracy = self._compute_category_accuracy(results, self.SOCIAL_SCIENCES_SUBJECTS)
        other_accuracy = self._compute_category_accuracy(results, self.OTHER_SUBJECTS)

        return {
            'overall_accuracy': overall_accuracy,
            'stem_accuracy': stem_accuracy,
            'humanities_accuracy': humanities_accuracy,
            'social_sciences_accuracy': social_sciences_accuracy,
            'other_accuracy': other_accuracy,
            'subject_results': results
        }
STEM_SUBJECTS = [
'abstract_algebra', 'astronomy', 'college_biology', 'college_chemistry',
'college_computer_science', 'college_mathematics', 'college_physics',
'computer_security', 'conceptual_physics', 'electrical_engineering',
'elementary_mathematics', 'high_school_biology',
'high_school_chemistry', 'high_school_computer_science',
'high_school_mathematics', 'high_school_physics', 'high_school_statistics',
'machine_learning'
]
HUMANITIES_SUBJECTS = [
'formal_logic', 'high_school_european_history', 'high_school_us_history',
'high_school_world_history', 'international_law', 'jurisprudence',
'logical_fallacies', 'moral_disputes', 'moral_scenarios', 'philosophy',
'prehistory', 'professional_law', 'world_religions'
]
SOCIAL_SCIENCES_SUBJECTS = [
'econometrics', 'high_school_geography', 'high_school_government_and_politics',
'high_school_macroeconomics', 'high_school_microeconomics', 'high_school_psychology',
'human_sexuality', 'professional_psychology', 'public_relations',
'security_studies', 'sociology', 'us_foreign_policy'
]
OTHER_SUBJECTS = [
'anatomy', 'business_ethics', 'clinical_knowledge', 'college_medicine',
'global_facts', 'human_aging', 'management', 'marketing',
'medical_genetics', 'miscellaneous', 'nutrition', 'professional_accounting',
'professional_medicine', 'virology'
]
    def _compute_category_accuracy(self, results, subjects):
        """
        Compute per-category accuracy.
        """
        correct = sum(results[s]['correct'] for s in subjects if s in results)
        total = sum(results[s]['total'] for s in subjects if s in results)
        return correct / total if total > 0 else 0
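To feed MMLUEvaluator real data, the benchmark can be pulled from the Hugging Face Hub. The sketch below assumes the cais/mmlu dataset id and its question/choices/answer columns as published at the time of writing; verify against the hub page:

Python
from datasets import load_dataset

def load_mmlu_subject(subject="abstract_algebra", split="test"):
    """Convert HF cais/mmlu rows into the test_data format MMLUEvaluator expects."""
    ds = load_dataset("cais/mmlu", subject, split=split)
    return [
        {"question": row["question"], "choices": row["choices"], "answer": row["answer"]}
        for row in ds
    ]

# evaluator = MMLUEvaluator(model, tokenizer)
# result = evaluator.evaluate_subject("abstract_algebra", load_mmlu_subject())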
4.2 HumanEval (Code Generation)¶
Python
class HumanEvalEvaluator:
    """
    HumanEval evaluator.

    Tests a model's code-generation ability on 164 programming problems.
    """
def __init__(self, model, tokenizer):
self.model = model
self.tokenizer = tokenizer
    def evaluate_problem(self, problem, num_samples=1, temperature=0.2):
        """
        Evaluate a single problem.

        problem format:
        {
            'task_id': 'HumanEval/0',
            'prompt': 'def add(a, b):\n """Add two numbers."""\n',
            'entry_point': 'add',
            'canonical_solution': 'def add(a, b):\n return a + b',
            'test': 'def check(add):\n assert add(1, 2) == 3\n assert add(-1, 1) == 0'
        }
        """
        results = []

        for _ in range(num_samples):
            # Generate code
            generated_code = self._generate_code(problem['prompt'], temperature)

            # Run the tests
            passed = self._execute_test(generated_code, problem['entry_point'], problem['test'])

            results.append({
                'generated_code': generated_code,
                'passed': passed
            })

        # Compute Pass@k
        pass_at_k = self._compute_pass_at_k(results, num_samples)

        return {
            'task_id': problem['task_id'],
            'pass_at_1': pass_at_k['pass_at_1'],
            'results': results
        }
    def _generate_code(self, prompt, temperature):
        """
        Generate code.
        """
        inputs = self.tokenizer(prompt, return_tensors='pt')

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_length=512,
                temperature=temperature,
                do_sample=True,
                stop_strings=['\ndef', '\nclass', '\n#', '\nprint'],
                tokenizer=self.tokenizer
            )

        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract the generated code (strip the prompt)
        generated_code = generated_text[len(prompt):]
        return generated_code
    def _execute_test(self, generated_code, entry_point, test_code):
        """
        Run the tests.

        Warning: executing model-generated code is a security risk;
        do this inside a sandboxed environment in practice.
        """
        try:  # try/except catches exceptions so one bad sample doesn't crash the run
            # Execution environment
            exec_globals = {}

            # Execute the generated code
            exec(generated_code, exec_globals)

            # Check the function exists
            if entry_point not in exec_globals:
                return False

            # Run the test code
            exec_globals['candidate'] = exec_globals[entry_point]
            exec(test_code, exec_globals)

            return True
        except Exception:
            return False
    def _compute_pass_at_k(self, results, n, k=1):
        """
        Compute Pass@k.

        Pass@k = 1 - C(n-c, k) / C(n, k)
        where c is the number of passing samples out of n.
        """
        import math

        c = sum(1 for r in results if r['passed'])

        if n - c < k:
            return {'pass_at_1': 1.0}

        # Binomial coefficient
        def comb(n, k):
            if k > n:
                return 0
            return math.comb(n, k)

        pass_at_k = 1.0 - comb(n - c, k) / comb(n, k)
        return {'pass_at_1': pass_at_k}
    def evaluate_all(self, problems, num_samples_per_problem=1):
        """
        Evaluate all problems.
        """
        all_results = []

        for problem in problems:
            result = self.evaluate_problem(problem, num_samples_per_problem)
            all_results.append(result)

        # Overall Pass@k
        total_pass = sum(r['pass_at_1'] for r in all_results)
        overall_pass_at_1 = total_pass / len(all_results) if all_results else 0

        return {
            'overall_pass_at_1': overall_pass_at_1,
            'problem_results': all_results
        }
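The class above illustrates the mechanics; for reportable numbers the reference implementation is OpenAI's human-eval package, which computes the unbiased pass@k estimator and runs completions with (still imperfect) isolation. A sketch of its intended workflow; generate_completion is a hypothetical stand-in for your own sampling code:

Python
from human_eval.data import read_problems, write_jsonl

problems = read_problems()  # dict: {task_id: problem fields}
samples = [
    # generate_completion(...) is hypothetical: plug in your model's sampling here
    {"task_id": task_id, "completion": generate_completion(problems[task_id]["prompt"])}
    for task_id in problems
]
write_jsonl("samples.jsonl", samples)
# Then score from the shell with the package's entry point:
#   evaluate_functional_correctness samples.jsonl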
4.3 GSM8K (Mathematical Reasoning)¶
Python
class GSM8KEvaluator:
    """
    GSM8K evaluator.

    Tests mathematical reasoning on roughly 8,500 grade-school math word problems.
    """
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
    def evaluate_problem(self, problem, use_cot=True):
        """
        Evaluate a single problem.

        problem format:
        {
            'question': 'Janet has 5 apples. She buys 3 more. How many apples does she have now?',
            'answer': '8'
        }
        """
        # Build the prompt
        if use_cot:
            # Chain-of-Thought prompting
            prompt = f"Question: {problem['question']}\nLet's think step by step.\nAnswer:"
        else:
            prompt = f"Question: {problem['question']}\nAnswer:"

        # Generate an answer
        inputs = self.tokenizer(prompt, return_tensors='pt')

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_length=512,
                do_sample=False  # greedy decoding (temperature is ignored when sampling is off)
            )

        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Extract the generated answer
        generated_answer = self._extract_answer(generated_text[len(prompt):])

        # Check correctness
        correct = self._compare_answers(generated_answer, problem['answer'])

        return {
            'question': problem['question'],
            'generated_answer': generated_answer,
            'correct_answer': problem['answer'],
            'correct': correct,
            'full_generation': generated_text
        }
    def _extract_answer(self, generated_text):
        """
        Extract a numeric answer from generated text.
        """
        import re

        # Find numbers (handles decimals and negatives)
        numbers = re.findall(r'-?\d+\.?\d*', generated_text)  # re.findall returns all regex matches

        if numbers:
            # Return the last number (usually the final answer)
            return numbers[-1]
        return None
    def _compare_answers(self, generated, correct):
        """
        Compare answers.
        """
        try:
            # Numeric comparison first
            gen_num = float(generated)
            cor_num = float(correct)
            return abs(gen_num - cor_num) < 1e-6
        except (TypeError, ValueError):
            # Fall back to string comparison
            return str(generated).strip() == str(correct).strip()  # chained calls: strip removes whitespace
def evaluate_all(self, problems, use_cot=True):
"""
评估所有问题
"""
results = []
correct_count = 0
for problem in problems:
result = self.evaluate_problem(problem, use_cot)
results.append(result)
if result['correct']:
correct_count += 1
accuracy = correct_count / len(problems) if problems else 0
return {
'accuracy': accuracy,
'correct': correct_count,
'total': len(problems),
'problem_results': results
}
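One practical detail: in the released GSM8K data, the gold answer field contains the full worked solution, with the final number after a '####' marker, so pipelines normally extract that number first. A sketch following the published format:

Python
import re

def extract_gold_answer(answer_field: str) -> str:
    """GSM8K gold answers end with '#### <number>'; pull out that number."""
    match = re.search(r"####\s*(-?[\d,]+(?:\.\d+)?)", answer_field)
    return match.group(1).replace(",", "") if match else answer_field.strip()

print(extract_gold_answer("Janet has 5 + 3 = 8 apples.\n#### 8"))  # -> "8"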
5. Evaluation Frameworks and Practice¶
5.1 The lm-evaluation-harness Framework¶
Python
# Standardized evaluation with lm-evaluation-harness
"""
Install:
    pip install lm-eval

Usage example:
"""
import lm_eval
from lm_eval import evaluator
from lm_eval.models.huggingface import HFLM
def evaluate_with_harness(model_name, tasks, num_fewshot=0, batch_size=8):
"""
使用lm-evaluation-harness评估模型
Args:
model_name: 模型名称或路径
tasks: 任务列表,如['mmlu', 'hellaswag', 'arc_easy']
num_fewshot: few-shot示例数量
batch_size: 批大小
"""
    # Load the model
lm = HFLM(pretrained=model_name, batch_size=batch_size)
    # Run the evaluation
results = evaluator.simple_evaluate(
model=lm,
tasks=tasks,
num_fewshot=num_fewshot,
batch_size=batch_size,
device='cuda'
)
return results
# Example: evaluate several tasks
def example_evaluation():
model_name = "meta-llama/Llama-2-7b-hf"
    tasks = [
        'mmlu',           # multi-task language understanding
        'hellaswag',      # commonsense inference
        'arc_easy',       # science questions (easy)
        'arc_challenge',  # science questions (hard)
        'winogrande',     # pronoun disambiguation
        'piqa',           # physical commonsense
        'boolq',          # boolean question answering
    ]
results = evaluate_with_harness(
model_name=model_name,
tasks=tasks,
        num_fewshot=0,  # 0-shot evaluation
batch_size=8
)
    # Print results
for task, task_results in results['results'].items():
print(f"\n{task}:")
for metric, value in task_results.items():
if isinstance(value, float):
print(f" {metric}: {value:.4f}")
# example_evaluation()
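Before committing GPU-hours to a full run, it is worth a smoke test on a handful of examples. Recent lm-eval releases expose a limit argument on simple_evaluate for exactly this purpose; treat the exact signature as version-dependent and check your installed release:

Python
from lm_eval import evaluator
from lm_eval.models.huggingface import HFLM

# Smoke test: cap each task at 20 examples before launching the full evaluation.
# The `limit` argument exists in recent lm-eval versions; verify against your install.
lm = HFLM(pretrained="gpt2", batch_size=4)
smoke = evaluator.simple_evaluate(model=lm, tasks=["hellaswag"], limit=20)
print(smoke["results"]["hellaswag"])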
5.2 A Custom Evaluation Pipeline¶
Python
class ComprehensiveEvaluator:
    """
    Comprehensive evaluator.

    Bundles multiple metrics and benchmarks behind one interface.
    """
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer

        # Initialize the individual evaluators
        self.perplexity_evaluator = PerplexityEvaluator()
        self.bleu_evaluator = BLEUEvaluator()
        self.rouge_evaluator = ROUGEEvaluator()
        self.mmlu_evaluator = MMLUEvaluator(model, tokenizer)
        self.humaneval_evaluator = HumanEvalEvaluator(model, tokenizer)
        self.gsm8k_evaluator = GSM8KEvaluator(model, tokenizer)
    def run_full_evaluation(self, eval_config):
        """
        Run the full evaluation.

        eval_config format:
        {
            'perplexity': {
                'enabled': True,
                'test_texts': [...]
            },
            'mmlu': {
                'enabled': True,
                'test_data': {...}
            },
            'humaneval': {
                'enabled': True,
                'problems': [...]
            },
            ...
        }
        """
        results = {}

        # 1. Perplexity
        if eval_config.get('perplexity', {}).get('enabled', False):
            print("Evaluating Perplexity...")
            ppl, details = self.perplexity_evaluator.compute_perplexity(
                self.model,
                self.tokenizer,
                eval_config['perplexity']['test_texts']
            )
            results['perplexity'] = {
                'average': ppl,
                'details': details
            }

        # 2. MMLU
        if eval_config.get('mmlu', {}).get('enabled', False):
            print("Evaluating MMLU...")
            mmlu_results = self.mmlu_evaluator.evaluate_all(
                eval_config['mmlu']['test_data']
            )
            results['mmlu'] = mmlu_results

        # 3. HumanEval
        if eval_config.get('humaneval', {}).get('enabled', False):
            print("Evaluating HumanEval...")
            humaneval_results = self.humaneval_evaluator.evaluate_all(
                eval_config['humaneval']['problems']
            )
            results['humaneval'] = humaneval_results

        # 4. GSM8K
        if eval_config.get('gsm8k', {}).get('enabled', False):
            print("Evaluating GSM8K...")
            gsm8k_results = self.gsm8k_evaluator.evaluate_all(
                eval_config['gsm8k']['problems']
            )
            results['gsm8k'] = gsm8k_results

        # Produce the report
        report = self._generate_report(results)

        return results, report
def _generate_report(self, results):
"""
生成评估报告
"""
report = []
report.append("=" * 60)
report.append("Model Evaluation Report")
report.append("=" * 60)
# 困惑度
if 'perplexity' in results:
report.append(f"\nPerplexity: {results['perplexity']['average']:.2f}")
# MMLU
if 'mmlu' in results:
mmlu = results['mmlu']
report.append(f"\nMMLU:")
report.append(f" Overall: {mmlu['overall_accuracy']:.2%}")
report.append(f" STEM: {mmlu['stem_accuracy']:.2%}")
report.append(f" Humanities: {mmlu['humanities_accuracy']:.2%}")
report.append(f" Social Sciences: {mmlu['social_sciences_accuracy']:.2%}")
# HumanEval
if 'humaneval' in results:
report.append(f"\nHumanEval:")
report.append(f" Pass@1: {results['humaneval']['overall_pass_at_1']:.2%}")
# GSM8K
if 'gsm8k' in results:
report.append(f"\nGSM8K:")
report.append(f" Accuracy: {results['gsm8k']['accuracy']:.2%}")
report.append("\n" + "=" * 60)
return '\n'.join(report)
# Usage example
def example_comprehensive_evaluation():
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Load the model
    model = AutoModelForCausalLM.from_pretrained("gpt2")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    # Create the evaluator
    evaluator = ComprehensiveEvaluator(model, tokenizer)

    # Configure the evaluation
    eval_config = {
        'perplexity': {
            'enabled': True,
            'test_texts': [
                "The quick brown fox jumps over the lazy dog.",
                "Machine learning is a subset of artificial intelligence."
            ]
        }
        # other evaluations are configured analogously
    }

    # Run the evaluation
    results, report = evaluator.run_full_evaluation(eval_config)

    # Print the report
    print(report)

# example_comprehensive_evaluation()
6. Evaluation Best Practices¶
6.1 Evaluation Checklist¶
Python
class EvaluationChecklist:
    """
    Evaluation checklist.
    """
    CHECKLIST = {
        'Data preparation': [
            'Test data does not overlap with training data',
            'Test distribution matches the real deployment scenario',
            'Enough evaluation data (at least a few hundred examples)',
            'Edge cases and hard examples are included',
        ],
        'Metric selection': [
            'Metrics are relevant to the task',
            'More than one metric is used',
            'Both automatic metrics and human evaluation are considered',
            'Confidence intervals or standard deviations are reported',
        ],
        'Experimental setup': [
            'Random seeds are fixed',
            'Hyperparameter settings are recorded',
            'Results are averaged over multiple runs',
            'Ablation experiments are run',
        ],
        'Result analysis': [
            'Error cases are analyzed',
            'Performance is broken down by category',
            'Results are compared against baselines',
            'Results are visualized',
        ],
        'Reproducibility': [
            'Model checkpoints are saved',
            'Code versions are recorded',
            'Evaluation scripts are provided',
            'Evaluation data is published',
        ]
    }

    @classmethod  # @classmethod receives the class as its first argument
    def print_checklist(cls):
        """
        Print the checklist.
        """
        for category, items in cls.CHECKLIST.items():
            print(f"\n[{category}]")
            for i, item in enumerate(items, 1):
                print(f"  {i}. [ ] {item}")

    @classmethod
    def validate_evaluation(cls, evaluation_results):
        """
        Check that the evaluation results are reasonably complete.
        """
        warnings = []

        # Check that more than one metric is present
        if len(evaluation_results.keys()) < 2:
            warnings.append("Consider using more than one evaluation metric")

        # Check for statistical information
        has_stats = any(
            'std' in str(result) or 'confidence' in str(result).lower()
            for result in evaluation_results.values()
        )
        if not has_stats:
            warnings.append("Consider reporting statistical significance")

        return warnings
# Print the checklist
# EvaluationChecklist.print_checklist()
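The checklist asks for confidence intervals or standard deviations. For accuracy-style metrics, a nonparametric bootstrap over per-example scores is a simple way to get one; a minimal sketch:

Python
import random

def bootstrap_ci(per_example_scores, n_resamples=1000, alpha=0.05, seed=0):
    """Percentile-bootstrap CI for the mean of per-example 0/1 (or float) scores."""
    rng = random.Random(seed)
    n = len(per_example_scores)
    means = sorted(
        sum(rng.choices(per_example_scores, k=n)) / n for _ in range(n_resamples)
    )
    lo = means[int(alpha / 2 * n_resamples)]
    hi = means[int((1 - alpha / 2) * n_resamples) - 1]
    return lo, hi

# scores = [1, 0, 1, 1, 0, 1, 1, 0, 1, 1]
# print(bootstrap_ci(scores))  # e.g. roughly (0.4, 0.9) for this tiny sample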
6.2 Common Pitfalls and Solutions¶
Text Only
Common evaluation pitfalls
═══════════════════════════════════════════════════════════════════
Pitfall 1: Data leakage
├── Problem: test data appears in the training set
├── Detection: check n-gram overlap (see the sketch after this block)
└── Fix: strictly separate train/test splits
Pitfall 2: Wrong metric for the task
├── Problem: e.g. using BLEU to score code generation
├── Fix: pick task-appropriate metrics
└── Example: Pass@k for code, ROUGE for summaries
Pitfall 3: Too few samples
├── Problem: conclusions drawn from a handful of examples
├── Detection: compute confidence intervals
└── Fix: increase the number of test examples
Pitfall 4: Overfitting the test set
├── Problem: tuning repeatedly until the test score looks good
├── Fix: keep a genuinely held-out test set
└── Advice: touch the test set only once
Pitfall 5: Skipping error analysis
├── Problem: looking only at scores, never at failure cases
├── Fix: do detailed error analysis
└── Payoff: reveals the model's blind spots
═══════════════════════════════════════════════════════════════════
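Pitfall 1 recommends checking n-gram overlap. A crude but useful screen is to flag test examples that share any long n-gram with the training corpus; the 13-gram threshold below follows common practice (e.g. GPT-3's contamination analysis), but the exact n is a judgment call:

Python
def ngrams(text, n=13):
    tokens = text.split()
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

def flag_contaminated(test_texts, train_texts, n=13):
    """Return indices of test examples sharing any n-gram with the training data."""
    train_grams = set()
    for t in train_texts:
        train_grams |= ngrams(t, n)
    return [i for i, t in enumerate(test_texts) if ngrams(t, n) & train_grams]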
Summary¶
Metric Cheat Sheet¶
| Task type | Recommended metrics | Notes |
|---|---|---|
| Language modeling | Perplexity, BPC | lower is better |
| Machine translation | BLEU, chrF++ | higher is better |
| Summarization | ROUGE-1/2/L | recall-oriented |
| Question answering | EM, F1 | exact or partial match (see the sketch below) |
| Code generation | Pass@k | functional correctness |
| Math reasoning | Accuracy | answer correctness |
| Knowledge QA | Accuracy | MMLU and similar |
| Instruction following | Human evaluation | inherently subjective |
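For the question-answering row, EM and token-level F1 are conventionally computed after light normalization (lowercasing, stripping punctuation and articles), in the style of SQuAD's official script. A minimal sketch:

Python
import re
import string
from collections import Counter

def normalize(s):
    s = s.lower()
    s = "".join(ch for ch in s if ch not in string.punctuation)
    s = re.sub(r"\b(a|an|the)\b", " ", s)  # drop English articles
    return " ".join(s.split())

def exact_match(prediction, gold):
    return float(normalize(prediction) == normalize(gold))

def token_f1(prediction, gold):
    pred, gold_t = normalize(prediction).split(), normalize(gold).split()
    common = Counter(pred) & Counter(gold_t)
    overlap = sum(common.values())
    if overlap == 0:
        return 0.0
    p, r = overlap / len(pred), overlap / len(gold_t)
    return 2 * p * r / (p + r)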
Recommended Benchmarks¶
| Benchmark | Capability | Examples | Difficulty |
|---|---|---|---|
| MMLU | Multi-subject knowledge | 15,908 | High |
| HumanEval | Code generation | 164 | Medium |
| GSM8K | Math reasoning | ~8,500 | Medium |
| HellaSwag | Commonsense inference | 10,042 | Medium |
| ARC | Science QA | 7,787 | Medium-High |
| WinoGrande | Pronoun disambiguation | ~44,000 | Medium |
| TruthfulQA | Truthfulness | 817 | High |
Key Takeaways¶
- Evaluate along multiple dimensions: never rely on a single metric
- Statistical significance: report confidence intervals
- Error analysis: understand the model's failure modes
- Human verification: automatic metrics have limits
- Continuous monitoring: model performance drifts over time
Last updated: 2026-02-12 · Applies to: LLM tutorial v2026
Next up: Chapter 05 - Knowledge Distillation, covering model compression and knowledge transfer!