05 - LLM Data Engineering: SFT and Alignment Data
⚠️ Freshness note: this chapter references frontier models, pricing, and leaderboards that can change quickly between releases; always defer to the original papers, official release pages, and API documentation.
Study time: 8 hours | Difficulty: ⭐⭐⭐⭐ Upper-intermediate | Prerequisites: Data Engineering & Preprocessing (Ch01), Alignment Techniques (Ch04)
Chapter Goals
- Master the methodology for constructing SFT (instruction-tuning) data
- Learn to generate synthetic data with LLMs (Self-Instruct, Evol-Instruct)
- Understand the annotation workflow and quality control for RLHF/DPO preference data
- Get familiar with the engineering practice of data mixing, decontamination, and evaluation
📌 Relationship to Ch01: Ch01 covers pre-training data (Common Crawl cleaning, deduplication, tokenization); this chapter focuses on data engineering for the post-training stage (SFT + preference alignment).
1. SFT Data Construction
1.1 SFT Data Formats
```python
# Standard chat format (OpenAI-compatible)
sft_sample = {
    "messages": [
        {"role": "system", "content": "You are a professional Python programming assistant."},
        {"role": "user", "content": "How do I implement quicksort in Python?"},
        {"role": "assistant", "content": "Here is a Python implementation of quicksort:\n\n```python\ndef quicksort(arr):\n    if len(arr) <= 1:\n        return arr\n    pivot = arr[len(arr) // 2]\n    left = [x for x in arr if x < pivot]\n    middle = [x for x in arr if x == pivot]\n    right = [x for x in arr if x > pivot]\n    return quicksort(left) + middle + quicksort(right)\n```\n\nTime complexity: O(n log n) on average, O(n²) in the worst case."}
    ]
}

# Alpaca format (classic)
alpaca_sample = {
    "instruction": "Explain what gradient descent is",
    "input": "",  # optional extra input
    "output": "Gradient descent is an optimization algorithm..."
}

# ShareGPT format (multi-turn dialogue)
sharegpt_sample = {
    "conversations": [
        {"from": "human", "value": "Write me a poem about autumn"},
        {"from": "gpt", "value": "\"Autumn Rhyme\"\nGolden leaves drift down..."},
        {"from": "human", "value": "Can you make it more melancholy?"},
        {"from": "gpt", "value": "\"Autumn Lament\"\nFalling leaves announce the fall..."}
    ]
}
```
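The three formats are interconvertible; a minimal sketch (field names follow the samples above) that converts Alpaca and ShareGPT records into the OpenAI-compatible messages format:

```python
# Minimal converters into the OpenAI-compatible "messages" format.
def alpaca_to_messages(sample: dict) -> dict:
    """Fold the optional `input` field into the user turn."""
    user = sample["instruction"]
    if sample.get("input"):
        user += "\n\n" + sample["input"]
    return {"messages": [
        {"role": "user", "content": user},
        {"role": "assistant", "content": sample["output"]},
    ]}

def sharegpt_to_messages(sample: dict) -> dict:
    """Map ShareGPT role names onto OpenAI role names."""
    role_map = {"human": "user", "gpt": "assistant", "system": "system"}
    return {"messages": [
        {"role": role_map[turn["from"]], "content": turn["value"]}
        for turn in sample["conversations"]
    ]}
```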
1.2 Characteristics of High-Quality SFT Data
```text
Good SFT data:
├── Diverse: covers many task types (QA / coding / reasoning / writing / translation ...)
├── Graded complexity: spans difficulty levels (simple → multi-step reasoning)
├── Accurate: factually correct answers, runnable code
├── Well-formatted: structured output with Markdown / code blocks / lists
└── Uncontaminated: contains no benchmark data (benchmark contamination)

Bad SFT data:
├── Homogeneous: nothing but simple QA or translation
├── Wrong answers: fabricated facts, buggy code
├── Redundant: many duplicate or near-duplicate instructions
└── Too short: one-sentence answers with no depth
```
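Several of the "bad data" symptoms above can be screened cheaply with rule-based checks before spending any LLM-judge calls. A minimal sketch; the thresholds are illustrative assumptions, not established cutoffs:

```python
# Heuristic pre-filter for the failure modes listed above (illustrative thresholds).
def passes_heuristics(sample: dict) -> bool:
    answer = next(
        m["content"] for m in sample["messages"] if m["role"] == "assistant"
    )
    if len(answer) < 50:  # "too short": one-line answers usually lack depth
        return False
    words = answer.split()
    if words and len(set(words)) / len(words) < 0.3:  # heavy token repetition
        return False
    if answer.count("```") % 2 != 0:  # unclosed code fence → broken formatting
        return False
    return True
```

Samples that pass the cheap checks can then go on to the LLM-as-Judge scoring in Section 1.3.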
1.3 Manual Data Construction Workflow
```python
class SFTDataPipeline:
    """SFT data construction pipeline (sketch; the two _generate_* helpers are illustrative)."""

    def __init__(self, quality_model=None, gen_model=None):
        self.quality_model = quality_model  # LLM used for quality scoring
        self.gen_model = gen_model          # LLM used for generation (added so the sketch is self-contained)

    def construct_from_seed(self, seed_tasks: list, target_count: int):
        """Expand a dataset from seed tasks."""
        all_data = []
        for seed in seed_tasks:
            # 1. Generate instruction variants
            variants = self._generate_variants(seed, n=5)
            # 2. Generate a high-quality response for each variant
            for variant in variants:
                response = self._generate_response(variant)
                # 3. Quality filtering
                score = self._quality_score(variant, response)
                if score >= 4.0:  # 5-point scale; keep only 4 and above
                    all_data.append({
                        "messages": [
                            {"role": "user", "content": variant},
                            {"role": "assistant", "content": response}
                        ],
                        "quality_score": score,
                        "category": seed.get("category", "general")
                    })
            if len(all_data) >= target_count:
                break  # stop iterating over seeds once the target is reached
        return all_data[:target_count]

    def _generate_variants(self, seed, n=5) -> list:
        """Illustrative placeholder: ask an LLM for paraphrased/extended instructions."""
        prompt = f"Generate {n} varied rewrites of this instruction, one per line:\n{seed['instruction']}"
        return self.gen_model.invoke(prompt).content.strip().split("\n")[:n]

    def _generate_response(self, instruction) -> str:
        """Illustrative placeholder: answer the instruction with the generation model."""
        return self.gen_model.invoke(instruction).content

    def _quality_score(self, instruction, response) -> float:
        """LLM-as-Judge scoring."""
        prompt = f"""Rate the quality of the following answer on a 1-5 scale:

Instruction: {instruction}
Answer: {response}

Rubric:
- 5: perfect — accurate, detailed, well formatted
- 4: good — mostly correct, minor room for improvement
- 3: fair — small errors or lacks depth
- 2: poor — obvious errors
- 1: very poor — off-topic or completely wrong

Output only the numeric score:"""
        result = self.quality_model.invoke(prompt)
        try:  # guard against non-numeric judge output
            return float(result.content.strip())
        except ValueError:
            return 3.0
```
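A minimal usage sketch, assuming `quality_model`/`gen_model` are LangChain-style chat models that expose `.invoke(prompt)` and return a message with `.content` (the model choice and seed below are hypothetical):

```python
# Hypothetical usage: any chat model with an .invoke(prompt) -> message API works here.
from langchain_openai import ChatOpenAI  # assumption: langchain-openai is installed

pipeline = SFTDataPipeline(
    quality_model=ChatOpenAI(model="gpt-4o"),
    gen_model=ChatOpenAI(model="gpt-4o"),
)
seeds = [{"instruction": "Explain gradient descent", "category": "general_qa"}]
data = pipeline.construct_from_seed(seeds, target_count=100)
```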
2. Synthetic Data Generation
2.1 Self-Instruct
```python
"""Self-Instruct: use an LLM to generate its own instruction data (Wang et al., 2023)."""
import json
import random

from openai import OpenAI

client = OpenAI()

def self_instruct(seed_instructions: list, total=1000, batch_size=5):
    """Self-Instruct pipeline."""
    generated = list(seed_instructions)  # start from the seeds
    all_data = []
    while len(all_data) < total:
        # Randomly sample in-context examples
        examples = random.sample(generated, min(3, len(generated)))
        examples_text = "\n".join(f"- {ex}" for ex in examples)

        # Step 1: generate new instructions
        prompt = f"""Here are some example task instructions:
{examples_text}

Generate {batch_size} brand-new, diverse task instructions. Requirements:
1. Different from the examples, covering different domains
2. Mixed task types (analysis / writing / coding / math / translation ...)
3. Sufficiently complex

Output a JSON object: {{"instructions": ["...", "..."]}}"""
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}  # JSON mode requires prompting for a JSON *object*
        )
        new_instructions = json.loads(response.choices[0].message.content)  # JSON string -> Python object

        for instr in new_instructions.get("instructions", []):
            # Step 2: classify (is_classification -> different generation strategy)
            # Step 3: generate the answer
            answer_resp = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": instr}]
            )
            answer = answer_resp.choices[0].message.content
            # Step 4: filter (dedup + quality check)
            if not _is_duplicate(instr, generated) and len(answer) > 50:
                generated.append(instr)
                all_data.append({
                    "messages": [
                        {"role": "user", "content": instr},
                        {"role": "assistant", "content": answer}
                    ]
                })
    return all_data[:total]

def _is_duplicate(new_instr, existing, threshold=0.7):
    """The paper uses ROUGE-L; this simplified version uses character-set Jaccard similarity."""
    return any(_jaccard_similarity(new_instr, ex) > threshold for ex in existing)

def _jaccard_similarity(a, b):
    set_a, set_b = set(a), set(b)
    return len(set_a & set_b) / max(len(set_a | set_b), 1)
```
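The Jaccard shortcut above is crude; the original paper rejects any new instruction whose ROUGE-L against an existing one exceeds 0.7. A sketch using the `rouge-score` package (an external dependency; its default tokenizer splits on whitespace, so Chinese text would need a custom tokenizer):

```python
# Sketch: ROUGE-L-based dedup as in the Self-Instruct paper (threshold 0.7).
# Assumes the rouge-score package is installed: pip install rouge-score
from rouge_score import rouge_scorer

_scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)

def is_duplicate_rouge(new_instr: str, existing: list, threshold: float = 0.7) -> bool:
    """Reject an instruction whose ROUGE-L F1 against any existing one exceeds the threshold."""
    return any(
        _scorer.score(ex, new_instr)["rougeL"].fmeasure > threshold
        for ex in existing
    )
```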
2.2 Evol-Instruct (the WizardLM Method)
```python
"""Evol-Instruct: progressive instruction evolution (Xu et al., 2023)."""
# Reuses `random` and `client` from the Self-Instruct example above.

EVOLUTION_STRATEGIES = {
    "deepen": "Add constraints to the original instruction to make the problem more complex",
    "concretize": "Make an abstract instruction concrete with a specific scenario",
    "reasoning": "Require multi-step reasoning or comparative analysis",
    "breadth": "Starting from the original instruction, create an instruction on an entirely new topic",
}

def evol_instruct(seed_instruction: str, depth: int = 3) -> list:
    """Evolve a single instruction over multiple rounds."""
    evolved = [seed_instruction]
    current = seed_instruction
    for step in range(depth):
        strategy = random.choice(list(EVOLUTION_STRATEGIES.keys()))
        strategy_desc = EVOLUTION_STRATEGIES[strategy]
        prompt = f"""Evolve the following instruction.

Evolution strategy: {strategy_desc}
Original instruction: {current}

Requirements:
1. The evolved instruction must be more complex or more specific than the original
2. It must remain answerable (not so abstract that it cannot be answered)

Evolved instruction:"""
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": prompt}]
        )
        current = response.choices[0].message.content.strip()
        evolved.append(current)
    return evolved

# Example
seed = "Explain what machine learning is"
chain = evol_instruct(seed, depth=3)
# Step 0: "Explain what machine learning is"
# Step 1: "Explain the difference between supervised and unsupervised learning, with two industrial use cases for each"
# Step 2: "Compare the strengths and weaknesses of XGBoost and LightGBM for CTR prediction, across feature handling, training speed, and distributed support"
# Step 3: "Design a CTR prediction system that handles tens of millions of features with real-time updates and A/B testing; contrast the architecture of an XGBoost solution with a deep learning solution"
```
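WizardLM also discards failed evolutions (its "instruction eliminator" step). A minimal heuristic sketch; the concrete checks below are assumptions loosely modeled on the paper's criteria, not its exact rules:

```python
# Sketch of an evolution-failure filter, loosely following WizardLM's elimination step.
def is_failed_evolution(original: str, evolved: str) -> bool:
    if evolved == original:                      # no information gained
        return True
    if len(evolved) < 0.5 * len(original):       # suspiciously shorter than the source
        return True
    refusals = ("sorry", "as an ai", "cannot")   # the model refused instead of evolving
    if any(r in evolved.lower() for r in refusals):
        return True
    if evolved.lower().startswith(("sure", "here is")):  # copied chat-style preamble
        return True
    return False
```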
2.3 Data Mixing Strategy
```python
"""Data mixing: control the proportion of each data category."""
DATA_MIX = {
    # task type: (target ratio, max count)
    "general_qa": (0.20, 20000),              # general QA
    "coding": (0.25, 25000),                  # code generation / debugging
    "reasoning": (0.15, 15000),               # logical reasoning / math
    "creative_writing": (0.10, 10000),        # creative writing
    "instruction_following": (0.10, 10000),   # instruction following
    "summarization": (0.05, 5000),            # summarization
    "translation": (0.05, 5000),              # translation
    "roleplay": (0.05, 5000),                 # roleplay
    "safety": (0.05, 5000),                   # safety / refusals
}

def mix_datasets(category_data: dict, total_target: int) -> list:
    """Mix data according to the target ratios."""
    mixed = []
    for category, (ratio, max_count) in DATA_MIX.items():
        target = min(int(total_target * ratio), max_count)
        available = category_data.get(category, [])
        if len(available) < target:
            print(f"⚠️ {category}: only {len(available)} samples, target {target} — needs topping up")
            mixed.extend(available)
        else:
            mixed.extend(random.sample(available, target))
    random.shuffle(mixed)
    return mixed
```
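After mixing it is worth verifying the realized distribution, since capped categories (`max_count`) and shortfalls can skew the final ratios. A small sketch, assuming each sample carries the `category` field from Section 1.3 (`category_data` is a hypothetical mapping of category name to sample list):

```python
from collections import Counter

train_set = mix_datasets(category_data, total_target=50_000)
actual = Counter(s.get("category", "unknown") for s in train_set)
for cat, n in actual.most_common():
    print(f"{cat:>22}: {n:6d} ({n / len(train_set):.1%})")
```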
3. Preference Data and RLHF Annotation
3.1 Preference Data Formats
```python
# DPO/RLHF preference-pair format
preference_sample = {
    "prompt": "Explain quantum entanglement",
    "chosen": "Quantum entanglement is a special kind of correlation in quantum mechanics... (detailed, accurate answer)",
    "rejected": "Quantum entanglement is two particles linked by some faster-than-light signal... (inaccurate answer)",
    "chosen_score": 5,
    "rejected_score": 2,
}

# Multi-dimensional preference annotation
detailed_preference = {
    "prompt": "Write a Python web-scraping snippet",
    "response_a": "... (response A)",
    "response_b": "... (response B)",
    "annotations": {
        "helpfulness": {"winner": "a", "confidence": "high"},
        "correctness": {"winner": "a", "confidence": "high"},
        "harmlessness": {"winner": "tie", "confidence": "medium"},
        "verbosity": {"preference": "concise", "winner": "b"},
    },
    "overall_winner": "a",
    "annotator_id": "expert-003",
}
```
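For reference, the prompt/chosen/rejected layout above is what DPO trainers typically consume directly. A hedged sketch with Hugging Face TRL; the exact constructor arguments vary across TRL versions, and the base model name below is a placeholder, so check the TRL docs before running:

```python
# Sketch: feeding preference pairs to TRL's DPOTrainer (API may differ by TRL version).
from datasets import Dataset
from trl import DPOConfig, DPOTrainer

pairs = [preference_sample]  # list of {"prompt", "chosen", "rejected", ...}
train_dataset = Dataset.from_list([
    {"prompt": p["prompt"], "chosen": p["chosen"], "rejected": p["rejected"]}
    for p in pairs
])
trainer = DPOTrainer(
    model="Qwen/Qwen2.5-0.5B-Instruct",  # hypothetical base model
    args=DPOConfig(output_dir="dpo-out", beta=0.1),
    train_dataset=train_dataset,
)
trainer.train()
```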
3.2 Annotation Workflow Design
```python
class PreferenceAnnotationPipeline:
    """Preference-data annotation pipeline."""

    def __init__(self, model_a, model_b):
        self.model_a = model_a
        self.model_b = model_b

    def generate_pairs(self, prompts: list) -> list:
        """Generate one response pair per prompt."""
        pairs = []
        for prompt in prompts:
            resp_a = self.model_a.invoke(prompt)
            resp_b = self.model_b.invoke(prompt)
            # Randomly swap the order (mitigates position bias during annotation)
            if random.random() > 0.5:
                resp_a, resp_b = resp_b, resp_a
            pairs.append({
                "prompt": prompt,
                "response_a": resp_a,
                "response_b": resp_b,
            })
        return pairs

    def auto_annotate(self, pairs: list) -> list:
        """LLM-based auto-annotation (cuts cost, but still needs human spot checks)."""
        annotated = []
        for pair in pairs:
            prompt = f"""Compare the quality of the two answers below.

User question: {pair['prompt']}
Answer A: {pair['response_a']}
Answer B: {pair['response_b']}

Criteria (in priority order): correctness > helpfulness > safety > fluency
Output JSON: {{"winner": "a" or "b" or "tie", "reason": "brief justification"}}"""
            judge_response = client.chat.completions.create(
                model="gpt-4o",
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"}
            )
            result = json.loads(judge_response.choices[0].message.content)
            pair["overall_winner"] = result["winner"]
            pair["judge_reason"] = result["reason"]
            # Convert to chosen/rejected format
            if result["winner"] == "a":
                pair["chosen"] = pair["response_a"]
                pair["rejected"] = pair["response_b"]
            elif result["winner"] == "b":
                pair["chosen"] = pair["response_b"]
                pair["rejected"] = pair["response_a"]
            else:
                continue  # skip ties
            annotated.append(pair)
        return annotated

    def quality_control(self, annotated_data: list, sample_rate=0.1):
        """Annotation quality control."""
        sample_size = max(int(len(annotated_data) * sample_rate), 10)
        sample = random.sample(annotated_data, min(sample_size, len(annotated_data)))
        # Measure annotation agreement by re-annotating with a second pass
        agreements = 0
        for item in sample:
            re_annotation = self._re_annotate(item)
            if re_annotation == item["overall_winner"]:
                agreements += 1
        agreement_rate = agreements / len(sample)
        print(f"Annotation agreement: {agreement_rate:.2%} (sample of {len(sample)})")
        if agreement_rate < 0.7:
            print("⚠️ Agreement too low — review the annotation guidelines")
        return agreement_rate

    def _re_annotate(self, item) -> str:
        """Second-pass verdict; this simple version re-runs the same judge on a copy.
        (A stricter setup would use a different judge model.)"""
        result = self.auto_annotate([dict(item)])
        return result[0]["overall_winner"] if result else "tie"
```
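LLM judges are known to favor whichever answer appears first. A common mitigation (a sketch layered on the class above, not part of the original pipeline) is to judge each pair twice with the order swapped and keep only verdicts that survive the swap:

```python
from typing import Optional

# Sketch: order-swapped double judging to filter out position-biased verdicts.
def judge_with_swap(pipeline: PreferenceAnnotationPipeline, pair: dict) -> Optional[str]:
    """Keep a verdict only if it is consistent under answer-order swapping; else None."""
    first = pipeline.auto_annotate([dict(pair)])
    swapped = dict(pair)
    swapped["response_a"], swapped["response_b"] = pair["response_b"], pair["response_a"]
    second = pipeline.auto_annotate([swapped])
    if not first or not second:
        return None  # at least one pass returned a tie
    w1, w2 = first[0]["overall_winner"], second[0]["overall_winner"]
    # A consistent judge flips its label after the swap: a <-> b
    return w1 if (w1, w2) in {("a", "b"), ("b", "a")} else None
```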
4. Data Decontamination
4.1 Detecting Benchmark Contamination
```python
"""Prevent SFT data from containing benchmark samples (which inflates eval scores)."""
import hashlib
import json

from datasets import load_dataset

class DataDecontaminator:
    """Data decontaminator."""

    def __init__(self):
        # Fingerprint index built from common benchmarks
        self.benchmark_fingerprints = set()
        self._load_benchmarks()

    def _load_benchmarks(self):
        """Load benchmark sets and build the fingerprint index."""
        benchmarks = {
            "mmlu": ("cais/mmlu", "all", "test"),
            "gsm8k": ("gsm8k", "main", "test"),
            "humaneval": ("openai/openai_humaneval", None, "test"),
        }
        for name, (dataset_name, subset, split) in benchmarks.items():
            try:
                ds = load_dataset(dataset_name, subset, split=split)
                for item in ds:
                    # Extract text and fingerprint every 8-gram
                    text = str(item)
                    for ngram in self._ngrams(text, n=8):
                        self.benchmark_fingerprints.add(self._fingerprint(ngram))
            except Exception as e:
                print(f"Skipping {name}: {e}")

    @staticmethod
    def _fingerprint(ngram: str) -> str:
        """Hash each n-gram so the index stores fixed-size fingerprints, not raw text."""
        return hashlib.md5(ngram.encode("utf-8")).hexdigest()

    def _ngrams(self, text, n=8):
        """Generate word-level n-grams."""
        words = text.lower().split()
        return [" ".join(words[i:i + n]) for i in range(len(words) - n + 1)]

    def check_contamination(self, sft_data: list) -> tuple:
        """Check whether SFT samples overlap with benchmark data."""
        clean, contaminated = [], []
        for item in sft_data:
            # ensure_ascii=False keeps non-ASCII characters readable instead of \uXXXX escapes
            text = json.dumps(item, ensure_ascii=False)
            is_contaminated = any(
                self._fingerprint(ngram) in self.benchmark_fingerprints
                for ngram in self._ngrams(text, n=8)
            )
            (contaminated if is_contaminated else clean).append(item)
        rate = len(contaminated) / max(len(sft_data), 1)
        print(f"Contamination rate: {rate:.2%} ({len(contaminated)}/{len(sft_data)})")
        return clean, contaminated

# Usage (sft_data is the SFT dataset built earlier)
decontaminator = DataDecontaminator()
clean_data, dirty_data = decontaminator.check_contamination(sft_data)
```
5. Data Quality Evaluation
5.1 A Multi-Dimensional Evaluation Framework
```python
import math
import random
from collections import Counter

import numpy as np

class SFTDataQualityEvaluator:
    """SFT data quality evaluation."""

    def evaluate(self, dataset: list) -> dict:
        """Evaluate a dataset across several dimensions."""
        return {
            "size": len(dataset),
            "diversity": self._diversity_score(dataset),
            "complexity": self._complexity_distribution(dataset),
            "quality": self._quality_sampling(dataset),
            "format": self._format_check(dataset),
        }

    def _diversity_score(self, dataset):
        """Task-diversity analysis."""
        categories = [d.get("category", "unknown") for d in dataset]
        dist = Counter(categories)  # counts per category
        total = sum(dist.values())
        # Shannon entropy of the category distribution
        entropy = -sum((c / total) * math.log2(c / total) for c in dist.values())
        max_entropy = math.log2(len(dist)) if len(dist) > 1 else 0.0
        normalized = entropy / max_entropy if max_entropy > 0 else 0.0
        return {
            "distribution": dict(dist),
            "entropy": entropy,
            "normalized_entropy": normalized,
            "verdict": "good" if normalized > 0.7 else "needs improvement",
        }

    def _complexity_distribution(self, dataset):
        """Instruction-complexity (length) distribution."""
        lengths = []
        for d in dataset:
            user_msg = next(
                (m["content"] for m in d.get("messages", []) if m["role"] == "user"),
                ""
            )
            lengths.append(len(user_msg))
        arr = np.array(lengths)
        return {
            "mean_length": float(np.mean(arr)),
            "p25": float(np.percentile(arr, 25)),
            "p50": float(np.percentile(arr, 50)),
            "p75": float(np.percentile(arr, 75)),
            "p95": float(np.percentile(arr, 95)),
        }

    def _quality_sampling(self, dataset, sample_size=50):
        """Random sampling with LLM scoring."""
        sample = random.sample(dataset, min(sample_size, len(dataset)))
        scores = []
        for item in sample:
            messages = item.get("messages", [])
            user_msg = next((m["content"] for m in messages if m["role"] == "user"), "")
            asst_msg = next((m["content"] for m in messages if m["role"] == "assistant"), "")
            scores.append(self._llm_quality_score(user_msg, asst_msg))
        return {
            "mean_score": float(np.mean(scores)),
            "std": float(np.std(scores)),
            "below_3": sum(1 for s in scores if s < 3) / len(scores),
        }

    def _llm_quality_score(self, instruction, response) -> float:
        """Hook for LLM-as-Judge scoring; reuse the 1-5 rubric from SFTDataPipeline._quality_score."""
        raise NotImplementedError("plug in your judge model here")

    def _format_check(self, dataset) -> dict:
        """Basic schema check: every sample needs at least one user and one assistant turn."""
        bad = sum(
            1 for d in dataset
            if not {"user", "assistant"} <= {m.get("role") for m in d.get("messages", [])}
        )
        return {"invalid_samples": bad, "invalid_rate": bad / max(len(dataset), 1)}
```
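A usage sketch, continuing from the decontaminated data in Section 4 (it assumes you have first wired `_llm_quality_score` up to a judge model):

```python
import json

evaluator = SFTDataQualityEvaluator()
report = evaluator.evaluate(clean_data)
print(json.dumps(report, ensure_ascii=False, indent=2, default=str))
```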
6. Exercises
Coding Practice
- Beginner: use Self-Instruct to generate 100 SFT samples from 10 seed instructions
- Intermediate: implement Evol-Instruct data evolution and compare instruction complexity before and after
- Advanced: build an end-to-end SFT data production pipeline (generation → filtering → dedup → decontamination → quality evaluation → mixing)
Interview Questions
- Which matters more for SFT data, quality or quantity? Why did LIMA achieve strong results with only 1,000 samples?
- What are the main risks of synthetic data, and how can "model collapse" be mitigated?
- How do you detect and prevent benchmark contamination?
- How do you ensure annotator agreement when labeling preference data?
- How does the data-mixing strategy affect model capabilities?
📚 Key Papers
- Self-Instruct (Wang et al., 2023) — the automatic instruction-generation paradigm
- WizardLM / Evol-Instruct (Xu et al., 2023) — instruction evolution
- LIMA (Zhou et al., 2023) — the case for small amounts of high-quality data
- UltraChat (Ding et al., 2023) — large-scale, high-quality dialogue data
- Textbooks Are All You Need (Gunasekar et al., 2023) — training small models on synthetic data
Last updated: 2026-02-12 | Applies to: LLM Learning Tutorial v2026