05 - LLM 数据工程: SFT 与对齐数据¶
⚠️ 时效性说明:本章涉及前沿模型/价格/榜单等信息,可能随版本快速变化;请以论文原文、官方发布页和 API 文档为准。
⚠️ 核验说明(2026-04-20):本章中的 OpenAI 示例模型已统一按最新模型指南复核,默认示例更新为 gpt-5.4;轻量示例可切换为 gpt-5-mini。

学习时间: 8 小时 | 难度:⭐⭐⭐⭐ 中高级 | 前置知识:数据工程与预处理(Ch01)、对齐技术(Ch04)
本章目标¶
- 掌握 SFT (指令微调)数据的构建方法论
- 学会使用 LLM 进行合成数据生成( Self-Instruct 、 Evol-Instruct )
- 理解 RLHF/DPO 偏好数据的标注流程与质量控制
- 了解数据配比、去污染与评估的工程实践
📌 与 Ch01 的关系: Ch01 覆盖预训练数据( Common Crawl 清洗、去重、 Tokenization ),本章聚焦后训练阶段( SFT + 偏好对齐)的数据工程。
1. SFT 数据构建¶
1.1 SFT 数据格式¶
# Standard chat format (OpenAI-compatible): a list of role-tagged messages.
sft_sample = {
    "messages": [
        {"role": "system", "content": "你是一个专业的Python编程助手。"},
        {"role": "user", "content": "如何用Python实现快速排序?"},
        {"role": "assistant", "content": "以下是Python快速排序的实现:\n\n```python\ndef quicksort(arr):\n if len(arr) <= 1:\n return arr\n pivot = arr[len(arr) // 2]\n left = [x for x in arr if x < pivot]\n middle = [x for x in arr if x == pivot]\n right = [x for x in arr if x > pivot]\n return quicksort(left) + middle + quicksort(right)\n```\n\n时间复杂度:平均O(n log n),最坏O(n²)。"}
    ]
}
# Alpaca format (classic single-turn instruction tuning).
alpaca_sample = {
    "instruction": "解释什么是梯度下降",
    "input": "",  # optional extra input accompanying the instruction
    "output": "梯度下降是一种优化算法..."
}
# ShareGPT format (multi-turn conversations, "human"/"gpt" speaker tags).
sharegpt_sample = {
    "conversations": [
        {"from": "human", "value": "帮我写一首关于秋天的诗"},
        {"from": "gpt", "value": "《秋韵》\n金叶飘零..."},
        {"from": "human", "value": "能更忧伤一点吗?"},
        {"from": "gpt", "value": "《秋殇》\n落叶知秋..."}
    ]
}
💡 格式选择建议:新项目建议使用 OpenAI 兼容格式(messages),生态兼容性最好。Alpaca 格式适合单轮指令任务,ShareGPT 格式适合多轮对话场景。
1.2 高质量 SFT 数据的特征¶
好的SFT数据:
├── 多样性: 覆盖多种任务类型(问答/编码/推理/创作/翻译...)
├── 复杂度: 包含不同难度梯度(简单→复杂推理)
├── 准确性: 答案事实正确、代码可运行
├── 格式规范: 含Markdown/代码块/列表等结构化输出
└── 无污染: 不包含评测集数据(benchmark contamination)
差的SFT数据:
├── 同质化: 全是简单问答或翻译
├── 错误答案: 编造事实、代码bug
├── 冗余: 大量重复或相似指令
└── 过短: 回答只有一句话,缺乏深度
1.3 人工数据构建流程¶
import json
import random
from typing import Callable, Optional
class SFTDataPipeline:
    """SFT data construction pipeline: expand seed tasks into scored chat samples.

    For each seed task the pipeline (1) asks the LLM for instruction variants,
    (2) generates an answer per variant, and (3) keeps only answers whose
    LLM-as-judge score reaches ``quality_threshold``.
    """

    def __init__(
        self,
        llm_call: Callable[[str], str],
        quality_threshold: float = 4.0,
    ):
        """
        Args:
            llm_call: generic LLM call taking a prompt string and returning the
                reply string, e.g.::

                    lambda prompt: client.chat.completions.create(
                        model="gpt-5-mini",
                        messages=[{"role": "user", "content": prompt}]
                    ).choices[0].message.content

            quality_threshold: minimum judge score (on a 1-5 scale) a sample
                must reach to be kept.
        """
        self.llm_call = llm_call
        self.quality_threshold = quality_threshold

    def construct_from_seed(self, seed_tasks: list, target_count: int):
        """Expand seed tasks into at most ``target_count`` accepted samples."""
        all_data = []
        for seed in seed_tasks:
            # 1. Generate instruction variants for this seed.
            variants = self._generate_variants(seed, n=5)
            # 2. Generate an answer for each variant.
            for variant in variants:
                response = self._generate_response(variant)
                # 3. Quality gate: keep only samples at/above the threshold.
                score = self._quality_score(variant, response)
                if score >= self.quality_threshold:
                    all_data.append({
                        "messages": [
                            {"role": "user", "content": variant},
                            {"role": "assistant", "content": response}
                        ],
                        "quality_score": score,
                        "category": seed.get("category", "general")
                    })
                if len(all_data) >= target_count:
                    break
            # Fix: also stop iterating seeds once the target is reached. The
            # original only broke the inner loop, so every remaining seed kept
            # triggering (and paying for) variant/answer/judge LLM calls.
            if len(all_data) >= target_count:
                break
        return all_data[:target_count]

    def _generate_variants(self, seed: dict, n: int = 5) -> list:
        """Generate ``n`` instruction variants for a seed task.

        Falls back to the seed instruction itself when the LLM output is not
        valid JSON.
        """
        seed_instruction = seed.get("instruction", seed.get("task", ""))
        prompt = f"""请基于以下种子任务,生成{n}个不同的变体指令。
种子任务: {seed_instruction}
类别: {seed.get('category', 'general')}
要求:
1. 保持核心意图但改变表述方式
2. 覆盖不同难度和场景
3. 用中文
输出JSON格式: {{"variants": ["变体1", "变体2", ...]}}"""
        result = self.llm_call(prompt)
        try:
            parsed = json.loads(result)
            return parsed.get("variants", [seed_instruction])
        except json.JSONDecodeError:
            return [seed_instruction]

    def _generate_response(self, instruction: str) -> str:
        """Generate a detailed answer for one instruction."""
        prompt = f"""请对以下指令给出高质量、详细的回答。
要求准确、结构清晰、包含必要的代码示例或解释。
指令: {instruction}
回答:"""
        return self.llm_call(prompt)

    def _quality_score(self, instruction: str, response: str) -> float:
        """LLM-as-judge score on a 1-5 scale; returns 3.0 when unparsable."""
        import re  # local import kept so the module-level import block is untouched
        prompt = f"""请对以下回答的质量打分(1-5分):
指令: {instruction}
回答: {response}
评分标准:
- 5分: 完美,准确、详细、格式好
- 4分: 好,基本正确,有一些可改进处
- 3分: 一般,有小错误或不够深入
- 2分: 差,有明显错误
- 1分: 很差,答非所问或完全错误
只输出数字分数:"""
        result = self.llm_call(prompt)
        try:
            # Extract the first number so outputs like "4分" or "4.5" still parse.
            match = re.search(r'(\d+\.?\d*)', result.strip())
            if match:
                return float(match.group(1))
            return 3.0
        except (ValueError, AttributeError):
            return 3.0
2. 合成数据生成¶
2.1 Self-Instruct¶
"""Self-Instruct: 用LLM自己生成指令数据 (Wang et al., 2023)"""
import json
import random
from openai import OpenAI
def self_instruct(
    seed_instructions: list,
    total: int = 1000,
    batch_size: int = 5,
    client: "OpenAI | None" = None,
    model: str = "gpt-5.4",
):
    """Self-Instruct pipeline: iteratively grow an instruction dataset from seeds.

    Args:
        seed_instructions: seed instruction strings used as few-shot context.
        total: target number of (instruction, answer) samples to produce.
        batch_size: number of new instructions requested per LLM call.
        client: OpenAI client instance (a default one is created when None).
        model: model name used for both instruction and answer generation.

    Returns:
        List of samples in OpenAI chat ``messages`` format, at most ``total`` long.
    """
    if client is None:
        client = OpenAI()
    generated = list(seed_instructions)  # running pool: seeds + accepted new instructions
    all_data = []
    while len(all_data) < total:
        # Sample up to 3 in-context examples from the current pool.
        examples = random.sample(generated, min(3, len(generated)))
        examples_text = "\n".join(f"- {ex}" for ex in examples)
        # Step 1: ask the model for a batch of brand-new instructions (JSON mode).
        prompt = f"""以下是一些任务指令的示例:
{examples_text}
请生成{batch_size}个全新的、多样化的任务指令。要求:
1. 与示例不同,覆盖不同领域
2. 包含不同类型(分析/创作/编码/数学/翻译等)
3. 有足够的复杂度
4. 用中文
输出JSON格式: {{"instructions": ["指令1", "指令2", ...]}}"""
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            response_format={"type": "json_object"}
        )
        try:
            new_instructions = json.loads(response.choices[0].message.content)
        except json.JSONDecodeError:
            # Malformed JSON: drop this batch and retry with a fresh sample.
            # NOTE(review): a persistently failing model makes this loop spin forever.
            continue
        for instr in new_instructions.get("instructions", []):
            # Step 2: generate an answer for the accepted instruction.
            answer_resp = client.chat.completions.create(
                model=model,
                messages=[{"role": "user", "content": instr}]
            )
            answer = answer_resp.choices[0].message.content
            # Step 3: filter — drop near-duplicates and answers of 50 chars or fewer.
            if not _is_duplicate(instr, generated) and len(answer) > 50:
                generated.append(instr)
                all_data.append({
                    "messages": [
                        {"role": "user", "content": instr},
                        {"role": "assistant", "content": answer}
                    ]
                })
            if len(all_data) >= total:
                break
    return all_data[:total]
def _is_duplicate(new_instr: str, existing: list, threshold: float = 0.7) -> bool:
"""基于Jaccard相似度的去重"""
for ex in existing:
if _jaccard_similarity(new_instr, ex) > threshold:
return True
return False
def _jaccard_similarity(a: str, b: str) -> float:
"""计算两个字符串的Jaccard相似度(基于字符级n-gram)"""
# 使用字符级4-gram,比词级更适合中文
n = 4
set_a = set(a[i:i+n] for i in range(len(a) - n + 1))
set_b = set(b[i:i+n] for i in range(len(b) - n + 1))
intersection = len(set_a & set_b)
union = len(set_a | set_b)
return intersection / max(union, 1)
💡 Self-Instruct 关键改进:原始论文使用 ROUGE-L 去重。实践中,对中文数据建议使用字符级 n-gram Jaccard 相似度(如上方实现),因为中文分词质量会影响词级方法的效果。
2.2 Evol-Instruct ( WizardLM 方法)¶
"""Evol-Instruct: 渐进式指令进化 (Xu et al., 2023)"""
import random
from openai import OpenAI
# Evolution strategies: short name -> the (Chinese) directive interpolated into
# the evolution prompt. One key is sampled uniformly per evolution step.
EVOLUTION_STRATEGIES = {
    "deepen": "在原指令基础上增加约束条件,使问题更复杂",       # add constraints, raise complexity
    "concretize": "将抽象指令具体化为特定场景",                 # ground in a concrete scenario
    "reasoning": "要求多步推理或对比分析",                      # demand multi-step reasoning
    "breadth": "从原始指令出发,创建一个全新话题的指令",        # in-breadth: brand-new topic
}
def evol_instruct(
    seed_instruction: str,
    depth: int = 3,
    client: "OpenAI | None" = None,
    model: str = "gpt-5.4",
) -> list:
    """Evolve one instruction over several rounds (Evol-Instruct / WizardLM).

    Args:
        seed_instruction: the seed instruction.
        depth: number of evolution rounds.
        client: OpenAI client instance (a default one is created when None).
        model: model name used for every evolution step.

    Returns:
        The evolution chain, from the original instruction to the final result
        (length ``depth + 1``).
    """
    if client is None:
        client = OpenAI()
    evolved = [seed_instruction]
    current = seed_instruction
    for step in range(depth):
        # Pick a random strategy each round so the chain mixes depth and breadth.
        strategy = random.choice(list(EVOLUTION_STRATEGIES.keys()))
        strategy_desc = EVOLUTION_STRATEGIES[strategy]
        prompt = f"""请对以下指令进行进化。
进化策略: {strategy_desc}
原始指令: {current}
要求:
1. 进化后的指令必须比原始指令更复杂或更具体
2. 保持指令的可回答性(不要过于抽象无法回答)
3. 用中文
进化后的指令:"""
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}]
        )
        # Each round evolves the previous round's output, not the seed.
        current = response.choices[0].message.content.strip()
        evolved.append(current)
    return evolved
# Example usage: a plausible 3-step evolution chain.
if __name__ == "__main__":
    seed = "解释什么是机器学习"
    chain = evol_instruct(seed, depth=3)
    # Step 0: "解释什么是机器学习"
    # Step 1: "解释机器学习中监督学习和无监督学习的区别,并各举两个工业应用案例"
    # Step 2: "比较XGBoost和LightGBM在CTR预估场景下的优劣势,从特征处理、训练速度和分布式支持三个维度分析"
    # Step 3: "设计一个CTR预估系统,需要处理千万级特征、支持实时更新和A/B测试,对比XGBoost和深度学习方案的架构差异"
2.3 数据配比策略¶
"""数据配比: 控制不同类别数据的比例"""
# Task-type mix: category -> (target ratio, hard cap on sample count).
DATA_MIX = {
    "general_qa": (0.20, 20000),              # general Q&A
    "coding": (0.25, 25000),                  # code generation / debugging
    "reasoning": (0.15, 15000),               # logic / math reasoning
    "creative_writing": (0.10, 10000),        # creative writing
    "instruction_following": (0.10, 10000),   # instruction following
    "summarization": (0.05, 5000),            # summarization
    "translation": (0.05, 5000),              # translation
    "roleplay": (0.05, 5000),                 # roleplay
    "safety": (0.05, 5000),                   # safety / refusals
}

def mix_datasets(category_data: dict, total_target: int) -> list:
    """Assemble a mixed dataset according to the DATA_MIX ratios.

    For every category, up to ``ratio * total_target`` samples (capped by the
    category's max count) are drawn at random. When a category falls short, a
    warning is printed and all of its available samples are used instead. The
    combined result is shuffled before returning.
    """
    mixed = []
    for category, (ratio, cap) in DATA_MIX.items():
        quota = min(int(total_target * ratio), cap)
        pool = category_data.get(category, [])
        if len(pool) >= quota:
            mixed.extend(random.sample(pool, quota))
        else:
            print(f"⚠️ {category}: 只有{len(pool)}条,目标{quota}条,需要补充")
            mixed.extend(pool)
    random.shuffle(mixed)
    return mixed
📊 配比经验法则(基于 LLaMA-3、Qwen-2.5 等公开技术报告):
| 模型 | 代码比例 | 推理/数学 | 通用QA | 备注 |
| --- | --- | --- | --- | --- |
| LLaMA-3 | ~30% | ~15% | ~20% | 代码是最大单一类别 |
| Qwen-2.5 | ~25% | ~20% | ~15% | 强调STEM推理 |
| DeepSeek-V3 | ~30% | ~20% | ~15% | 数学推理占比高 |

代码和推理数据通常占比最高,因为这两类能力对模型整体智能水平提升最显著。
3. 偏好数据与 RLHF 标注¶
3.1 偏好数据格式¶
# DPO/RLHF preference-pair format: one prompt with a preferred ("chosen") and
# a dispreferred ("rejected") response, optionally with absolute scores.
preference_sample = {
    "prompt": "请解释量子纠缠现象",
    "chosen": "量子纠缠是量子力学中一种特殊的关联现象...(详细、准确的回答)",
    "rejected": "量子纠缠就是两个粒子用某种超光速信号连接...(不准确的回答)",
    "chosen_score": 5,
    "rejected_score": 2,
}
# Multi-dimension preference annotation: per-aspect winners plus an overall verdict.
detailed_preference = {
    "prompt": "写一段Python爬虫代码",
    "response_a": "...(回答A)",
    "response_b": "...(回答B)",
    "annotations": {
        "helpfulness": {"winner": "a", "confidence": "high"},
        "correctness": {"winner": "a", "confidence": "high"},
        "harmlessness": {"winner": "tie", "confidence": "medium"},
        "verbosity": {"preference": "concise", "winner": "b"},
    },
    "overall_winner": "a",
    "annotator_id": "expert-003",
}
3.2 标注流程设计¶
class PreferenceAnnotationPipeline:
    """Preference-pair annotation pipeline: generate → auto-judge → QC audit."""

    def __init__(
        self,
        llm_call_a: Callable,
        llm_call_b: Callable,
        judge_call: Callable,
    ):
        """
        Args:
            llm_call_a: model A call (prompt str → response str).
            llm_call_b: model B call.
            judge_call: judge-model call used for pairwise comparison.
        """
        self.llm_call_a = llm_call_a
        self.llm_call_b = llm_call_b
        self.judge_call = judge_call

    def _judge_prompt(self, prompt: str, response_a: str, response_b: str,
                      output_spec: str) -> str:
        """Build the pairwise-comparison prompt.

        Fix: this prompt was duplicated (drifting only in the output-format
        line) between ``auto_annotate`` and ``_re_annotate``; both now share
        this single builder and pass their own ``output_spec``.
        """
        return f"""请比较以下两个回答的质量。
用户问题: {prompt}
回答A: {response_a}
回答B: {response_b}
评判标准: 准确性 > 有用性 > 安全性 > 表达流畅度
输出JSON: {output_spec}"""

    def generate_pairs(self, prompts: list) -> list:
        """Generate one (response_a, response_b) pair per prompt."""
        pairs = []
        for prompt in prompts:
            resp_a = self.llm_call_a(prompt)
            resp_b = self.llm_call_b(prompt)
            # Randomly swap A/B so position bias cannot systematically favor
            # one source model during judging.
            if random.random() > 0.5:
                resp_a, resp_b = resp_b, resp_a
            pairs.append({
                "prompt": prompt,
                "response_a": resp_a,
                "response_b": resp_b,
            })
        return pairs

    def auto_annotate(self, pairs: list) -> list:
        """Annotate pairs with an LLM judge (cheap; still needs human spot checks).

        Ties and unparsable judge outputs are dropped from the result.
        """
        annotated = []
        for pair in pairs:
            prompt = self._judge_prompt(
                pair['prompt'], pair['response_a'], pair['response_b'],
                '{"winner": "a"或"b"或"tie", "reason": "简短理由"}',
            )
            judge_response = self.judge_call(prompt)
            try:
                result = json.loads(judge_response)
            except json.JSONDecodeError:
                continue
            pair["overall_winner"] = result.get("winner", "tie")
            pair["judge_reason"] = result.get("reason", "")
            # Convert to chosen/rejected (DPO) format.
            if result.get("winner") == "a":
                pair["chosen"] = pair["response_a"]
                pair["rejected"] = pair["response_b"]
            elif result.get("winner") == "b":
                pair["chosen"] = pair["response_b"]
                pair["rejected"] = pair["response_a"]
            else:
                continue  # skip ties
            annotated.append(pair)
        return annotated

    def quality_control(self, annotated_data: list, sample_rate: float = 0.1):
        """Re-judge a random sample and report agreement with the stored labels.

        NOTE(review): this re-uses the same ``judge_call``; for a genuine
        independent audit, construct the pipeline with a second judge model
        (the original docstring claimed one but the code never used it).
        """
        sample_size = max(int(len(annotated_data) * sample_rate), 10)
        sample = random.sample(annotated_data, min(sample_size, len(annotated_data)))
        agreements = 0
        for item in sample:
            re_annotation = self._re_annotate(item)
            if re_annotation == item["overall_winner"]:
                agreements += 1
        agreement_rate = agreements / max(len(sample), 1)
        print(f"标注一致性: {agreement_rate:.2%} (样本{len(sample)}条)")
        if agreement_rate < 0.7:
            print("⚠️ 一致性过低,建议检查标注标准")
        return agreement_rate

    def _re_annotate(self, item: dict) -> str:
        """Re-judge a single pair; returns "a"/"b"/"tie" ("tie" on parse failure)."""
        prompt = self._judge_prompt(
            item['prompt'], item['response_a'], item['response_b'],
            '{"winner": "a"或"b"或"tie"}',
        )
        result = self.judge_call(prompt)
        try:
            parsed = json.loads(result)
            return parsed.get("winner", "tie")
        except json.JSONDecodeError:
            return "tie"
⚠️ LLM-as-Judge 注意事项: - 位置偏差(Position Bias):模型倾向于选择排在前面的回答,因此必须随机打乱 A/B 顺序 - 冗长偏差(Verbosity Bias):模型倾向于选择更长的回答,即使内容质量相同 - 自我偏好(Self-Preference):某些模型倾向于偏好自己生成的回答 - 建议使用多个评判模型交叉验证,或结合人工抽检
4. 数据去污染¶
4.1 Benchmark Contamination 检测¶
"""防止SFT数据中包含评测集数据(导致虚假高分)"""
import hashlib
import json
from datasets import load_dataset
class DataDecontaminator:
    """Benchmark decontamination via word n-gram overlap plus exact-hash fingerprints."""

    def __init__(self, ngram_size: int = 8, use_hash: bool = True):
        """
        Args:
            ngram_size: word n-gram window size for overlap detection.
            use_hash: additionally use MD5 fingerprint matching (fast exact check).
        """
        self.ngram_size = ngram_size
        self.use_hash = use_hash
        self.benchmark_ngrams = set()   # word n-grams observed in benchmark items
        self.benchmark_hashes = set()   # MD5 fingerprints of canonicalized items
        self._load_benchmarks()

    @staticmethod
    def _canonical(item) -> str:
        """Serialize an item deterministically so fingerprints are comparable.

        Fix: the original hashed benchmark items via ``str(item)`` but SFT
        items via ``json.dumps(item)``; those representations never match
        (quote style, key order), so the hash path could never fire. Both
        sides now go through this single canonical serialization.
        """
        try:
            return json.dumps(item, ensure_ascii=False, sort_keys=True, default=str)
        except TypeError:
            # Non-serializable item (shouldn't happen with default=str): fall back.
            return str(item)

    @staticmethod
    def _fingerprint(text: str) -> str:
        """MD5 hex digest of the canonical text (fingerprint, not security)."""
        return hashlib.md5(text.encode()).hexdigest()

    def _load_benchmarks(self):
        """Build the fingerprint store from held-out evaluation sets."""
        benchmarks = {
            "mmlu": ("cais/mmlu", "all", "test"),
            "gsm8k": ("gsm8k", "main", "test"),
            "humaneval": ("openai/openai_humaneval", None, "test"),
        }
        for name, (dataset_name, subset, split) in benchmarks.items():
            try:
                ds = load_dataset(dataset_name, subset, split=split)
                for item in ds:
                    text = self._canonical(item)
                    # word n-gram fingerprints
                    for ngram in self._ngrams(text, self.ngram_size):
                        self.benchmark_ngrams.add(ngram)
                    # exact-hash fingerprint
                    if self.use_hash:
                        self.benchmark_hashes.add(self._fingerprint(text))
            except Exception as e:
                # Best effort: a benchmark that fails to load is skipped, not fatal.
                print(f"跳过 {name}: {e}")

    def _ngrams(self, text: str, n: int = 8) -> list:
        """Lowercased word n-grams of ``text``.

        NOTE(review): whitespace tokenization yields almost no n-grams for
        unsegmented Chinese text — confirm the benchmarks are English, or
        switch to character n-grams for Chinese data.
        """
        words = text.lower().split()
        return [" ".join(words[i:i+n]) for i in range(len(words) - n + 1)]

    def check_contamination(self, sft_data: list) -> tuple:
        """Split ``sft_data`` by benchmark overlap.

        Returns:
            (clean_data, contaminated_data) tuple.
        """
        clean = []
        contaminated = []
        for item in sft_data:
            text = self._canonical(item)
            is_contaminated = False
            # Method 1: any shared n-gram marks the item.
            for ngram in self._ngrams(text, self.ngram_size):
                if ngram in self.benchmark_ngrams:
                    is_contaminated = True
                    break
            # Method 2: exact-duplicate hash (supplementary).
            if not is_contaminated and self.use_hash:
                if self._fingerprint(text) in self.benchmark_hashes:
                    is_contaminated = True
            if is_contaminated:
                contaminated.append(item)
            else:
                clean.append(item)
        rate = len(contaminated) / max(len(sft_data), 1)
        print(f"污染率: {rate:.2%} ({len(contaminated)}/{len(sft_data)})")
        return clean, contaminated
# 使用示例
# decontaminator = DataDecontaminator()
# clean_data, dirty_data = decontaminator.check_contamination(sft_data)
💡 去污染最佳实践: - n-gram 大小选择:8-gram 是常用设置,太短(如 3-gram)误报率高,太长(如 13-gram)可能漏检改写后的数据 - 双重检测:结合 n-gram 和哈希匹配,提高召回率 - 定期更新:每次新增评测集后,应重新运行去污染 - 开源工具:lm-evaluation-harness 内置了 contamination 检测功能
5. 数据质量评估¶
5.1 多维度评估框架¶
import math
import random
from collections import Counter
from typing import Callable, Optional
import numpy as np
class SFTDataQualityEvaluator:
    """Multi-dimension SFT data quality evaluation (diversity, complexity, format, LLM score)."""

    def __init__(self, llm_call: Optional[Callable] = None):
        """
        Args:
            llm_call: LLM call function used for judge scoring; optional —
                when None, ``evaluate`` skips the LLM quality sampling.
        """
        self.llm_call = llm_call

    def evaluate(self, dataset: list) -> dict:
        """Run every available check over the dataset and return one report dict."""
        results = {
            "size": len(dataset),
            "diversity": self._diversity_score(dataset),
            "complexity": self._complexity_distribution(dataset),
            "format": self._format_check(dataset),
        }
        # LLM judge scoring only when a caller supplied llm_call.
        if self.llm_call is not None:
            results["quality"] = self._quality_sampling(dataset)
        return results

    def _diversity_score(self, dataset: list) -> dict:
        """Task diversity via the normalized entropy of the category distribution."""
        categories = [d.get("category", "unknown") for d in dataset]
        dist = Counter(categories)
        total = sum(dist.values())
        entropy = -sum(
            (c / total) * math.log2(c / total) for c in dist.values()
        )
        # Normalize by the max possible entropy for this many categories.
        max_entropy = math.log2(len(dist)) if len(dist) > 1 else 1.0
        normalized = entropy / max_entropy if max_entropy > 0 else 0
        return {
            "distribution": dict(dist),
            "num_categories": len(dist),
            "entropy": round(entropy, 3),
            "normalized_entropy": round(normalized, 3),
            "verdict": "好" if normalized > 0.7 else "需改善",
        }

    def _complexity_distribution(self, dataset: list) -> dict:
        """Instruction-length percentiles as a cheap complexity proxy."""
        lengths = []
        for d in dataset:
            user_msg = next(
                (m["content"] for m in d.get("messages", []) if m["role"] == "user"),
                ""
            )
            lengths.append(len(user_msg))
        if not lengths:
            # Robustness fix: np.percentile raises on an empty array (and
            # np.mean returns nan with a warning) — report zeros instead.
            return {"mean_length": 0.0, "p25": 0.0, "p50": 0.0, "p75": 0.0, "p95": 0.0}
        arr = np.array(lengths)
        return {
            "mean_length": float(np.mean(arr)),
            "p25": float(np.percentile(arr, 25)),
            "p50": float(np.percentile(arr, 50)),
            "p75": float(np.percentile(arr, 75)),
            "p95": float(np.percentile(arr, 95)),
        }

    def _format_check(self, dataset: list) -> dict:
        """Format-compliance check: counts per issue type plus an overall verdict."""
        issues = {
            "missing_messages": 0,
            "empty_user": 0,
            "empty_assistant": 0,
            "no_system_prompt": 0,
            "too_short_response": 0,  # assistant turns under 20 characters
        }
        for d in dataset:
            messages = d.get("messages", [])
            if not messages:
                issues["missing_messages"] += 1
                continue
            user_msgs = [m for m in messages if m["role"] == "user"]
            asst_msgs = [m for m in messages if m["role"] == "assistant"]
            system_msgs = [m for m in messages if m["role"] == "system"]
            if not user_msgs or not any(m.get("content", "").strip() for m in user_msgs):
                issues["empty_user"] += 1
            if not asst_msgs or not any(m.get("content", "").strip() for m in asst_msgs):
                issues["empty_assistant"] += 1
            if not system_msgs:
                issues["no_system_prompt"] += 1
            # Flag every assistant turn that is too short to be a real answer.
            for m in asst_msgs:
                if len(m.get("content", "")) < 20:
                    issues["too_short_response"] += 1
        total = len(dataset)
        return {
            "total_samples": total,
            "issues": issues,
            "issue_rate": sum(issues.values()) / max(total, 1),
            "verdict": "通过" if sum(issues.values()) / max(total, 1) < 0.05 else "需修复",
        }

    def _quality_sampling(self, dataset: list, sample_size: int = 50) -> dict:
        """Judge-score a random sample and summarize the score distribution."""
        sample = random.sample(dataset, min(sample_size, len(dataset)))
        scores = []
        for item in sample:
            messages = item.get("messages", [])
            user_msg = next((m["content"] for m in messages if m["role"] == "user"), "")
            asst_msg = next((m["content"] for m in messages if m["role"] == "assistant"), "")
            score = self._llm_quality_score(user_msg, asst_msg)
            scores.append(score)
        return {
            "sample_size": len(scores),
            "mean_score": round(float(np.mean(scores)), 2),
            "std": round(float(np.std(scores)), 2),
            "below_3_rate": round(sum(1 for s in scores if s < 3) / max(len(scores), 1), 3),
        }

    def _llm_quality_score(self, instruction: str, response: str) -> float:
        """Judge one pair with the LLM; returns a score clamped to [1, 5] (3.0 fallback)."""
        import re  # local import kept so the module-level import block is untouched
        prompt = f"""请对以下问答对的质量打分(1-5分):
问题: {instruction}
回答: {response}
评分标准:
- 5分: 完美回答,准确、详细、格式规范
- 4分: 好的回答,基本正确,有少量可改进处
- 3分: 一般,有小错误或不够深入
- 2分: 差,有明显错误
- 1分: 很差,答非所问
只输出一个数字:"""
        result = self.llm_call(prompt)
        try:
            match = re.search(r'(\d+\.?\d*)', result.strip())
            if match:
                score = float(match.group(1))
                return max(1.0, min(5.0, score))  # clamp to [1, 5]
            return 3.0
        except (ValueError, AttributeError):
            return 3.0
5.2 评估报告生成¶
def generate_quality_report(evaluation_results: dict) -> str:
    """Render the dict produced by SFTDataQualityEvaluator as a readable report."""
    rule = "=" * 60
    lines = [rule, "SFT 数据质量评估报告", rule]

    # Header: dataset size.
    lines.append(f"\n📊 数据集大小: {evaluation_results['size']} 条")

    # Diversity section.
    diversity = evaluation_results["diversity"]
    lines.append("\n🔄 多样性评估:")
    lines.append(f" - 类别数: {diversity['num_categories']}")
    lines.append(f" - 归一化熵: {diversity['normalized_entropy']:.3f} ({diversity['verdict']})")
    lines.append(f" - 分布: {diversity['distribution']}")

    # Complexity section.
    complexity = evaluation_results["complexity"]
    lines.append("\n📏 复杂度分布:")
    lines.append(f" - 平均指令长度: {complexity['mean_length']:.0f} 字符")
    lines.append(
        f" - P25/P50/P75/P95: {complexity['p25']:.0f}/{complexity['p50']:.0f}"
        f"/{complexity['p75']:.0f}/{complexity['p95']:.0f}"
    )

    # Format section: only non-zero issue counters are listed.
    fmt_result = evaluation_results["format"]
    lines.append(f"\n✅ 格式检查: {fmt_result['verdict']}")
    if fmt_result["issues"]:
        lines.append(f" - 问题率: {fmt_result['issue_rate']:.2%}")
        for issue_type, count in fmt_result["issues"].items():
            if count > 0:
                lines.append(f" - {issue_type}: {count}")

    # Optional LLM-judge section (present only when evaluate() ran with llm_call).
    if "quality" in evaluation_results:
        quality = evaluation_results["quality"]
        lines.append("\n🏆 LLM质量评分:")
        lines.append(f" - 抽样数: {quality['sample_size']}")
        lines.append(f" - 平均分: {quality['mean_score']:.2f} ± {quality['std']:.2f}")
        lines.append(f" - 低于3分比例: {quality['below_3_rate']:.1%}")

    lines.append("\n" + rule)
    return "\n".join(lines)
6. 练习题¶
代码实践¶
- 入门:用 Self-Instruct 方法从 10 条种子指令生成 100 条 SFT 数据
- 进阶:实现 Evol-Instruct 数据进化,对比进化前后的数据复杂度
- 高级:搭建完整的 SFT 数据生产流水线(生成→过滤→去重→去污染→质量评估→配比)
面试题¶
- SFT 数据质量和数量哪个更重要?为什么 LIMA 论文只用了 1000 条数据也能训出好效果?
- 合成数据的主要风险是什么?如何缓解"模型塌缩"( Model Collapse )问题?
- Benchmark Contamination 如何检测和预防?
- 偏好数据标注中,如何保证标注一致性?
- 数据配比策略对模型能力有什么影响?
💡 参考答案
#### 代码实践参考答案

**实践 1:Self-Instruct 从种子生成 SFT 数据**

import json
import random
from openai import OpenAI

# NOTE(review): module-level client — instantiating OpenAI() here requires a
# valid API key in the environment at import time.
client = OpenAI()

# Seed instructions for Self-Instruct (the full answer uses 10).
SEED_INSTRUCTIONS = [
    "写一首关于春天的五言绝句",
    "解释什么是机器学习",
    "用 Python 实现快速排序",
    # ... 10 seed instructions in total
]
def self_instruct_generate(seed_tasks, num_generate=100):
    """Self-Instruct: grow new instructions from seed tasks via few-shot prompting.

    Args:
        seed_tasks: list of seed instruction strings.
        num_generate: number of instructions to request overall.

    Returns:
        Up to ``num_generate`` dicts of the form {"instruction", "input", "output"}.
    """
    generated = []
    batch_size = 5
    for _ in range(0, num_generate, batch_size):  # batch index value is unused
        # Sample seeds as few-shot examples for each batch.
        examples = random.sample(seed_tasks, min(3, len(seed_tasks)))
        prompt = f"""请根据以下种子指令的风格和格式,生成 {batch_size} 条全新的指令。
要求:指令应涵盖不同领域(写作、编程、数学、推理、创意等),避免与种子指令重复。
种子指令示例:
{chr(10).join(f'- {ex}' for ex in examples)}
请输出 JSON 格式:[{{"instruction": "...", "input": "", "output": ""}}]"""
        response = client.chat.completions.create(
            # Consistency fix: the chapter's 2026-04-20 revision note states the
            # lightweight examples use gpt-5-mini; this answer still referenced
            # gpt-4o-mini.
            model="gpt-5-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.9,
        )
        try:
            batch = json.loads(response.choices[0].message.content)
            generated.extend(batch)
        except json.JSONDecodeError:
            # Skip unparsable batches; the final slice tolerates the shortfall.
            continue
    return generated[:num_generate]
results = self_instruct_generate(SEED_INSTRUCTIONS, num_generate=100)
print(f"生成 {len(results)} 条 SFT 数据")
# Quality filter: drop instructions that are too short to be meaningful.
filtered = [r for r in results if len(r.get("instruction", "")) > 10]
print(f"过滤后剩余 {len(filtered)} 条")
def evol_instruct(instruction, evolution_type="complexity"):
    """Evol-Instruct: rewrite an instruction to be harder, broader, or deeper.

    Args:
        instruction: the original instruction text.
        evolution_type: one of "complexity", "breadth", "depth".

    Returns:
        The evolved instruction text.

    Raises:
        KeyError: if ``evolution_type`` is not a known strategy.
    """
    prompts = {
        "complexity": f"请将以下指令改写为更复杂、更具挑战性的版本。"
                      f"可以增加约束条件、要求更深入的分析、或组合多个子任务。\n\n"
                      f"原始指令:{instruction}\n\n改写后的指令:",
        "breadth": f"请将以下指令改写为同一主题但不同角度的新指令。\n\n"
                   f"原始指令:{instruction}\n\n新角度的指令:",
        "depth": f"请将以下指令改写为需要更深层推理的版本。\n\n"
                 f"原始指令:{instruction}\n\n深化后的指令:",
    }
    response = client.chat.completions.create(
        # Consistency fix: chapter revision note (2026-04-20) standardizes the
        # lightweight examples on gpt-5-mini; this answer still referenced
        # gpt-4o-mini.
        model="gpt-5-mini",
        messages=[{"role": "user", "content": prompts[evolution_type]}],
        temperature=0.8,
    )
    return response.choices[0].message.content
# Compare instruction complexity before vs. after each evolution strategy.
original = "写一个 Python 函数计算斐波那契数列"
for evo_type in ["complexity", "breadth", "depth"]:
    evolved = evol_instruct(original, evo_type)
    print(f"[{evo_type}] {evolved}\n")
    print(f" Token 数变化: {len(original.split())} → {len(evolved.split())}")
import hashlib
from collections import defaultdict
class SFTDataPipeline:
    """End-to-end SFT data production pipeline.

    Stages (each returns ``self`` so they chain): generate → filter_quality →
    deduplicate → decontaminate → evaluate_quality → mix → export.
    """

    def __init__(self):
        self.data = []                 # working list of {"instruction", "input", "output"} dicts
        self.stats = defaultdict(int)  # per-stage counters and reports

    def generate(self, seed_data, num_target=1000):
        """Stage 1: data generation (delegates to self_instruct_generate)."""
        print(f"[生成] 目标 {num_target} 条,种子 {len(seed_data)} 条")
        # Generate via Self-Instruct (Evol-Instruct could slot in here too).
        generated = self_instruct_generate(seed_data, num_target)
        self.data.extend(generated)
        self.stats["generated"] = len(generated)
        return self

    def filter_quality(self, min_length=20, max_length=4096):
        """Stage 2: length-based quality filter.

        Keeps samples whose output length is strictly between the bounds and
        whose instruction is longer than 5 characters.
        """
        before = len(self.data)
        self.data = [
            d for d in self.data
            if min_length < len(d.get("output", "")) < max_length
            and len(d.get("instruction", "")) > 5
        ]
        self.stats["filtered"] = before - len(self.data)
        print(f"[过滤] {before} → {len(self.data)} 条")
        return self

    def deduplicate(self, threshold=0.85):
        """Stage 3: exact dedup by instruction MD5.

        Fix: the original docstring claimed MinHash near-duplicate removal, but
        the code does exact-hash matching only. ``threshold`` is kept for
        interface compatibility and is currently unused — a real MinHash pass
        would consume it.
        """
        before = len(self.data)
        seen_hashes = set()
        unique = []
        for d in self.data:
            h = hashlib.md5(d["instruction"].encode()).hexdigest()
            if h not in seen_hashes:
                seen_hashes.add(h)
                unique.append(d)
        self.data = unique
        self.stats["deduplicated"] = before - len(self.data)
        print(f"[去重] {before} → {len(self.data)} 条")
        return self

    def decontaminate(self, benchmark_texts):
        """Stage 4: drop samples sharing any word 10-gram with benchmark texts.

        Fix: the original used ``range(len(words) - 10)``, which skips the
        final 10-gram — a text of exactly 10 words produced no fingerprints at
        all and was never caught. Both ranges now use ``len(words) - 9``
        (i.e. ``len - n + 1`` with n = 10) to include it.
        """
        before = len(self.data)
        benchmark_ngrams = set()
        for text in benchmark_texts:
            words = text.split()
            for i in range(len(words) - 9):
                benchmark_ngrams.add(tuple(words[i:i+10]))
        clean = []
        for d in self.data:
            words = d["instruction"].split()
            overlap = sum(1 for i in range(len(words) - 9)
                          if tuple(words[i:i+10]) in benchmark_ngrams)
            if overlap == 0:
                clean.append(d)
        self.data = clean
        self.stats["decontaminated"] = before - len(self.data)
        print(f"[去污染] {before} → {len(self.data)} 条")
        return self

    def evaluate_quality(self):
        """Stage 5: report simple token-length statistics over surviving data."""
        scores = {
            "avg_instruction_length": sum(len(d["instruction"].split()) for d in self.data) / max(len(self.data), 1),
            "avg_output_length": sum(len(d["output"].split()) for d in self.data) / max(len(self.data), 1),
            "total": len(self.data),
        }
        self.stats["quality_report"] = scores
        print(f"[评估] {scores}")
        return self

    def mix(self, datasets_with_weights):
        """Stage 6: mix external datasets by weight (replaces ``self.data``)."""
        total = sum(w for _, w in datasets_with_weights)
        mixed = []
        for dataset, weight in datasets_with_weights:
            # Quota is proportional to weight, relative to the current data size.
            n = int(len(self.data) * weight / total)
            mixed.extend(random.sample(dataset, min(n, len(dataset))))
        self.data = mixed
        print(f"[混合] 最终数据量: {len(self.data)} 条")
        return self

    def export(self, path="sft_data.json"):
        """Export the final dataset as pretty-printed UTF-8 JSON; returns the data."""
        with open(path, "w", encoding="utf-8") as f:
            json.dump(self.data, f, ensure_ascii=False, indent=2)
        print(f"[导出] 已保存到 {path},共 {len(self.data)} 条")
        return self.data
# Usage example: chain all six pipeline stages (runs LLM calls and writes a file).
pipeline = SFTDataPipeline()
pipeline.generate(SEED_INSTRUCTIONS, 1000) \
    .filter_quality() \
    .deduplicate() \
    .decontaminate([]) \
    .evaluate_quality() \
    .export()
📚 关键论文¶
- Self-Instruct (Wang et al., 2023) — 自动指令生成范式
- WizardLM / Evol-Instruct (Xu et al., 2023) — 指令进化方法
- LIMA (Zhou et al., 2023) — 少量高质量数据的重要性
- UltraChat (Ding et al., 2023) — 大规模高质量对话数据
- Textbooks Are All You Need (Gunasekar et al., 2023) — 合成数据训练小模型(Phi-1)
- Magpie (Xu et al., 2024) — 基于模型对齐模板自动生成指令数据
- DCLM (Li et al., 2024) — 数据清洗系统化实践
最后更新日期: 2026-04-20 适用版本: LLM 学习教程 v2026