📖 第12章:NLP实战项目¶
⚠️ 时效性说明:本章涉及前沿模型/价格/榜单等信息,可能随版本快速变化;请以论文原文、官方发布页和 API 文档为准。
学习时间:15小时 难度星级:⭐⭐⭐⭐⭐ 前置知识:全部前序章节 学习目标:通过三个完整项目掌握NLP系统的端到端开发
📎 交叉引用: - RAG系统构建完整教程 → LLM应用/RAG系统构建 - 高级RAG技术 → LLM应用/高级RAG技术 - Agent与RAG结合 → AI Agent开发实战/Agent基础与架构
📋 目录¶
项目一:智能客服系统¶
1. 需求分析¶
Text Only
项目目标:构建一个多轮对话智能客服系统
核心功能:
1. 意图识别:识别用户的查询意图(如:查订单、退款、咨询等)
2. 实体提取:提取关键信息(订单号、商品名、日期等)
3. 对话管理:管理多轮对话状态,实现上下文理解
4. FAQ检索:从知识库检索匹配的答案
5. 兜底策略:无法回答时转人工
技术选型:
- 意图识别: BERT微调 / 大模型
- 实体提取: BiLSTM-CRF / BERT-NER
- FAQ检索: 语义向量检索 (Sentence-BERT)
- 对话管理: 状态机 + LLM
2. 架构设计¶
Text Only
┌─────────────┐
│ 用户输入 │
└──────┬──────┘
│
┌──────▼──────┐
│ 对话管理器 │
│ (Session) │
└──────┬──────┘
│
┌────────────┼────────────┐
│ │ │
┌─────▼─────┐ ┌───▼───┐ ┌─────▼─────┐
│ 意图识别 │ │ NER │ │ FAQ检索 │
│ (BERT) │ │ │ │(向量检索) │
└─────┬─────┘ └───┬───┘ └─────┬─────┘
│ │ │
└────────────┼────────────┘
│
┌──────▼──────┐
│ 回复生成 │
│ (模板/LLM) │
└──────┬──────┘
│
┌──────▼──────┐
│ 输出回复 │
└─────────────┘
3. 代码实现¶
Python
import torch
import torch.nn as nn
from collections import defaultdict
import json
import numpy as np
# ========================
# 3.1 意图识别模块
# ========================
class IntentClassifier:
    """Rule-first intent recognizer: keyword voting with a fixed fallback label.

    `predict` returns (intent, confidence); confidence is a fixed 0.9 for a
    rule hit and 0.5 for the fallback "其他" (a model-based backstop would
    replace the fallback branch).
    """

    # Full label inventory; the index comments anticipate a model head.
    INTENT_LABELS = [
        "查询订单",  # 0
        "申请退款",  # 1
        "商品咨询",  # 2
        "投诉建议",  # 3
        "物流查询",  # 4
        "账户问题",  # 5
        "闲聊",      # 6
        "其他",      # 7
    ]

    def __init__(self):
        # Trigger keywords per intent; every keyword found in the text
        # counts as one vote for that intent.
        self.keyword_rules = {
            "查询订单": ["查订单", "订单状态", "订单号", "我的订单", "查看订单"],
            "申请退款": ["退款", "退货", "退钱", "不想要了", "退回"],
            "商品咨询": ["多少钱", "价格", "有没有", "推荐", "功能", "区别"],
            "投诉建议": ["投诉", "建议", "差评", "态度差", "不满意"],
            "物流查询": ["快递", "物流", "发货", "到哪了", "运费", "几天到"],
            "账户问题": ["密码", "登录", "注册", "账号", "绑定"],
        }

    def rule_based_predict(self, text):
        """Return the intent with the most keyword hits in *text*, else "其他"."""
        votes = {
            intent: hits
            for intent, words in self.keyword_rules.items()
            if (hits := sum(1 for w in words if w in text)) > 0
        }
        if not votes:
            return "其他"
        # Ties resolve to the first-inserted intent (dict order), as before.
        return max(votes, key=votes.get)

    def predict(self, text):
        """Return (intent, confidence), preferring the keyword rules."""
        matched = self.rule_based_predict(text)
        if matched != "其他":
            return matched, 0.9
        # Fallback: no rule matched.
        return "其他", 0.5
# ========================
# 3.2 实体提取模块
# ========================
class EntityExtractor:
    """Regex-based slot extractor for a handful of entity types.

    Pattern entries set to None (e.g. "商品名") are placeholders for a real
    NER model and are skipped during extraction.
    """

    def __init__(self):
        import re
        self.patterns = {
            "订单号": re.compile(r'[A-Z]?\d{10,20}'),
            "手机号": re.compile(r'1[3-9]\d{9}'),
            "日期": re.compile(r'\d{4}[-/年]\d{1,2}[-/月]\d{1,2}[日]?'),
            "金额": re.compile(r'\d+\.?\d*元'),
            "商品名": None,  # requires an NER model
        }

    def extract(self, text):
        """Return {entity_type: [matches]} for every type with at least one hit."""
        return {
            kind: found
            for kind, pat in self.patterns.items()
            if pat and (found := pat.findall(text))
        }
# ========================
# 3.3 FAQ检索模块
# ========================
class FAQRetriever:
"""基于向量的FAQ检索"""
def __init__(self):
self.faq_database = [
{"question": "如何查看订单状态", "answer": "您可以在"我的订单"页面查看所有订单的最新状态...", "category": "订单"},
{"question": "退款多久到账", "answer": "退款一般在1-3个工作日内原路退回...", "category": "退款"},
{"question": "如何修改收货地址", "answer": "在订单未发货前,您可以在订单详情页修改收货地址...", "category": "物流"},
{"question": "发票怎么开", "answer": "请在订单完成后,进入订单详情页点击"申请发票"...", "category": "其他"},
{"question": "商品质量有问题怎么办", "answer": "如遇到质量问题,可以拍照上传并申请售后...", "category": "投诉"},
]
self.vectors = None
def _simple_vectorize(self, text):
"""简单的字符向量化(实际中用Sentence-BERT)"""
# 字符级别的简单向量表示
vector = np.zeros(256)
for char in text:
idx = ord(char) % 256
vector[idx] += 1
norm = np.linalg.norm(vector) # np.linalg线性代数运算
return vector / norm if norm > 0 else vector
def build_index(self):
"""构建索引"""
self.vectors = np.array([ # np.array创建NumPy数组
self._simple_vectorize(item["question"])
for item in self.faq_database
])
def search(self, query, top_k=3):
"""检索最相关的FAQ"""
if self.vectors is None:
self.build_index()
query_vec = self._simple_vectorize(query)
scores = self.vectors @ query_vec
top_indices = np.argsort(scores)[::-1][:top_k]
results = []
for idx in top_indices:
results.append({
"question": self.faq_database[idx]["question"],
"answer": self.faq_database[idx]["answer"],
"score": float(scores[idx]),
})
return results
# ========================
# 3.4 对话管理器
# ========================
class DialogueManager:
    """Per-user multi-turn dialogue state, keyed by user id."""

    def __init__(self):
        # user_id -> session dict (state, intent, entities, history, turns)
        self.sessions = {}

    def get_session(self, user_id):
        """Return the user's session, creating a fresh one on first contact."""
        return self.sessions.setdefault(user_id, {
            "state": "idle",
            "intent": None,
            "entities": {},
            "history": [],
            "turn_count": 0,
        })

    def update_session(self, user_id, intent, entities, user_msg, bot_msg):
        """Record one completed turn: latest intent, merged entities, transcript."""
        session = self.get_session(user_id)
        session["intent"] = intent
        # Entities accumulate across turns; newer values overwrite older ones.
        session["entities"].update(entities)
        session["history"].extend([
            {"role": "user", "content": user_msg},
            {"role": "assistant", "content": bot_msg},
        ])
        session["turn_count"] += 1
# ========================
# 3.5 完整客服系统
# ========================
class CustomerServiceBot:
    """Customer-service bot wiring intent recognition, NER, FAQ retrieval
    and dialogue-state tracking into one `respond` entry point."""

    def __init__(self):
        self.intent_clf = IntentClassifier()
        self.entity_ext = EntityExtractor()
        self.faq_retriever = FAQRetriever()
        self.dialogue_mgr = DialogueManager()
        # Canned replies per intent; {实体类型} slots are filled from NER output.
        self.response_templates = {
            "查询订单": "好的,正在为您查询订单{订单号}的状态,请稍等...",
            "申请退款": "已收到您的退款申请,退款将在1-3个工作日内原路退回。",
            "商品咨询": "关于您咨询的商品,以下是相关信息...",
            "投诉建议": "非常抱歉给您带来不好的体验,我们会认真处理您的反馈。",
            "物流查询": "正在为您查询物流信息...",
            "账户问题": "关于账户问题,建议您...",
        }

    def respond(self, user_id, message):
        """Handle one user message.

        Returns a dict with keys: response (user-facing text), intent,
        confidence, entities.
        """
        import re  # local import: the module header does not import re

        # 1. Intent recognition
        intent, confidence = self.intent_clf.predict(message)
        # 2. Entity extraction
        entities = self.entity_ext.extract(message)
        # 3. Reply generation
        if intent in self.response_templates:
            response = self.response_templates[intent]
            # Fill each template slot with the first extracted value.
            for ent_type, ent_values in entities.items():
                response = response.replace(f"{{{ent_type}}}", ent_values[0])
            # Bug fix: slots that no entity filled (e.g. "{订单号}" when no
            # order id was extracted) used to leak verbatim to the user;
            # strip any leftover placeholder markers.
            response = re.sub(r"\{[^{}]+\}", "", response)
        else:
            # FAQ fallback: only answer when the best hit is similar enough.
            faq_results = self.faq_retriever.search(message)
            if faq_results and faq_results[0]["score"] > 0.3:
                response = faq_results[0]["answer"]
            else:
                # Last resort: hand off to a human agent.
                response = "抱歉,我暂时无法回答这个问题,正在为您转接人工客服..."
        # 4. Persist the completed turn into the session state.
        self.dialogue_mgr.update_session(user_id, intent, entities, message, response)
        return {
            "response": response,
            "intent": intent,
            "confidence": confidence,
            "entities": entities,
        }
# Demo: run the bot over a few representative queries and print the NLU trace.
bot = CustomerServiceBot()
test_queries = [
    "我想查一下订单12345678901的状态",
    "退款什么时候到账?",
    "你们有什么推荐的手机吗",
    "快递到哪了",
    "今天天气怎么样",
]
print("=" * 60)
print("智能客服系统测试")
print("=" * 60)
# One shared session id so dialogue state accumulates across turns.
for query in test_queries:
    result = bot.respond("user_001", query)
    print(f"\n用户: {query}")
    print(f"意图: {result['intent']} (置信度: {result['confidence']})")
    print(f"实体: {result['entities']}")
    print(f"回复: {result['response']}")
项目二:舆情分析系统¶
1. 需求分析¶
Text Only
项目目标:构建实时舆情监控和分析系统
核心功能:
1. 数据采集:从多个来源采集文本数据
2. 情感分析:判断文本的情感倾向
3. 热点检测:发现热门话题和趋势
4. 实体关联:分析事件涉及的实体和关系
5. 预警通知:负面舆情自动预警
技术栈:
- 情感分析: BERT微调 / 大模型API
- 话题聚类: LDA / K-Means / BERTopic
- 关键词抽取: TF-IDF / TextRank / KeyBERT
- 趋势检测: 时间序列分析
2. 架构设计¶
Text Only
┌──────────────────────────────────────────────────────┐
│ 舆情分析系统 │
├─────────┬──────────┬──────────┬─────────┬────────────┤
│ 数据采集 │ 文本分析 │ 热点检测 │ 情感分析 │ 预警通知 │
│ Layer │ Layer │ Layer │ Layer │ Layer │
├─────────┴──────────┴──────────┴─────────┴────────────┤
│ 数据存储层 │
│ (Elasticsearch / MySQL) │
└──────────────────────────────────────────────────────┘
3. 代码实现¶
Python
import re
import json
import math
from datetime import datetime, timedelta
from collections import Counter, defaultdict
# ========================
# 3.1 文本预处理
# ========================
class TextPreprocessor:
    """Cleaning, (toy) tokenization and stopword filtering for raw posts."""

    def __init__(self):
        # Single-character stopword set (a toy stand-in for a real list).
        self.stopwords = set("的了是在我有和人这中大为上个国就说也时"
                             "要出会可以到他们你我的不了也就是")

    def clean(self, text):
        """Strip URLs, @mentions and #hashtags#, then collapse whitespace."""
        for pattern in (r'https?://\S+', r'@\w+', r'#[^#]+#'):
            text = re.sub(pattern, '', text)
        return re.sub(r'\s+', ' ', text).strip()

    def tokenize(self, text):
        """Character-level tokenization (a real system would use jieba)."""
        return list(text)

    def remove_stopwords(self, tokens):
        """Drop tokens present in the stopword set."""
        return [tok for tok in tokens if tok not in self.stopwords]
# ========================
# 3.2 情感分析模块
# ========================
class SentimentAnalyzer:
    """Lexicon-based sentiment analyzer over single-character cue sets."""

    def __init__(self):
        # NOTE: set(...) over a string yields single CHARACTERS, not words —
        # this is the original toy design, kept as-is.
        self.positive_words = set("好棒赞喜欢优秀出色精彩满意推荐点赞值得感谢支持")
        self.negative_words = set("差烂垃圾失望难吃坑骗差评退款投诉恶心讨厌糟糕")
        self.negation_words = set("不没无别非莫勿")

    def lexicon_sentiment(self, text):
        """Return (label, confidence) from cue counts; negation swaps polarity."""
        pos_hits = sum(cue in text for cue in self.positive_words)
        neg_hits = sum(cue in text for cue in self.negative_words)
        # Any negation character flips the polarity counts once (crude).
        if any(cue in text for cue in self.negation_words):
            pos_hits, neg_hits = neg_hits, pos_hits
        total = max(pos_hits + neg_hits, 1)
        if pos_hits > neg_hits:
            return "正面", pos_hits / total
        if neg_hits > pos_hits:
            return "负面", neg_hits / total
        return "中性", 0.5

    def analyze(self, text):
        """Return a result dict with a truncated text preview."""
        label, confidence = self.lexicon_sentiment(text)
        preview = text if len(text) <= 50 else text[:50] + "..."
        return {
            "text": preview,
            "sentiment": label,
            "confidence": round(confidence, 3),
        }
# ========================
# 3.3 关键词提取 (TF-IDF)
# ========================
class KeywordExtractor:
    """Character-level TF-IDF keyword extraction."""

    def __init__(self):
        # char -> IDF weight, populated by fit()
        self.idf = {}

    def fit(self, documents):
        """Compute smoothed IDF per character over *documents*."""
        doc_count = len(documents)
        df = Counter()
        for doc in documents:
            # One document-frequency increment per unique char per doc.
            df.update(set(doc))
        self.idf = {
            ch: math.log(doc_count / (freq + 1))
            for ch, freq in df.items()
        }

    def extract_keywords(self, text, top_k=10):
        """Return the *top_k* (char, tf-idf) pairs, highest score first."""
        tf = Counter(text)
        length = len(text)
        scored = {
            ch: (cnt / length) * self.idf.get(ch, 0)
            for ch, cnt in tf.items()
        }
        ranked = sorted(scored.items(), key=lambda kv: kv[1], reverse=True)
        return ranked[:top_k]
# ========================
# 3.4 话题聚类
# ========================
class TopicDetector:
    """Frequency-based topic surfacing over character bigrams (toy version)."""

    def __init__(self, num_topics=5):
        # Upper bound on distinct topics surfaced (×3 candidates returned).
        self.num_topics = num_topics

    def detect_by_keywords(self, documents):
        """Return the most common bigrams across *documents* as (gram, count)."""
        counts = Counter()
        per_doc = []
        for text in documents:
            grams = [text[pos:pos + 2] for pos in range(len(text) - 1)]
            per_doc.append(grams)
            counts.update(grams)
        # Over-fetch 3x candidates so a caller can post-filter.
        return counts.most_common(self.num_topics * 3)
# ========================
# 3.5 舆情预警
# ========================
class AlertSystem:
    """Threshold-based alerting over a batch of sentiment results."""

    def __init__(self, negative_threshold=0.3, volume_threshold=100):
        # Fraction of negative items above which the sentiment alert fires.
        self.negative_threshold = negative_threshold
        # Absolute batch size above which the volume alert fires.
        self.volume_threshold = volume_threshold
        # Running log of every alert this instance has ever raised.
        self.alerts = []

    def check_alerts(self, analysis_results):
        """Return (and record) the alerts triggered by this batch."""
        total = len(analysis_results)
        if not analysis_results:
            return []
        negatives = sum(r["sentiment"] == "负面" for r in analysis_results)
        ratio = negatives / total
        stamp = datetime.now().strftime("%Y-%m-%d %H:%M")
        triggered = []
        # Alert 1: negative share exceeds the configured threshold.
        if ratio > self.negative_threshold:
            triggered.append({
                "type": "负面舆情占比过高",
                "level": "严重" if ratio > 0.5 else "警告",
                "value": f"{ratio:.1%}",
                "threshold": f"{self.negative_threshold:.1%}",
                "time": stamp,
            })
        # Alert 2: batch volume spike.
        if total > self.volume_threshold:
            triggered.append({
                "type": "舆情数量激增",
                "level": "注意",
                "value": str(total),
                "threshold": str(self.volume_threshold),
                "time": stamp,
            })
        self.alerts.extend(triggered)
        return triggered
# ========================
# 3.6 完整舆情分析系统
# ========================
class PublicOpinionSystem:
    """End-to-end opinion pipeline: clean → sentiment → keywords → topics → alerts."""

    def __init__(self):
        self.preprocessor = TextPreprocessor()
        self.sentiment = SentimentAnalyzer()
        self.keyword_ext = KeywordExtractor()
        self.topic_detector = TopicDetector()
        self.alert_system = AlertSystem()

    def analyze_batch(self, texts):
        """Analyze a batch of raw texts and return an aggregate report dict."""
        # 1. Normalize the raw texts.
        cleaned = [self.preprocessor.clean(raw) for raw in texts]
        # 2. Per-text sentiment.
        per_text = [self.sentiment.analyze(c) for c in cleaned]
        # 3. Sentiment label distribution.
        dist = Counter(item["sentiment"] for item in per_text)
        # 4. Fit IDF on this batch, then score the concatenated corpus.
        self.keyword_ext.fit(cleaned)
        top_words = self.keyword_ext.extract_keywords("".join(cleaned), top_k=10)
        # 5. Hot-topic candidates from bigram counts.
        hot_topics = self.topic_detector.detect_by_keywords(cleaned)
        # 6. Threshold alerts over the sentiment results.
        triggered = self.alert_system.check_alerts(per_text)
        return {
            "total_count": len(texts),
            "sentiment_distribution": dict(dist),
            "keywords": [{"word": w, "score": round(s, 4)} for w, s in top_words],
            "topics": [{"topic": t, "count": c} for t, c in hot_topics[:5]],
            "alerts": triggered,
            "sample_results": per_text[:5],
        }
# Demo: run the full opinion pipeline over a small hand-written batch.
system = PublicOpinionSystem()
test_texts = [
    "这个产品太好了,强烈推荐!",
    "差评,质量太差了,很失望",
    "一般般,不好不坏",
    "客服态度很好,问题很快就解决了",
    "退款困难,投诉无门,太坑了",
    "产品功能很强大,使用方便",
    "物流太慢了,等了一周才到",
    "性价比很高,值得购买",
    "包装破损,商品有瑕疵",
    "整体体验不错,好评",
]
result = system.analyze_batch(test_texts)
print("=" * 60)
print("舆情分析报告")
print("=" * 60)
print(f"\n总数量: {result['total_count']}")
print(f"情感分布: {result['sentiment_distribution']}")
print(f"\n热门关键词:")
for kw in result["keywords"][:5]:
    print(f"  {kw['word']}: {kw['score']}")
print(f"\n预警信息:")
for alert in result["alerts"]:
    print(f"  [{alert['level']}] {alert['type']}: {alert['value']} (阈值: {alert['threshold']})")
print(f"\n情感分析示例:")
for sample in result["sample_results"][:3]:
    print(f"  {sample['text']} → {sample['sentiment']} ({sample['confidence']})")
项目三:文档智能处理系统¶
1. 需求分析¶
Text Only
项目目标:构建文档智能处理和知识提取系统
核心功能:
1. 文档解析:处理PDF/Word/HTML等格式
2. 文档摘要:自动生成文档摘要
3. 关键信息提取:提取文档中的关键实体和事实
4. 文档分类:自动分类归档
5. 智能问答:基于文档内容的问答
6. 知识图谱构建:从文档中提取知识三元组
技术栈:
- 文档解析: pypdf(PyPDF2 的后继维护版)、python-docx
- 摘要: TextRank / BART / 大模型
- 信息抽取: BERT-NER + 关系抽取
- 问答: RAG (检索增强生成)
- 向量化: Sentence-BERT / BGE
2. 架构设计¶
Text Only
┌───────────────────────────────────────────┐
│ 文档智能处理系统 │
├────────────────────┬──────────────────────┤
│ 文档解析层 │ 智能处理层 │
│ ┌─────────────┐ │ ┌──────────────────┐ │
│ │ PDF解析 │ │ │ 自动摘要 │ │
│ │ Word解析 │ │ │ 关键信息提取 │ │
│ │ HTML解析 │ │ │ 文档分类 │ │
│ │ OCR识别 │ │ │ 智能问答(RAG) │ │
│ └─────────────┘ │ └──────────────────┘ │
├────────────────────┴──────────────────────┤
│ 知识管理层 │
│ ┌────────┐ ┌───────────┐ ┌──────────┐ │
│ │ 向量DB │ │ 知识图谱 │ │ 全文检索 │ │
│ └────────┘ └───────────┘ └──────────┘ │
└───────────────────────────────────────────┘
3. 代码实现¶
Python
import re
import math
import numpy as np
from collections import Counter, defaultdict
# ========================
# 3.1 文档解析模块
# ========================
class DocumentParser:
    """Splits raw text into paragraphs/sentences and collects basic stats."""

    def parse_text(self, text, doc_type="plain"):
        """Return a document dict: content, paragraphs, sentences, counts."""
        return {
            "content": text,
            "paragraphs": self._split_paragraphs(text),
            "sentences": self._split_sentences(text),
            "word_count": len(text),  # character count for CJK text
            "doc_type": doc_type,
        }

    def _split_paragraphs(self, text):
        """One paragraph per non-blank line."""
        return [line.strip() for line in text.split('\n') if line.strip()]

    def _split_sentences(self, text):
        """Split on CJK terminators; keep sentences longer than 2 chars."""
        pieces = (s.strip() for s in re.split(r'[。!?;\n]+', text))
        return [s for s in pieces if len(s) > 2]
# ========================
# 3.2 自动摘要
# ========================
class TextRankSummarizer:
    """Extractive summarization via TextRank (PageRank over sentence similarity)."""

    def __init__(self, damping=0.85, max_iter=100):
        # Standard PageRank damping factor and iteration cap.
        self.damping = damping
        self.max_iter = max_iter

    def _sentence_similarity(self, s1, s2):
        """Char-overlap similarity normalized by log sentence-set sizes."""
        chars_a, chars_b = set(s1), set(s2)
        if not chars_a or not chars_b:
            return 0
        shared = chars_a & chars_b
        # +1 keeps the denominator positive for single-char sentences.
        return len(shared) / (math.log(len(chars_a)) + math.log(len(chars_b)) + 1)

    def summarize(self, text, num_sentences=3):
        """Return the top *num_sentences* sentences, joined in original order."""
        sentences = [s.strip() for s in re.split(r'[。!?]+', text) if len(s.strip()) > 5]
        # Short texts: nothing to rank.
        if len(sentences) <= num_sentences:
            return "。".join(sentences) + "。"
        n = len(sentences)
        # Pairwise similarity matrix (diagonal left at zero).
        sim = np.array([
            [0.0 if i == j else self._sentence_similarity(sentences[i], sentences[j])
             for j in range(n)]
            for i in range(n)
        ])
        # Row-normalize so each row is a probability-like distribution.
        for i in range(n):
            row_sum = sim[i].sum()
            if row_sum > 0:
                sim[i] /= row_sum
        # Power iteration until the score vector converges.
        rank = np.ones(n) / n
        for _ in range(self.max_iter):
            updated = (1 - self.damping) / n + self.damping * sim.T @ rank
            if np.abs(updated - rank).sum() < 1e-6:
                break
            rank = updated
        # Top sentences, restored to document order for readability.
        chosen = sorted(np.argsort(rank)[::-1][:num_sentences])
        return "。".join(sentences[i] for i in chosen) + "。"
# ========================
# 3.3 关键信息提取
# ========================
class KeyInfoExtractor:
    """Regex extraction of dates, amounts, percentages, orgs, phones, emails."""

    def __init__(self):
        # Pre-compiled patterns, one per information type.
        self.patterns = {
            "日期": re.compile(r'\d{4}[-/年]\d{1,2}[-/月]\d{1,2}[日号]?'),
            "金额": re.compile(r'[\d,]+\.?\d*[元万亿美元]'),
            "百分比": re.compile(r'\d+\.?\d*%'),
            "组织": re.compile(r'[\u4e00-\u9fa5]{2,8}(?:公司|集团|机构|大学|学院|研究院|委员会|部门)'),
            "电话": re.compile(r'1[3-9]\d{9}|\d{3,4}-\d{7,8}'),
            "邮箱": re.compile(r'[\w.-]+@[\w.-]+\.\w+'),
        }

    def extract(self, text):
        """Return {type: unique matches} for every type with at least one hit."""
        # NOTE: list(set(...)) deduplicates but does not preserve match order.
        return {
            label: list(set(found))
            for label, pat in self.patterns.items()
            if (found := pat.findall(text))
        }
# ========================
# 3.4 文档分类
# ========================
class DocumentClassifier:
    """Keyword-count document classifier over a fixed category lexicon."""

    def __init__(self):
        # Cue words per category; each occurrence in the text adds one point.
        self.category_keywords = {
            "技术文档": ["API", "函数", "代码", "算法", "接口", "系统", "模块", "数据库"],
            "合同协议": ["甲方", "乙方", "合同", "协议", "条款", "责任", "期限", "违约"],
            "新闻报道": ["记者", "报道", "新华社", "消息", "据悉", "表示", "发布会"],
            "学术论文": ["摘要", "引言", "方法", "实验", "结论", "参考文献", "假设"],
            "财务报告": ["营收", "净利润", "同比", "环比", "毛利率", "资产", "负债"],
        }

    def classify(self, text):
        """Return (category, confidence); ("其他", 0.0) when nothing matches."""
        scores = {
            cat: sum(text.count(cue) for cue in cues)
            for cat, cues in self.category_keywords.items()
        }
        if not any(scores.values()):
            return "其他", 0.0
        best = max(scores, key=scores.get)
        total = sum(scores.values())
        # Confidence = share of all keyword hits belonging to the winner.
        confidence = scores[best] / total if total > 0 else 0
        return best, round(confidence, 3)
# ========================
# 3.5 简单RAG问答
# ========================
class SimpleRAQA:
    """Retrieval-augmented document QA skeleton.

    Documents are split into overlapping chunks, embedded with a toy
    character-count vectorizer, and the most similar chunks are assembled
    into an LLM prompt (the LLM call itself is left out of the demo).
    """

    def __init__(self):
        self.chunks = []         # text chunks of the indexed document
        self.chunk_vectors = []  # normalized vector per chunk

    def index_document(self, text, chunk_size=200, overlap=50):
        """Split *text* into overlapping chunks and vectorize them.

        Raises:
            ValueError: if overlap >= chunk_size. Bug fix: the slide step
                (chunk_size - overlap) would be <= 0, which previously either
                raised an opaque range() error (step 0) or silently produced
                an empty index (negative step).
        """
        if chunk_size <= overlap:
            raise ValueError("chunk_size must be greater than overlap")
        self.chunks = []
        step = chunk_size - overlap
        for i in range(0, len(text), step):
            chunk = text[i:i + chunk_size]
            # Skip tail fragments too short to be useful context.
            if len(chunk) > 20:
                self.chunks.append(chunk)
        self.chunk_vectors = [self._vectorize(c) for c in self.chunks]

    def _vectorize(self, text):
        """L2-normalized 128-bin char-count vector (stand-in for real embeddings)."""
        vector = np.zeros(128)
        for char in text:
            vector[ord(char) % 128] += 1
        norm = np.linalg.norm(vector)
        # Guard against the all-zero vector (empty text).
        return vector / norm if norm > 0 else vector

    def answer(self, question, top_k=3):
        """Retrieve context for *question* and build the LLM prompt.

        Returns a dict (context preview, prompt preview, retrieved chunk
        count), or the sentinel string "请先索引文档" when no document has
        been indexed (kept for backward compatibility).
        """
        if not self.chunks:
            return "请先索引文档"
        q_vec = self._vectorize(question)
        # Cosine similarity, since all vectors are L2-normalized.
        scores = [np.dot(q_vec, cv) for cv in self.chunk_vectors]
        top_indices = np.argsort(scores)[::-1][:top_k]
        context = "\n".join(self.chunks[i] for i in top_indices)
        # A production system would send this prompt to an LLM.
        prompt = f"""基于以下上下文回答问题。
上下文:
{context}
问题: {question}
回答:"""
        return {
            "context": context[:200] + "...",
            "prompt": prompt[:300] + "...",
            "retrieved_chunks": len(top_indices),
        }
# ========================
# 3.6 完整文档处理系统
# ========================
class DocIntelligenceSystem:
    """Document pipeline: parse → classify → summarize → extract → index for QA."""

    def __init__(self):
        self.parser = DocumentParser()
        self.summarizer = TextRankSummarizer()
        self.info_extractor = KeyInfoExtractor()
        self.classifier = DocumentClassifier()
        self.qa_system = SimpleRAQA()

    def process(self, text, doc_type="plain"):
        """Run the full pipeline over *text* and return a report dict."""
        # 1. Structural parse (paragraphs, sentences, counts).
        parsed = self.parser.parse_text(text, doc_type)
        # 2. Category label with confidence.
        label, label_conf = self.classifier.classify(text)
        # 3. Extractive three-sentence summary.
        abstract = self.summarizer.summarize(text, num_sentences=3)
        # 4. Regex-based key facts.
        facts = self.info_extractor.extract(text)
        # 5. Index last, so the QA system reflects this document.
        self.qa_system.index_document(text)
        return {
            "word_count": parsed["word_count"],
            "paragraph_count": len(parsed["paragraphs"]),
            "sentence_count": len(parsed["sentences"]),
            "category": label,
            "category_confidence": label_conf,
            "summary": abstract,
            "key_info": facts,
            "qa_ready": True,
        }
# Demo: process a short tech-news document end to end and print the report.
system = DocIntelligenceSystem()
test_doc = """
2024年3月15日,人工智能研究院发布了最新的自然语言处理技术报告。
报告指出,基于Transformer架构的大型语言模型在过去一年取得了显著进展。
GPT-4等模型的参数量已超过万亿级别,推理能力大幅提升。
在应用层面,大模型已广泛应用于智能客服、内容生成、代码辅助等领域。
据统计,2023年全球AI市场规模达到5000亿美元,同比增长35%。
中国AI市场规模约680亿元,预计2025年将突破1000亿元。
然而,大模型也面临着幻觉问题、推理成本高、数据隐私等挑战。
研究人员正在探索更高效的训练方法,如LoRA微调技术可将参数量减少98%。
未来,多模态能力和推理优化将成为重要研究方向。
"""
result = system.process(test_doc)
print("=" * 60)
print("文档智能处理报告")
print("=" * 60)
print(f"\n文档统计:")
print(f"  字数: {result['word_count']}")
print(f"  段落数: {result['paragraph_count']}")
print(f"  句子数: {result['sentence_count']}")
print(f"\n文档分类: {result['category']} (置信度: {result['category_confidence']})")
print(f"\n自动摘要:\n  {result['summary']}")
print(f"\n关键信息:")
for info_type, values in result["key_info"].items():
    print(f"  {info_type}: {values}")
# QA demo: the document was indexed by process(), so answer() retrieves context.
qa_result = system.qa_system.answer("大模型面临什么挑战?")
print(f"\n问答测试:")
print(f"  问题: 大模型面临什么挑战?")
print(f"  检索到 {qa_result['retrieved_chunks']} 个相关段落")
项目部署建议¶
Python
# Practical deployment recommendations for the three demo projects,
# grouped by environment, serving stack and monitoring.
deployment_guide = {
    "开发环境": {
        "Python": "3.9+",
        "GPU": "NVIDIA RTX 3080+ (微调需要)",
        "内存": "16GB+",
        "框架": "PyTorch + Transformers",
    },
    "部署方式": {
        "FastAPI": "用于API服务",
        "Docker": "容器化部署",
        "Gunicorn": "多进程服务",
        "Redis": "缓存和消息队列",
    },
    "监控": {
        "Prometheus": "性能监控",
        "Grafana": "可视化面板",
        "日志": "ELK Stack",
    },
}
print("项目部署建议:")
for category, items in deployment_guide.items():
    print(f"\n  {category}:")
    for k, v in items.items():
        print(f"    {k}: {v}")
✅ 自我检查清单¶
Text Only
□ 我理解智能客服系统的各个模块
□ 我能实现意图识别和实体提取
□ 我理解舆情分析的完整流程
□ 我能实现文档的自动摘要和分类
□ 我理解RAG问答系统的工作原理
□ 我知道如何部署NLP系统到生产环境
□ 我至少完成了一个项目的完整实现
📚 延伸阅读¶
恭喜! 完成本系列全部12章的学习。建议回顾学习指南制定复习计划,并继续学习第13章 对话系统与Agent化NLP和第14章 RAG系统设计。