10. 最新模型与Agent技术(2025年更新)¶
⚠️ 时效性说明:本章涉及前沿模型/价格/榜单等信息,可能随版本快速变化;请以论文原文、官方发布页和 API 文档为准。
目录¶
- Claude 4系列模型
- OpenAI Codex与新一代模型
- OpenClaw(开源AI助手生态)
- Google Gemini 2.5系列
- Devin与AI编程Agent
- 2025年Agent技术趋势
- 实践项目:构建本地AI助手
1. Claude 4系列模型¶
1.1 Claude 4发布概述¶
发布时间:2025年5月22日 发布方:Anthropic 模型系列:Claude Opus 4 + Claude Sonnet 4
┌─────────────────────────────────────────────────────────────────┐
│ Claude 4系列模型 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ Claude Opus 4 │
│ ├── 定位:旗舰级编程与推理模型 │
│ ├── 特点:7小时连续编码能力 │
│ ├── 性能:公开评测显示其在多项编程任务中表现领先 │
│ └── 应用:复杂软件工程任务 │
│ │
│ Claude Sonnet 4 │
│ ├── 定位:高效平衡版本 │
│ ├── 特点:成本为Opus 4的1/3,速度提升2倍+ │
│ ├── 性能:接近Opus 4水平 │
│ └── 应用:日常开发、生产环境 │
│ │
└─────────────────────────────────────────────────────────────────┘
1.2 Claude 4核心能力¶
class Claude4Capabilities:
    """Static catalogue of Claude 4's headline capabilities.

    Tutorial data only — the strings quote public marketing/benchmark
    material and are not live metrics.
    """
    def __init__(self):
        # Three capability areas: coding, reasoning, agent behaviour.
        self.capabilities = {
            "coding": {
                "description": "编程能力",
                "features": [
                    "7小时不间断编码",
                    "复杂代码库理解",
                    "跨文件重构",
                    "测试驱动开发",
                    "代码审查与优化"
                ],
                # Benchmark figures are illustrative and version-dependent.
                "benchmarks": {
                    "SWE_bench": "公开榜单领先(随评测版本变化)",
                    "HumanEval": "90%+",
                    "Codeforces": "Expert级别"
                }
            },
            "reasoning": {
                "description": "高级推理",
                "features": [
                    "多步骤逻辑推理",
                    "数学问题求解",
                    "科学分析",
                    "指令层次结构理解"
                ],
                "improvements": "相比Claude 3.5提升显著"
            },
            "agent": {
                "description": "Agent能力",
                "features": [
                    "长期任务执行",
                    "工具调用链",
                    "自主规划",
                    "错误恢复"
                ],
                "context_window": "200K tokens"
            }
        }
    def compare_with_predecessors(self):
        """Return qualitative comparisons against Claude 3 Opus and GPT-4."""
        comparison = {
            "Claude 3 Opus": {
                "coding": "Claude 4 Opus > 3 Opus",
                "speed": "Claude 4 Sonnet 2x faster",
                "cost": "Sonnet 4: 1/3 cost of Opus 4"
            },
            "GPT-4": {
                "coding": "在多项公开编程评测中表现强劲",
                "reasoning": "Comparable or better",
                "instruction_following": "Better hierarchy understanding"
            }
        }
        return comparison
1.3 Claude 4架构创新¶
class Claude4Architecture:
    """Notes on Claude 4's presumed architecture.

    NOTE(review): every entry here is editorial speculation inferred from
    public information, not an official Anthropic statement.
    """
    def __init__(self):
        self.architecture_features = {
            "mixture_of_experts": {
                "description": "混合专家架构",
                "details": "可能采用稀疏MoE架构提升效率",
                "benefits": ["推理效率", "专业化能力"]
            },
            "extended_context": {
                "description": "扩展上下文",
                "window": "200K tokens",
                "use_cases": ["长代码库", "多文档分析", "长期对话"]
            },
            "tool_use": {
                "description": "增强工具使用",
                "capabilities": [
                    "代码执行",
                    "文件操作",
                    "API调用",
                    "Web浏览"
                ]
            },
            "safety_improvements": {
                "description": "安全增强",
                "features": [
                    "更好的指令层次理解",
                    "减少有害输出",
                    "Constitutional AI改进"
                ]
            }
        }
    def training_methodology(self):
        """Return a three-phase summary (pre-training / fine-tuning / scaling)
        of the presumed training recipe."""
        return {
            "pre_training": {
                "data": "大规模高质量文本",
                "compute": "显著增加",
                "duration": "更长训练周期"
            },
            "fine_tuning": {
                "rlhf": "人类反馈强化学习",
                "constitutional_ai": "宪法AI方法",
                "coding_specific": "编程专项优化"
            },
            "scaling_laws": {
                "compute": "遵循计算缩放定律",
                "data": "数据质量优先于数量",
                "efficiency": "效率优化"
            }
        }
1.4 Claude 4使用示例¶
# 使用Claude 4进行复杂编程任务
import anthropic
class Claude4CodingAssistant:
    """Async wrapper around the Anthropic API for code review, refactoring and
    long multi-iteration coding sessions.

    Fixes vs. the original:
    - uses ``anthropic.AsyncAnthropic`` — the sync ``Anthropic`` client's
      ``messages.create`` is not awaitable, so every ``await`` below failed;
    - the review/refactor prompts now actually embed the ``code`` argument
      (the original announced "Code:" but never inserted it);
    - the truncated ``_build_iteration_prompt`` f-string is completed and the
      previously missing ``_extract_code`` / ``_is_task_complete`` helpers
      (called by ``long_session_coding``) are defined.
    """

    def __init__(self, api_key: str):
        # Async client: all public methods below await messages.create().
        self.client = anthropic.AsyncAnthropic(api_key=api_key)
        self.model = "claude-opus-4-20250522"

    async def code_review(self, code: str, context: str = "") -> dict:
        """Review *code*; return the review text, model id and token usage."""
        prompt = f"""Please review the following code for:
1. Bugs and logic errors
2. Performance issues
3. Security vulnerabilities
4. Code style and best practices
5. Documentation needs
Context: {context}
Code:
{code}
Provide detailed feedback with specific line references."""
        response = await self.client.messages.create(
            model=self.model,
            max_tokens=4096,
            messages=[{"role": "user", "content": prompt}]
        )
        return {
            "review": response.content[0].text,
            "model": self.model,
            "tokens_used": response.usage.input_tokens + response.usage.output_tokens
        }

    async def refactor_code(self, code: str, requirements: str) -> dict:
        """Refactor *code* according to *requirements*; return the model's answer."""
        prompt = f"""Refactor the following code according to these requirements:
{requirements}
Original code:
{code}
Provide:
1. Refactored code
2. Explanation of changes
3. Benefits of the refactoring"""
        response = await self.client.messages.create(
            model=self.model,
            max_tokens=4096,
            messages=[{"role": "user", "content": prompt}]
        )
        return {
            "refactored_code": response.content[0].text,
            "model": self.model
        }

    async def long_session_coding(self, task_description: str,
                                  max_iterations: int = 10) -> dict:
        """Work on a task over multiple model rounds (simulates Claude 4's
        long-session coding), carrying code and history between rounds."""
        conversation_history = []
        current_code = ""
        for i in range(max_iterations):
            # Each round sees the task, the recent history and the latest code.
            prompt = self._build_iteration_prompt(
                task_description,
                conversation_history,
                current_code
            )
            response = await self.client.messages.create(
                model=self.model,
                max_tokens=4096,
                messages=[{"role": "user", "content": prompt}]
            )
            iteration_result = response.content[0].text
            conversation_history.append({
                "iteration": i,
                "action": iteration_result
            })
            # Carry forward whatever code the model produced this round.
            current_code = self._extract_code(iteration_result)
            # Stop early once the model signals completion.
            if self._is_task_complete(iteration_result):
                break
        return {
            "final_code": current_code,
            "iterations": conversation_history,
            "total_iterations": len(conversation_history)
        }

    def _build_iteration_prompt(self, task: str, history: list, current_code: str) -> str:
        """Assemble one round's prompt from the task, the last 3 actions and
        the current code."""
        history_str = "\n".join([
            f"Iteration {h['iteration']}: {h['action'][:200]}..."
            for h in history[-3:]  # keep only the 3 most recent rounds
        ])
        return f"""Task: {task}
Previous actions:
{history_str}
Current code:
{current_code}
Continue working on the task. Reply TASK_COMPLETE when it is fully done."""

    def _extract_code(self, text: str) -> str:
        """Return the contents of the last fenced ``` code block in *text*
        (empty string when no fenced block is present)."""
        import re  # local import keeps this tutorial block self-contained
        blocks = re.findall(r"```(?:\w+)?\n(.*?)```", text, re.DOTALL)
        return blocks[-1] if blocks else ""

    def _is_task_complete(self, text: str) -> bool:
        """True when the model emitted the TASK_COMPLETE marker."""
        return "TASK_COMPLETE" in text
2. OpenAI Codex与新一代模型¶
⚠️ 时效性提醒:本节内容描述的是2025年快速迭代中的产品,具体模型名称、版本号和功能细节可能已更新。请以 OpenAI官方文档 为准。OpenAI Codex编程Agent是真实产品,但底层模型名称请查阅最新API文档。
2.1 OpenAI Codex Agent¶
定位:云端AI编程Agent 核心特点: - 专为Agentic Coding优化的模型 - 集成到ChatGPT
┌─────────────────────────────────────────────────────────────────┐
│ OpenAI Codex架构 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ 模型层:codex-1 / GPT-5-Codex(按发布时间线) │
│ ├── 专为代码生成和Agent任务优化 │
│ ├── 支持多文件编辑 │
│ └── 理解复杂代码库结构 │
│ │
│ Agent层:Codex Agent │
│ ├── 自主任务规划 │
│ ├── 代码执行环境 │
│ ├── 测试与验证 │
│ └── 错误处理与恢复 │
│ │
│ 集成层:ChatGPT + API │
│ ├── 自然语言交互 │
│ ├── 代码审查界面 │
│ └── 版本控制集成 │
│ │
└─────────────────────────────────────────────────────────────────┘
2.2 Codex相关模型(按公开发布时间)¶
class CodexModelSeries:
    """Catalogue of publicly announced Codex-related models.

    Descriptive tutorial data based on public release information; model
    names and dates change quickly — defer to the official API docs.
    """
    def __init__(self):
        self.models = {
            "codex-1": {
                "description": "Codex 研究预览时期的核心模型",
                "release_date": "2025-05(公开产品发布)",
                "optimization": "Agentic coding",
                "features": [
                    "多文件编辑",
                    "代码库理解",
                    "测试生成",
                    "调试辅助"
                ]
            },
            "gpt-5-codex": {
                "description": "后续升级的 Codex 专用模型",
                "optimization": "Agentic coding",
                "features": [
                    "多文件编辑",
                    "代码库理解",
                    "测试生成",
                    "调试辅助"
                ]
            },
            "codex-mini-latest": {
                "description": "面向 CLI 的轻量低延迟模型",
                "features": [
                    "低延迟交互",
                    "本地终端工作流",
                    "代码问答与编辑"
                ]
            }
        }
    def capabilities(self):
        """Return qualitative capability notes grouped by coding / reasoning /
        multimodal (no hard numbers — they go stale too fast)."""
        return {
            "coding": {
                "gpt-5-codex": "公开产品中面向工程任务优化",
                "comparison": "与同类编程模型竞争激烈"
            },
            "reasoning": {
                "codex-1": "强调真实工程任务完成度",
                "improvements": "具体表现依任务和评测集而变"
            },
            "multimodal": {
                "features": ["以代码与工程任务为主", "结合工具执行"],
                "quality": "以官方更新为准"
            }
        }
2.3 Codex Agent实现原理¶
class CodexAgentImplementation:
    """Component map, workflow and safety mechanisms of the Codex agent
    (descriptive tutorial material, not runnable agent code)."""
    def __init__(self):
        # Four logical components of the agent runtime.
        self.components = {
            "task_planner": {
                "description": "任务规划器",
                "function": "将用户请求分解为可执行步骤",
                "algorithm": "基于LLM的规划算法"
            },
            "code_executor": {
                "description": "代码执行器",
                "environment": "隔离的沙箱环境",
                "safety": "资源限制和权限控制"
            },
            "file_manager": {
                "description": "文件管理",
                "operations": ["读取", "写入", "修改", "删除"],
                "version_control": "Git集成"
            },
            "test_runner": {
                "description": "测试运行器",
                "capabilities": [
                    "自动生成测试",
                    "运行测试套件",
                    "报告覆盖率"
                ]
            }
        }
    def workflow(self):
        """Return the six-step agent workflow as display text."""
        return """
1. 理解用户意图
- 解析自然语言描述
- 识别代码相关需求
2. 代码库分析
- 读取项目结构
- 理解依赖关系
- 识别相关文件
3. 任务规划
- 生成执行计划
- 确定修改范围
- 预估影响
4. 代码生成
- 编写新代码
- 修改现有代码
- 保持代码风格一致
5. 测试验证
- 生成测试用例
- 运行测试
- 修复失败测试
6. 结果交付
- 生成diff
- 提供说明文档
- 提交PR(可选)
"""
    def safety_mechanisms(self):
        """Return the three safety layers: sandbox, human approval, audit log."""
        return {
            "sandbox": {
                "description": "沙箱隔离",
                "features": [
                    "容器化执行",
                    "资源限制",
                    "网络隔离"
                ]
            },
            "approval": {
                "description": "人工确认",
                "triggers": [
                    "破坏性操作",
                    "敏感文件修改",
                    "外部API调用"
                ]
            },
            "audit": {
                "description": "审计日志",
                "records": [
                    "所有文件操作",
                    "代码变更",
                    "执行结果"
                ]
            }
        }
3. OpenClaw(开源AI助手生态)¶
⚠️ 内容审核提醒:本节介绍的开源AI助手项目属于快速演进的领域。项目名称、架构细节可能已变化,以实际GitHub仓库为准。以下侧重讲解本地AI助手的架构设计模式,这些设计思想适用于各类类似项目(如Open Interpreter、Aider等)。
3.1 开源AI助手概述¶
原名:Clawdbot → Moltbot → OpenClaw 开发者:Peter Steinberger (PSPDFKit Labs) 开源热度:GitHub星标超10万(开源约一个月内) 定位:开源个人AI助手平台
┌─────────────────────────────────────────────────────────────────┐
│ OpenClaw核心特性 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ 本地优先 (Local First) │
│ ├── 在用户设备上运行 │
│ ├── 数据完全由用户控制 │
│ └── 无需云端依赖 │
│ │
│ 多渠道支持 │
│ ├── WhatsApp │
│ ├── Telegram │
│ ├── iMessage │
│ └── 其他通讯软件 │
│ │
│ 系统级权限 │
│ ├── 真实系统访问权限 │
│ ├── 可执行文件操作 │
│ └── 控制本地应用程序 │
│ │
│ 长期记忆 │
│ ├── 记住数月前的决策 │
│ ├── 持续学习用户偏好 │
│ └── 上下文保持 │
│ │
└─────────────────────────────────────────────────────────────────┘
3.2 OpenClaw架构¶
class OpenClawArchitecture:
    """Layered architecture of OpenClaw: communication adapters, core engine,
    system interface and pluggable LLM backend (descriptive data)."""
    def __init__(self):
        self.layers = {
            "communication_layer": {
                "description": "通信层",
                "adapters": [
                    "WhatsApp Adapter",
                    "Telegram Adapter",
                    "iMessage Adapter",
                    "Slack Adapter"
                ],
                "protocol": "消息队列 + WebSocket"
            },
            "core_engine": {
                "description": "核心引擎",
                "components": {
                    "intent_parser": "意图解析器",
                    "task_planner": "任务规划器",
                    "memory_manager": "记忆管理器",
                    "action_executor": "动作执行器"
                }
            },
            "system_interface": {
                "description": "系统接口层",
                "capabilities": [
                    "文件系统访问",
                    "应用程序控制",
                    "网络请求",
                    "硬件交互"
                ]
            },
            "llm_backend": {
                "description": "LLM后端",
                "options": [
                    "本地模型 (Ollama)",
                    "OpenAI API",
                    "Anthropic API",
                    "Google Gemini"
                ]
            }
        }
    def memory_system(self):
        """Return the four memory tiers (short-term / long-term / episodic /
        procedural), loosely modelled on human memory taxonomy."""
        return {
            "short_term": {
                "description": "短期记忆",
                "scope": "当前对话",
                "storage": "内存"
            },
            "long_term": {
                "description": "长期记忆",
                "scope": "跨会话",
                "storage": "本地数据库",
                "retention": "数月甚至永久"
            },
            "episodic": {
                "description": "情景记忆",
                "content": "具体事件和决策",
                "retrieval": "语义搜索"
            },
            "procedural": {
                "description": "程序性记忆",
                "content": "技能和流程",
                "learning": "从执行中学习"
            }
        }
3.3 OpenClaw使用场景¶
class OpenClawUseCases:
    """Example usage scenarios for OpenClaw, grouped by audience
    (personal assistant / productivity / developer)."""
    def __init__(self):
        self.scenarios = {
            "personal_assistant": {
                "description": "个人助手",
                "examples": [
                    {
                        "user": "下班想看电视剧",
                        "action": "自动打开视频平台,搜索推荐内容"
                    },
                    {
                        "user": "明天早上8点有会议",
                        "action": "设置闹钟,准备会议资料"
                    },
                    {
                        "user": "记得提醒我喝水",
                        "action": "定时提醒,追踪饮水习惯"
                    }
                ]
            },
            "productivity": {
                "description": "生产力工具",
                "examples": [
                    {
                        "user": "整理桌面文件",
                        "action": "按类型/日期自动分类"
                    },
                    {
                        "user": "发送周报给团队",
                        "action": "生成报告,发送邮件"
                    }
                ]
            },
            "developer_tool": {
                "description": "开发者工具",
                "examples": [
                    {
                        "user": "运行测试并报告结果",
                        "action": "执行测试套件,格式化输出"
                    },
                    {
                        "user": "部署到生产环境",
                        "action": "执行部署脚本,监控状态"
                    }
                ]
            }
        }
    def example_interaction(self):
        """Return a canned sample dialogue illustrating a typical request."""
        return """
用户: "帮我准备明天的演示"
OpenClaw:
1. 检查日历获取会议信息
2. 查找相关文档和资料
3. 打开演示软件
4. 准备演示文稿大纲
5. 设置提醒
执行结果:
✅ 已找到明天14:00的产品评审会议
✅ 已收集相关技术文档
✅ 已在Keynote中创建新演示文稿
✅ 已设置13:30的提醒
需要我帮您完善演示内容吗?
"""
3.4 OpenClaw技术实现¶
# OpenClaw核心实现示例
class OpenClawCore:
    """OpenClaw core engine.

    Pipeline per message: memory recall -> intent parsing -> planning ->
    action execution (with memory writes) -> reply generation.
    Depends on LongTermMemory, LLMBackend, SystemInterface and
    CommunicationManager defined elsewhere in the project.
    """
    def __init__(self, config: dict):
        self.config = config
        self.memory = LongTermMemory(config['memory_db'])
        self.llm = LLMBackend(config['llm_provider'])
        self.system = SystemInterface()
        self.communication = CommunicationManager()
    async def handle_message(self, message: str, user_id: str,
                             platform: str) -> str:
        """Process one incoming message and return the assistant's reply.

        NOTE(review): ``platform`` is accepted but unused in the visible body;
        presumably reserved for adapter-specific handling — confirm upstream.
        """
        # 1. Recall memories semantically related to the current message.
        context = await self.memory.retrieve_relevant(
            query=message,
            user_id=user_id,
            limit=5
        )
        # 2. Parse user intent (history context improves accuracy).
        intent = await self.llm.parse_intent(
            message=message,
            context=context
        )
        # 3. Build an execution plan from the intent and the available tools.
        plan = await self.llm.create_plan(
            intent=intent,
            available_tools=self.system.get_tools()
        )
        # 4. Execute each planned action in order, persisting every result.
        results = []
        for action in plan.actions:
            result = await self.system.execute(action)
            results.append(result)
            # Record the action/result pair so future requests can recall it.
            await self.memory.store_action(
                user_id=user_id,
                action=action,
                result=result
            )
        # 5. Compose the final reply from the plan and its outcomes.
        response = await self.llm.generate_response(
            original_message=message,
            plan=plan,
            results=results
        )
        return response
class LongTermMemory:
    """Vector-store-backed long-term memory.

    Uses a SentenceTransformer model to embed action summaries and a
    VectorDatabase (project type) for similarity search.
    """
    def __init__(self, db_path: str):
        self.db = VectorDatabase(db_path)
        self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
    async def store_action(self, user_id: str, action: dict,
                           result: dict):
        """Embed an action/result pair and persist it for semantic retrieval.

        Assumes ``action`` carries 'type' and 'description' keys (both are
        read below) — raises KeyError otherwise.
        """
        memory_entry = {
            "user_id": user_id,
            "timestamp": datetime.now(),
            "action": action,
            "result": result,
            # A short textual summary of the action serves as the search key.
            "embedding": self.embedder.encode(
                f"{action['type']}: {action['description']}"
            )
        }
        await self.db.insert(memory_entry)
    async def retrieve_relevant(self, query: str, user_id: str,
                                limit: int = 5) -> list:
        """Return up to *limit* stored memories most similar to *query*,
        restricted to the given user."""
        # Embed the query text with the same model used for storage.
        query_embedding = self.embedder.encode(query)
        # Similarity search filtered to this user's entries.
        results = await self.db.similarity_search(
            embedding=query_embedding,
            filter={"user_id": user_id},
            limit=limit
        )
        return results
class SystemInterface:
    """Dispatch layer that routes planned actions to registered system tools."""
    def __init__(self):
        # Tool name -> tool instance (project-defined tool classes).
        self.tools = {
            "file_system": FileSystemTool(),
            "application": ApplicationTool(),
            "network": NetworkTool(),
            "system": SystemCommandTool()
        }
    async def execute(self, action: dict) -> dict:
        """Run a single action through its tool.

        Never raises: failures are reported as result dicts so the caller's
        plan loop can keep going.
        """
        requested = action['tool']
        tool = self.tools.get(requested)
        if tool is None:
            return {"error": f"Unknown tool: {requested}"}
        try:
            outcome = await tool.run(action['parameters'])
        except Exception as exc:
            return {"success": False, "error": str(exc)}
        return {"success": True, "data": outcome}
    def get_tools(self) -> list:
        """Describe every registered tool (name, description, parameters)."""
        catalogue = []
        for tool_name, tool in self.tools.items():
            catalogue.append({
                "name": tool_name,
                "description": tool.description,
                "parameters": tool.parameters
            })
        return catalogue
4. Google Gemini 2.5系列¶
4.1 Gemini 2.5家族¶
┌─────────────────────────────────────────────────────────────────┐
│ Google Gemini 2.5系列 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ Gemini 2.5 Pro │
│ ├── 定位:旗舰推理模型 │
│ ├── 特点:公开评测中推理能力处于第一梯队 │
│ ├── 上下文:1M tokens │
│ └── 应用:复杂分析、研究 │
│ │
│ Gemini 2.5 Flash │
│ ├── 定位:高效快速模型 │
│ ├── 特点:速度与质量平衡 │
│ ├── 优化:实时应用 │
│ └── 应用:聊天、内容生成 │
│ │
│ Gemini 2.5 Flash Image │
│ ├── 定位:图像生成与编辑 │
│ ├── 特点:多模态架构 │
│ ├── 能力:角色一致性、实时生成 │
│ └── 应用:图像创作、编辑 │
│ │
│ Gemini 2.5 Flash Lite │
│ ├── 定位:轻量级版本 │
│ └── 特点:成本优化 │
│ │
└─────────────────────────────────────────────────────────────────┘
4.2 Gemini 2.5技术特点¶
class Gemini25Capabilities:
    """Feature summary for the Gemini 2.5 family (descriptive tutorial data)."""
    def __init__(self):
        self.features = {
            "multimodal": {
                "description": "原生多模态",
                "modalities": ["文本", "图像", "音频", "视频"],
                "integration": "统一架构处理所有模态"
            },
            "long_context": {
                "description": "超长上下文",
                "pro": "1M tokens",
                "flash": "1M tokens",
                "use_cases": [
                    "整本书分析",
                    "长视频理解",
                    "大规模代码库"
                ]
            },
            "reasoning": {
                "description": "高级推理",
                "techniques": [
                    "链式思维",
                    "多步推理",
                    "数学求解",
                    "代码推理"
                ]
            },
            "agent": {
                "description": "Agent能力",
                "features": [
                    "Google工具集成",
                    "搜索增强",
                    "代码执行",
                    "多轮规划"
                ]
            }
        }
    def benchmarks(self):
        """Benchmark placeholders — figures change too fast to hard-code, so
        every entry defers to official/third-party leaderboards."""
        return {
            "MMLU": "以官方与第三方最新评测为准",
            "HumanEval": "以官方与第三方最新评测为准",
            "MATH": "以官方与第三方最新评测为准",
            "GPQA": "以官方与第三方最新评测为准"
        }
4.3 Gemini 2.5使用示例¶
import google.generativeai as genai
class Gemini25Assistant:
    """Async wrapper around the google.generativeai SDK (Gemini 2.5 Pro/Flash)."""
    def __init__(self, api_key: str):
        genai.configure(api_key=api_key)
        # Preview model names are pinned to release dates; update these as
        # Google ships newer versions.
        self.model_pro = genai.GenerativeModel('gemini-2.5-pro-preview-05-06')
        self.model_flash = genai.GenerativeModel('gemini-2.5-flash-preview-04-17')
    async def analyze_document(self, document_path: str,
                               questions: list) -> dict:
        """Read a UTF-8 text file and ask the Pro model the given questions."""
        # Read the document (context manager closes the file automatically).
        with open(document_path, 'r', encoding='utf-8') as f:
            document = f.read()
        # Build the prompt. NOTE: the '#' annotations inside the f-string
        # below are part of the prompt text sent to the model, not comments.
        prompt = f"""Analyze the following document and answer these questions:
{chr(10).join(f"{i+1}. {q}" for i, q in enumerate(questions))} # chr(10)即\n(f-string内不能用反斜杠),enumerate编号后join拼接为多行列表
Document:
{document[:100000]} # Gemini 2.5 Pro支持1M tokens
"""
        response = await self.model_pro.generate_content_async(prompt)
        return {
            "analysis": response.text,
            "model": "gemini-2.5-pro",
            "document_length": len(document)
        }
    async def multimodal_chat(self, text: str, image_path: str = None,
                              audio_path: str = None) -> dict:
        """Send text plus optional image/audio attachments to the Flash model."""
        contents = [text]
        if image_path:
            image = genai.upload_file(image_path)
            contents.append(image)
        if audio_path:
            audio = genai.upload_file(audio_path)
            contents.append(audio)
        response = await self.model_flash.generate_content_async(contents)
        return {
            "response": response.text,
            "modalities_used": ["text"] +
                (["image"] if image_path else []) +
                (["audio"] if audio_path else [])
        }
    async def code_with_search(self, task: str) -> dict:
        """Generate code for *task*, letting the model call Google Search."""
        # Attach the Google Search tool to a dedicated model instance.
        tools = [
            genai.protos.Tool(
                google_search=genai.protos.GoogleSearch()
            )
        ]
        model_with_tools = genai.GenerativeModel(
            'gemini-2.5-pro-preview-05-06',
            tools=tools
        )
        chat = model_with_tools.start_chat()
        response = await chat.send_message_async(
            f"Write code for this task, using Google Search if needed: {task}"
        )
        return {
            "code": response.text,
            # Heuristic only: infers search usage from the response content.
            "search_used": "search" in str(response.candidates[0].content)
        }
5. Devin与AI编程Agent¶
5.1 Devin概述¶
公司:Cognition AI 创始人:Scott Wu(IOI金牌得主,Codeforces传奇大师) 发布时间:2024年 定位:首个全自主AI软件工程师
┌─────────────────────────────────────────────────────────────────┐
│ Devin核心能力 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ 端到端软件开发 │
│ ├── 需求理解与分析 │
│ ├── 架构设计 │
│ ├── 代码实现 │
│ ├── 测试与调试 │
│ └── 部署与维护 │
│ │
│ 自主执行 │
│ ├── 无需人工干预 │
│ ├── 自主规划任务 │
│ ├── 自我纠错 │
│ └── 学习新知识 │
│ │
│ 工具使用 │
│ ├── 代码编辑器 │
│ ├── 浏览器(搜索、文档) │
│ ├── 命令行 │
│ └── 版本控制(Git) │
│ │
└─────────────────────────────────────────────────────────────────┘
5.2 Devin技术架构¶
class DevinArchitecture:
    """Perception / cognition / action / learning decomposition of Devin
    (descriptive tutorial data based on public coverage)."""
    def __init__(self):
        self.components = {
            "perception": {
                "description": "感知系统",
                "inputs": [
                    "屏幕截图",
                    "终端输出",
                    "浏览器内容",
                    "代码文件"
                ],
                "processing": "多模态理解"
            },
            "cognition": {
                "description": "认知系统",
                "capabilities": [
                    "任务分解",
                    "规划制定",
                    "决策推理",
                    "知识检索"
                ]
            },
            "action": {
                "description": "行动系统",
                "tools": [
                    "代码编辑",
                    "命令执行",
                    "网页浏览",
                    "文件操作"
                ]
            },
            "learning": {
                "description": "学习系统",
                "mechanisms": [
                    "从错误学习",
                    "知识积累",
                    "技能提升"
                ]
            }
        }
    def workflow(self):
        """Return Devin's five-phase working loop as display text."""
        return """
1. 任务接收
- 理解自然语言需求
- 澄清不明确的地方
2. 环境探索
- 查看项目结构
- 阅读相关文档
- 理解代码库
3. 方案设计
- 制定实施计划
- 选择技术方案
- 预估工作量
4. 代码实现
- 编写代码
- 运行测试
- 调试修复
5. 验证交付
- 功能验证
- 代码审查
- 文档编写
"""
class DevinPerformance:
    """Reported Devin performance figures.

    All numbers come from public coverage (see the "note" fields) and may
    be stale; the SWE-bench score is the original 2024 launch figure.
    """
    def __init__(self):
        self.metrics = {
            "swe_bench": {
                "description": "SWE-benchmark",
                "score": "13.86%",
                "comparison": "相较早期公开结果有显著提升",
                "note": "端到端解决真实GitHub问题"
            },
            "upwork": {
                "description": "Upwork任务",
                "performance": "成功完成真实自由职业任务",
                "tasks": ["编写代码", "修复bug", "数据分析"]
            },
            "revenue": {
                "description": "商业表现",
                "arr_2024_09": "$1M",
                "arr_2025_06": "$73M+",
                "growth": "快速增长",
                "note": "数据来源于公开报道,实际数据可能有所变化"
            }
        }
5.3 AI编程Agent生态对比¶
class AICodingAgentsComparison:
    """Side-by-side catalogue of the major AI coding agents.

    Autonomy/pricing labels and the star ratings below are editorial
    judgements, not measured data.
    """
    def __init__(self):
        self.agents = {
            "devin": {
                "company": "Cognition AI",
                "type": "全自主Agent",
                "autonomy": "高",
                "interaction": "异步",
                "pricing": "企业级",
                "strengths": [
                    "端到端开发",
                    "自主规划",
                    "长期任务"
                ]
            },
            "claude_code": {
                "company": "Anthropic",
                "type": "交互式Agent",
                "autonomy": "中",
                "interaction": "实时协作",
                "pricing": "按量计费",
                "strengths": [
                    "代码理解",
                    "重构能力",
                    "安全执行"
                ]
            },
            "openai_codex": {
                "company": "OpenAI",
                "type": "云端Agent",
                "autonomy": "中高",
                "interaction": "ChatGPT集成",
                "pricing": "订阅+按量",
                "strengths": [
                    "Codex专用模型基础",
                    "多文件编辑",
                    "测试生成"
                ]
            },
            "github_copilot": {
                "company": "GitHub/Microsoft",
                "type": "编程助手",
                "autonomy": "低",
                "interaction": "IDE集成",
                "pricing": "订阅制",
                "strengths": [
                    "代码补全",
                    "广泛IDE支持",
                    "普及度高"
                ]
            },
            "cursor": {
                "company": "Cursor",
                "type": "AI编辑器",
                "autonomy": "中",
                "interaction": "编辑器",
                "pricing": "订阅制",
                "strengths": [
                    "编辑体验",
                    "Composer功能",
                    "快速迭代"
                ]
            }
        }
    def comparison_table(self):
        """Return the agents rendered as a markdown star-rating table."""
        return """
| 特性 | Devin | Claude Code | OpenAI Codex | GitHub Copilot | Cursor |
|------|-------|-------------|--------------|----------------|--------|
| 自主性 | ⭐⭐⭐⭐⭐ | ⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐ | ⭐⭐⭐ |
| 交互性 | ⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ |
| 代码质量 | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐⭐ |
| 易用性 | ⭐⭐⭐ | ⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ | ⭐⭐⭐⭐⭐ |
| 价格 | $$$$ | $$$ | $$$ | $$ | $$ |
"""
6. 2025年Agent技术趋势¶
6.1 趋势分析¶
┌─────────────────────────────────────────────────────────────────┐
│ 2025年Agent技术趋势 │
├─────────────────────────────────────────────────────────────────┤
│ │
│ 1. Agentic Coding爆发 │
│ • 从Copilot到Agent的范式转移 │
│ • 2025年被定义为"Agentic Coding元年" │
│ • 代表:Devin, Claude Code, OpenAI Codex │
│ │
│ 2. 本地优先Agent │
│ • 数据隐私驱动 │
│ • 离线可用性 │
│ • 代表:OpenClaw, Ollama + Agent框架 │
│ │
│ 3. 多Agent协作 │
│ • 专业化Agent分工 │
│ • 协调与通信协议 │
│ • 代表:AutoGen, CrewAI │
│ │
│ 4. 长期记忆与个性化 │
│ • 跨会话记忆保持 │
│ • 用户偏好学习 │
│ • 个性化工作流 │
│ │
│ 5. 工具生态标准化 │
│ • MCP协议普及 │
│ • 工具发现机制 │
│ • 跨平台兼容 │
│ │
│ 6. 安全与可控性 │
│ • 沙箱执行环境 │
│ • 权限细粒度控制 │
│ • 人在回路机制 │
│ │
└─────────────────────────────────────────────────────────────────┘
6.2 技术发展方向¶
class AgentTechTrends2025:
    """2025 agent-technology trend notes and forward-looking predictions.

    Editorial content: trends are grouped into model improvements,
    infrastructure and applications; predictions are opinion, not data.
    """
    def __init__(self):
        self.trends = {
            "model_improvements": {
                "description": "模型能力提升",
                "directions": [
                    {
                        "area": "推理能力",
                        "trend": "o1/o3类推理模型普及",
                        "impact": "Agent决策质量提升"
                    },
                    {
                        "area": "上下文长度",
                        "trend": "百万级token上下文",
                        "impact": "处理复杂长任务"
                    },
                    {
                        "area": "多模态",
                        "trend": "原生多模态理解",
                        "impact": "GUI操作、视觉感知"
                    }
                ]
            },
            "infrastructure": {
                "description": "基础设施演进",
                "developments": [
                    {
                        "tech": "MCP协议",
                        "status": "成为事实标准",
                        "adoption": "OpenAI, Anthropic, Google支持"
                    },
                    {
                        "tech": "Agent框架",
                        "status": "成熟化",
                        "examples": ["LangGraph", "AutoGen", "CrewAI"]
                    },
                    {
                        "tech": "执行环境",
                        "status": "安全隔离",
                        "features": ["沙箱", "权限控制", "审计日志"]
                    }
                ]
            },
            "applications": {
                "description": "应用场景扩展",
                "domains": [
                    {
                        "domain": "软件开发",
                        "maturity": "高",
                        "tools": ["Devin", "Claude Code", "Codex"]
                    },
                    {
                        "domain": "数据分析",
                        "maturity": "中",
                        "tools": ["ChatGPT Data Analyst", "Claude"]
                    },
                    {
                        "domain": "内容创作",
                        "maturity": "中",
                        "tools": ["AI写作助手", "视频生成Agent"]
                    },
                    {
                        "domain": "科学研究",
                        "maturity": "早期",
                        "tools": ["AlphaFold", "科研Agent"]
                    }
                ]
            }
        }
    def predictions(self):
        """Return short/medium/long-term outlooks (2025 / 2026 / 2027+)."""
        return {
            "short_term": {
                "timeline": "2025年内",
                "predictions": [
                    "Agentic Coding成为主流",
                    "MCP协议生态爆发",
                    "多Agent协作产品出现",
                    "本地Agent方案成熟"
                ]
            },
            "medium_term": {
                "timeline": "2026年",
                "predictions": [
                    "Agent即服务(Agent-as-a-Service)",
                    "跨Agent协作标准",
                    "Agent市场/商店",
                    "企业级Agent平台"
                ]
            },
            "long_term": {
                "timeline": "2027+",
                "predictions": [
                    "AGI-level Agent系统",
                    "完全自主的数字员工",
                    "Agent经济生态",
                    "人机协作新模式"
                ]
            }
        }
7. 实践项目:构建本地AI助手¶
7.1 项目概述¶
构建一个类似OpenClaw的本地AI助手,具备: 1. 本地LLM支持(Ollama) 2. 系统工具调用 3. 长期记忆 4. 命令行交互
7.2 完整实现¶
#!/usr/bin/env python3
"""
LocalAI - 本地AI助手
功能:
- 本地LLM支持(通过Ollama)
- 文件系统操作
- 命令执行
- 长期记忆
"""
import asyncio # Python标准异步库
import json
import sqlite3
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Optional
from dataclasses import dataclass, asdict
import subprocess
import ollama
@dataclass  # auto-generates __init__, __repr__ and __eq__ from the fields
class Memory:
    """One persisted memory entry (maps 1:1 onto a row of the memories table)."""
    id: Optional[int]                 # DB autoincrement key; None until inserted
    timestamp: str                    # ISO-8601 creation time
    content: str                      # free-form interaction text
    embedding: Optional[List[float]]  # semantic-search vector; None if not computed
    metadata: Dict                    # arbitrary JSON-serialisable tags
class LocalLLM:
    """Async facade over an Ollama-served local model."""
    def __init__(self, model: str = "llama3.2"):
        self.model = model
        self.client = ollama.AsyncClient()
    async def generate(self, prompt: str,
                       system: str = None) -> str:
        """Run one chat turn; *system*, when given, prepends a system message."""
        chat_turns = (
            [{"role": "system", "content": system}] if system else []
        )
        chat_turns.append({"role": "user", "content": prompt})
        reply = await self.client.chat(
            model=self.model,
            messages=chat_turns
        )
        return reply['message']['content']
    async def embed(self, text: str) -> List[float]:
        """Return the model's embedding vector for *text*."""
        reply = await self.client.embeddings(
            model=self.model,
            prompt=text
        )
        return reply['embedding']
class MemoryStore:
    """SQLite-backed persistence for Memory entries.

    Embeddings and metadata are serialised into JSON text columns; an
    absent embedding is stored as SQL NULL.

    Fix vs. the original: every connection is now closed in a ``finally``
    block — previously an exception in ``execute`` leaked the handle.
    """
    def __init__(self, db_path: str = "localai_memory.db"):
        self.db_path = db_path
        self._init_db()
    def _init_db(self):
        """Create the memories table on first use (idempotent)."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS memories (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    timestamp TEXT NOT NULL,
                    content TEXT NOT NULL,
                    embedding TEXT,
                    metadata TEXT
                )
            """)
            conn.commit()
        finally:
            conn.close()
    def add_memory(self, memory: "Memory"):
        """Insert one memory row (embedding/metadata JSON-encoded)."""
        conn = sqlite3.connect(self.db_path)
        try:
            conn.execute("""
                INSERT INTO memories (timestamp, content, embedding, metadata)
                VALUES (?, ?, ?, ?)
            """, (
                memory.timestamp,
                memory.content,
                # NULL when there is no embedding; JSON text otherwise.
                json.dumps(memory.embedding) if memory.embedding else None,
                json.dumps(memory.metadata)
            ))
            conn.commit()
        finally:
            conn.close()
    def get_recent_memories(self, limit: int = 10) -> "List[Memory]":
        """Return up to *limit* rows, newest first.

        ISO-8601 timestamps sort correctly as text, so ORDER BY on the
        TEXT column yields true chronological order.
        """
        conn = sqlite3.connect(self.db_path)
        try:
            rows = conn.execute("""
                SELECT * FROM memories
                ORDER BY timestamp DESC
                LIMIT ?
            """, (limit,)).fetchall()
        finally:
            conn.close()
        return [
            Memory(
                id=row[0],
                timestamp=row[1],
                content=row[2],
                # SQL NULL maps to None; guard before json.loads.
                embedding=json.loads(row[3]) if row[3] else None,
                metadata=json.loads(row[4]) if row[4] else {}
            )
            for row in rows
        ]
class ToolRegistry:
    """Name -> callable lookup table with human-readable descriptions."""
    def __init__(self):
        self.tools: Dict[str, callable] = {}
        self.descriptions: Dict[str, str] = {}
    def register(self, name: str, description: str, func: callable):
        """Register *func* under *name*; re-registering overwrites."""
        self.tools[name] = func
        self.descriptions[name] = description
    def get_tool(self, name: str) -> Optional[callable]:
        """Return the callable for *name*, or None when unregistered."""
        return self.tools.get(name)
    def list_tools(self) -> str:
        """Render one '- name: description' line per tool (registration order)."""
        lines = []
        for tool_name, text in self.descriptions.items():
            lines.append(f"- {tool_name}: {text}")
        return "\n".join(lines)
class SystemTools:
    """Local-system operations: file I/O plus a whitelisted command runner.

    All methods return strings; failures come back as 'Error: ...' text
    instead of raising, so the LLM loop can relay them to the user.
    """
    def __init__(self):
        # Only these read-only / harmless commands may be executed.
        self.allowed_commands = ['ls', 'cat', 'pwd', 'echo', 'grep', 'find']
    async def read_file(self, path: str) -> str:
        """Return the file's UTF-8 text ('~' is expanded)."""
        try:
            file_path = Path(path).expanduser()
            if not file_path.exists():
                return f"Error: File not found: {path}"
            return file_path.read_text(encoding='utf-8')
        except Exception as e:
            return f"Error reading file: {str(e)}"
    async def write_file(self, path: str, content: str) -> str:
        """Write *content* to *path*, creating parent directories as needed."""
        try:
            file_path = Path(path).expanduser()
            file_path.parent.mkdir(parents=True, exist_ok=True)
            file_path.write_text(content, encoding='utf-8')
            return f"Successfully wrote to {path}"
        except Exception as e:
            return f"Error writing file: {str(e)}"
    async def list_directory(self, path: str = ".") -> str:
        """List a directory, one '📁/📄 name' entry per line."""
        try:
            dir_path = Path(path).expanduser()
            if not dir_path.exists():
                return f"Error: Directory not found: {path}"
            entries = []
            for entry in dir_path.iterdir():
                entry_type = "📁" if entry.is_dir() else "📄"
                entries.append(f"{entry_type} {entry.name}")
            return "\n".join(entries)
        except Exception as e:
            return f"Error listing directory: {str(e)}"
    async def execute_command(self, command: str) -> str:
        """Execute a whitelisted command (30s timeout) and return its output.

        SECURITY FIX: the original ran the raw string with ``shell=True``
        while only checking ``cmd_parts[0]`` against the whitelist, so
        "ls && rm -rf ~" passed the check on "ls" yet the shell still ran
        "rm". Passing the argv list with ``shell=False`` removes shell
        interpretation (no &&, |, ;, globbing) entirely.
        """
        cmd_parts = command.split()
        if not cmd_parts:
            return "Error: Empty command"
        base_cmd = cmd_parts[0]
        if base_cmd not in self.allowed_commands:
            return f"Error: Command '{base_cmd}' not allowed. Allowed: {', '.join(self.allowed_commands)}"
        try:
            # Run the argv list directly in a subprocess; 30s timeout guards
            # against hangs.
            result = subprocess.run(
                cmd_parts,
                shell=False,
                capture_output=True,
                text=True,
                timeout=30
            )
            output = result.stdout
            if result.stderr:
                output += f"\nStderr: {result.stderr}"
            return output
        except subprocess.TimeoutExpired:
            return "Error: Command timed out"
        except Exception as e:
            return f"Error executing command: {str(e)}"
class LocalAI:
    """Local AI assistant: wires the LLM, memory store and tool registry together.

    Fix vs. the original: the bare ``except:`` in ``_execute_tool`` (which
    also swallowed KeyboardInterrupt/SystemExit) is narrowed to
    ``json.JSONDecodeError``.
    """
    def __init__(self, model: str = "llama3.2"):
        self.llm = LocalLLM(model)
        self.memory = MemoryStore()
        self.tools = ToolRegistry()
        self.system_tools = SystemTools()
        self._register_tools()
    def _register_tools(self):
        """Expose the SystemTools coroutines through the registry."""
        self.tools.register(
            "read_file",
            "读取文件内容",
            self.system_tools.read_file
        )
        self.tools.register(
            "write_file",
            "写入文件内容",
            self.system_tools.write_file
        )
        self.tools.register(
            "list_directory",
            "列出目录内容",
            self.system_tools.list_directory
        )
        self.tools.register(
            "execute_command",
            "执行系统命令(受限)",
            self.system_tools.execute_command
        )
    async def process_request(self, user_input: str) -> str:
        """Main pipeline: recall memories -> prompt -> generate ->
        optional tool round -> store interaction -> reply."""
        # 1. Recent interactions give the model short-term context.
        recent_memories = self.memory.get_recent_memories(5)
        memory_context = self._format_memories(recent_memories)
        # 2. The system prompt advertises the tools and the TOOL:/ARGS:
        #    calling convention the model must follow.
        system_prompt = f"""You are a helpful local AI assistant.
You have access to the following tools:
{self.tools.list_tools()}
Recent context:
{memory_context}
When you need to use a tool, respond in this format:
TOOL: <tool_name>
ARGS: <json_arguments>
Otherwise, respond normally."""
        # 3. First model pass.
        response = await self.llm.generate(
            prompt=user_input,
            system=system_prompt
        )
        # 4. If the model requested a tool, run it and do a second pass so
        #    the user gets a natural-language answer, not raw tool output.
        if response.startswith("TOOL:"):
            tool_result = await self._execute_tool(response)
            final_prompt = f"""User request: {user_input}
Tool used: {response}
Tool result: {tool_result}
Please provide a helpful response based on the tool result."""
            final_response = await self.llm.generate(final_prompt)
            self._store_interaction(user_input, final_response)
            return final_response
        # 5. Plain answer: remember the exchange and return it.
        self._store_interaction(user_input, response)
        return response
    async def _execute_tool(self, tool_command: str) -> str:
        """Parse a 'TOOL: name / ARGS: {...}' block emitted by the LLM and run it."""
        lines = tool_command.strip().split('\n')
        tool_name = None
        args = {}
        # Pull the tool name and its JSON arguments out of the structured reply.
        for line in lines:
            if line.startswith("TOOL:"):
                tool_name = line[5:].strip()
            elif line.startswith("ARGS:"):
                try:
                    args = json.loads(line[5:].strip())
                except json.JSONDecodeError:
                    # Malformed JSON degrades to no-args instead of crashing
                    # (was a bare `except:` that hid every exception type).
                    args = {}
        if not tool_name:
            return "Error: No tool specified"
        tool = self.tools.get_tool(tool_name)
        if not tool:
            return f"Error: Unknown tool: {tool_name}"
        try:
            result = await tool(**args)
            return result
        except Exception as e:
            return f"Error executing tool: {str(e)}"
    def _format_memories(self, memories: "List[Memory]") -> str:
        """Render memories as short bullet lines for the system prompt."""
        if not memories:
            return "No recent memories."
        return "\n".join([
            f"- [{m.timestamp}] {m.content[:100]}..."
            for m in memories
        ])
    def _store_interaction(self, user_input: str, response: str):
        """Persist one user/assistant exchange to long-term memory."""
        memory = Memory(
            id=None,
            timestamp=datetime.now().isoformat(),
            content=f"User: {user_input}\nAssistant: {response}",
            embedding=None,
            metadata={"type": "interaction"}
        )
        self.memory.add_memory(memory)
async def main():
    """Interactive command-line REPL for the local assistant.

    Exits on 'quit'/'exit'/'q' (case-insensitive) or Ctrl-C; any other
    error is printed and the loop keeps running.
    """
    print("🤖 LocalAI - 本地AI助手")
    print("输入 'quit' 退出\n")
    assistant = LocalAI(model="llama3.2")
    running = True
    while running:
        try:
            user_text = input("\nYou: ").strip()
            if user_text.lower() in ['quit', 'exit', 'q']:
                print("再见!")
                running = False
            elif user_text:  # skip blank lines silently
                print("\n🤔 思考中...")
                answer = await assistant.process_request(user_text)
                print(f"\n🤖 AI: {answer}")
        except KeyboardInterrupt:
            print("\n再见!")
            running = False
        except Exception as e:
            print(f"\n❌ 错误: {str(e)}")
if __name__ == "__main__":
    asyncio.run(main())  # start the event loop with the top-level coroutine
7.3 运行说明¶
# 1. 安装Ollama
# 访问 https://ollama.com 下载安装
# 2. 拉取模型
ollama pull llama3.2
# 3. 安装依赖
pip install ollama
# 4. 运行助手
python local_ai.py
总结¶
2025年是AI Agent技术的爆发之年,主要趋势包括:
- 模型能力飞跃:Claude 4、Codex 系列、Gemini 2.5 等新一代模型持续提升推理和编程能力
- Agentic Coding:从辅助编程到全自主开发,Devin、Claude Code、OpenAI Codex代表新范式
- 本地Agent兴起:OpenClaw等本地优先方案满足隐私和离线需求
- 生态标准化:MCP协议成为工具集成的标准
- 多Agent协作:从单一Agent向多Agent协作系统演进
这些技术正在重塑软件开发、知识工作和人机交互的方式。
参考资源¶
官方资源¶
- Claude 4 Announcement
- OpenAI Introducing Codex (2025-05-16)
- OpenAI Codex GA (2025-10-06)
- OpenClaw GitHub
- Google Gemini
- Devin by Cognition
论文¶
社区¶
文档版本: 1.0 作者: AI Learning Team
最后更新日期:2026-02-12 适用版本:LLM学习教程 v2026