03 - 元强化学习¶
学习时间: 4-5小时 重要性: ⭐⭐⭐⭐⭐ 学会如何学习 前置知识: MAML、策略梯度
🎯 学习目标¶
完成本章后,你将能够: - 理解元学习的核心思想 - 掌握MAML算法 - 了解RL²和任务分布学习 - 应用元RL解决少样本学习问题
1. 元强化学习简介¶
1.1 什么是元学习¶
核心思想:
"学会学习"(Learning to Learn)
与传统RL的区别: - 传统RL:在单一任务上学习 - 元RL:在多个相关任务上学习,快速适应新任务
1.2 应用场景¶
- 机器人:快速适应新环境
- 游戏:快速掌握新游戏规则
- 推荐系统:快速适应新用户
- 医疗:快速适应新病人
1.3 问题定义¶
任务分布: \(\mathcal{T} \sim p(\mathcal{T})\)
目标: \(\min_\theta \mathbb{E}_{\mathcal{T} \sim p(\mathcal{T})} [\mathcal{L}(\theta, \mathcal{T})]\)
其中\(\mathcal{L}(\theta, \mathcal{T})\)是在任务\(\mathcal{T}\)上的损失。
2. MAML:模型无关元学习¶
2.1 核心思想¶
找到好的初始化参数: - 使得少量梯度更新就能适应新任务 - 模型无关,可应用于任何基于梯度的学习
2.2 算法流程¶
Text Only
对于每个迭代:
采样一批任务 T_i ~ p(T)
对于每个任务 T_i:
计算梯度: g_i = ∇_θ L(θ, T_i)
更新参数: θ_i' = θ - α * g_i
元更新: θ = θ - β * ∇_θ Σ_i L(θ_i', T_i)
2.3 代码实现¶
Python
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
class MAML:
    """Model-Agnostic Meta-Learning (MAML).

    Learns an initialization theta of ``model``'s parameters such that a few
    gradient steps on a new task's support set give good performance on that
    task's query set (second-order MAML when ``create_graph=True``).
    """

    def __init__(self, model, inner_lr=0.01, meta_lr=0.001, num_inner_steps=1):
        """
        Args:
            model: network whose forward accepts an optional parameter dict,
                i.e. ``model(x, params)`` (see SimpleModel below).
            inner_lr: step size alpha for the per-task (inner) updates.
            meta_lr: step size beta for the meta (outer) update.
            num_inner_steps: number of gradient steps in the inner loop.
        """
        # theta: meta-parameters (optimized by the outer loop) --
        # self.model.parameters() is theta.
        self.model = model
        self.inner_lr = inner_lr
        self.meta_lr = meta_lr
        self.num_inner_steps = num_inner_steps
        self.meta_optimizer = optim.Adam(self.model.parameters(), lr=meta_lr)

    def inner_loop(self, task_data, create_graph=False):
        """Adapt to a single task using its support set.

        Args:
            task_data: tuple (support_x, support_y, query_x, query_y).
            create_graph: keep the graph of the inner updates so the meta
                update can differentiate through them (second-order MAML).

        Returns:
            dict mapping parameter name -> adapted tensor (phi).
        """
        support_x, support_y, query_x, query_y = task_data
        # phi: task-specific parameters (inner-loop adaptation),
        # phi = theta - alpha * grad L_task(theta).
        # Clone the current meta-parameters theta as the initial phi.
        # NOTE(review): clone() keeps the graph link to theta, so even with
        # create_graph=False the adapted params are not detached from theta.
        adapted_params = {name: param.clone()
                          for name, param in self.model.named_parameters()}
        # Inner-loop updates.
        for _ in range(self.num_inner_steps):
            # Loss on the support set under the current phi.
            support_pred = self.model(support_x, adapted_params)
            support_loss = nn.MSELoss()(support_pred, support_y)
            # Gradients of the support loss w.r.t. phi.
            grads = torch.autograd.grad(
                support_loss,
                adapted_params.values(),
                create_graph=create_graph
            )
            # One SGD step: phi <- phi - alpha * grad.
            adapted_params = {
                name: param - self.inner_lr * grad
                for (name, param), grad in zip(adapted_params.items(), grads)  # zip pairs by position
            }
        return adapted_params

    def outer_loop(self, batch_tasks):
        """Meta-update over a batch of tasks.

        Args:
            batch_tasks: iterable of task_data tuples (see inner_loop).

        Returns:
            float: average query-set loss over the batch.
        """
        meta_loss = 0
        for task_data in batch_tasks:
            # Inner-loop adaptation (graph kept for second-order gradients).
            adapted_params = self.inner_loop(task_data, create_graph=True)
            # Evaluate the adapted parameters on the query set.
            _, _, query_x, query_y = task_data
            query_pred = self.model(query_x, adapted_params)
            query_loss = nn.MSELoss()(query_pred, query_y)
            meta_loss += query_loss
        # Meta-update on theta.
        meta_loss = meta_loss / len(batch_tasks)
        self.meta_optimizer.zero_grad()  # clear stale gradients
        meta_loss.backward()  # backprop through the inner updates
        self.meta_optimizer.step()  # update theta
        return meta_loss.item()  # convert the scalar tensor to a Python float

    def adapt(self, task_data):
        """Adapt to a new task (no meta-gradient graph needed)."""
        adapted_params = self.inner_loop(task_data, create_graph=False)
        return adapted_params
class SimpleModel(nn.Module):
    """Small MLP (Linear -> ReLU -> Linear) whose forward pass can run with
    an externally supplied parameter dict, as MAML's inner loop requires."""

    def __init__(self, input_dim, output_dim, hidden_dim=40):
        super(SimpleModel, self).__init__()
        # NOTE: the attribute must stay named `layers` -- forward() looks up
        # weights via the keys 'layers.0.*' and 'layers.2.*'.
        stack = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim),
        ]
        self.layers = nn.ModuleList(stack)

    def forward(self, x, params=None):
        """Forward pass; `params` (name -> tensor) overrides own weights."""
        if params is None:
            params = dict(self.named_parameters())
        hidden = nn.functional.linear(x, params['layers.0.weight'], params['layers.0.bias'])
        activated = nn.functional.relu(hidden)
        return nn.functional.linear(activated, params['layers.2.weight'], params['layers.2.bias'])
3. RL²:通过慢速强化学习进行快速强化学习¶
3.1 核心思想¶
循环神经网络作为元学习者: - 使用RNN的隐藏状态存储任务信息 - 每个episode开始时重置隐藏状态 - 通过大量任务训练RNN
3.2 代码实现¶
Python
class RL2(nn.Module):
    """RL^2: a recurrent policy whose LSTM state carries task information
    across timesteps, enabling fast in-context adaptation."""

    def __init__(self, state_dim, action_dim, hidden_dim=256):
        super(RL2, self).__init__()
        # A single LSTM consumes concatenated (state, action, reward) steps.
        self.lstm = nn.LSTM(state_dim + action_dim + 1, hidden_dim, batch_first=True)
        self.policy_head = nn.Linear(hidden_dim, action_dim)  # action logits
        self.value_head = nn.Linear(hidden_dim, 1)  # state-value estimate

    def forward(self, states, actions, rewards, hidden=None):
        """Run the recurrent policy over a trajectory segment.

        Args:
            states: (batch, seq_len, state_dim)
            actions: (batch, seq_len, action_dim)
            rewards: (batch, seq_len, 1)
            hidden: optional LSTM (h, c) state

        Returns:
            (logits, values, hidden) -- per-step policy logits, value
            estimates, and the updated LSTM state.
        """
        step_inputs = torch.cat([states, actions, rewards], dim=-1)
        features, hidden = self.lstm(step_inputs, hidden)
        return self.policy_head(features), self.value_head(features), hidden

    def reset_hidden(self, batch_size=1):
        """Return a zeroed (h, c) pair for the start of a new episode."""
        shape = (1, batch_size, self.lstm.hidden_size)
        return torch.zeros(shape), torch.zeros(shape)
4. 任务分布学习¶
4.1 任务生成¶
Python
class TaskDistribution:
"""任务分布"""
def __init__(self, n_tasks=1000):
self.n_tasks = n_tasks
self.tasks = self._generate_tasks()
def _generate_tasks(self):
"""生成任务集合"""
tasks = []
for _ in range(self.n_tasks):
# 随机生成任务参数
# 例如:不同的目标位置、不同的动力学参数等
task_params = {
'goal': np.random.uniform(-1, 1, size=2),
'dynamics': np.random.uniform(0.8, 1.2)
}
tasks.append(task_params)
return tasks
def sample_task(self):
"""采样一个任务"""
return random.choice(self.tasks)
def sample_batch(self, batch_size):
"""采样一批任务"""
return [self.sample_task() for _ in range(batch_size)]
class NavigationTask:
    """2D point-navigation task: move a point agent toward a goal position.

    Follows the Gymnasium 5-tuple step API:
    (observation, reward, terminated, truncated, info).
    """

    def __init__(self, goal_position, start_position=(0, 0)):
        """
        Args:
            goal_position: (x, y) target the agent must reach.
            start_position: (x, y) initial position (defaults to the origin).
        """
        # Fix: force float dtype. The default start (0, 0) produced an int
        # array, so `self.position += action` with a float action raised a
        # NumPy casting error.
        self.goal = np.array(goal_position, dtype=float)
        self.start = np.array(start_position, dtype=float)
        self.position = self.start.copy()

    def reset(self):
        """Reset the agent to the start position.

        Returns:
            (observation, info_dict) per the Gymnasium API.
        """
        self.position = self.start.copy()
        return self._get_observation(), {}

    def _get_observation(self):
        """Observation is [position_x, position_y, goal_x, goal_y]."""
        return np.concatenate([self.position, self.goal])

    def step(self, action):
        """Apply a 2D displacement `action`.

        Returns:
            (observation, reward, terminated, truncated, info) where the
            reward is the negative Euclidean distance to the goal and the
            episode terminates when the agent is within 0.1 of the goal.
        """
        self.position += action
        distance = np.linalg.norm(self.position - self.goal)
        reward = -distance
        # bool(...) so callers receive a plain Python bool, not np.bool_.
        done = bool(distance < 0.1)
        return self._get_observation(), reward, done, False, {}
5. 元RL训练流程¶
Python
def train_meta_rl(agent, task_distribution, num_iterations=10000,
                  episodes_per_task=10):
    """Meta-train an RL agent over a distribution of tasks.

    Args:
        agent: meta-RL agent exposing reset_hidden(), select_action() and
            update().
        task_distribution: task sampler exposing sample_batch().
        num_iterations: number of meta-training iterations.
        episodes_per_task: episodes collected per sampled task.
    """
    def _collect_episode(task):
        # Roll out one episode on `task` and return the trajectory dict.
        obs, _ = task.reset()
        hidden = agent.reset_hidden()
        trajectory = {
            'observations': [],
            'actions': [],
            'rewards': [],
            'dones': []
        }
        finished = False
        while not finished:
            action, hidden = agent.select_action(obs, hidden)
            next_obs, reward, terminated, truncated, _ = task.step(action)
            finished = terminated or truncated
            trajectory['observations'].append(obs)
            trajectory['actions'].append(action)
            trajectory['rewards'].append(reward)
            trajectory['dones'].append(finished)
            obs = next_obs
        return trajectory

    for iteration in range(num_iterations):
        # Sample a fresh batch of tasks each iteration.
        tasks = task_distribution.sample_batch(batch_size=16)
        meta_loss = 0
        for task in tasks:
            for _ in range(episodes_per_task):
                # Collect one episode, then update on it immediately.
                episode_data = _collect_episode(task)
                meta_loss += agent.update(episode_data)
        # Periodic progress report.
        if (iteration + 1) % 100 == 0:
            avg_loss = meta_loss / (len(tasks) * episodes_per_task)
            print(f"Iteration {iteration + 1}, Avg Loss: {avg_loss:.4f}")
def evaluate_meta_rl(agent, new_task, num_episodes=10):
    """Evaluate a trained meta-RL agent on an unseen task.

    Args:
        agent: trained meta-RL agent.
        new_task: task to evaluate on.
        num_episodes: number of evaluation episodes.

    Returns:
        (mean, std) of the per-episode returns.
    """
    def _rollout():
        # Run one full episode and return its total reward.
        obs, _ = new_task.reset()
        hidden = agent.reset_hidden()
        total = 0
        finished = False
        while not finished:
            action, hidden = agent.select_action(obs, hidden)
            obs, reward, terminated, truncated, _ = new_task.step(action)
            finished = terminated or truncated
            total += reward
        return total

    returns = [_rollout() for _ in range(num_episodes)]
    return np.mean(returns), np.std(returns)
6. 本章总结¶
核心概念¶
Text Only
元强化学习:
├── 目标: 学会学习
├── 方法:
│ ├── MAML: 找到好的初始化
│ ├── RL²: RNN作为元学习者
│ └── 任务分布: 学习从任务分布中采样
└── 应用:
├── 少样本学习
├── 快速适应
└── 迁移学习
算法对比¶
| 算法 | 核心思想 | 优点 | 缺点 |
|---|---|---|---|
| MAML | 好的初始化 | 模型无关 | 二阶导数计算量大 |
| RL² | RNN记忆 | 端到端 | 需要大量任务 |
| ProMP | 改进MAML | 更稳定 | 复杂 |
✅ 自测问题¶
-
元学习与传统学习的主要区别是什么?
-
MAML为什么能找到好的初始化参数?
-
RL²如何利用RNN进行元学习?
📚 延伸阅读¶
- Finn et al. (2017) - MAML
- Duan et al. (2017) - RL²
- Rothfuss et al. (2019) - ProMP
→ 下一步:04-RLHF与人类反馈.md