PPO的基本思想
- 策略优化:PPO直接优化策略,通过限制更新幅度来保证训练稳定性。
- Clip方法:PPO对新旧策略的概率比 r(θ)=π_θ(a|s)/π_θ_old(a|s) 进行裁剪,将其限制在 [1-ε, 1+ε] 区间内,从而限制策略更新的幅度,避免策略更新过大导致的不稳定。
- 优势估计:使用优势函数 A(s,a)=Q(s,a)-V(s) 来衡量在状态 s 下采取动作 a 相对于该状态平均水平(基准)的好坏,并据此评估策略更新带来的提升。
详细的训练过程
- 初始化:初始化策略网络(Actor)和价值网络(Critic),设置超参数和用于存放当前策略采样轨迹的缓冲区(PPO是on-policy算法,数据在每次更新后即被丢弃,不同于off-policy算法的经验回放池)。
- 交互环境:在每一回合中,智能体根据当前策略与环境进行交互,选择动作并获得奖励,存储经验。
- 计算优势:使用GAE(广义优势估计)方法计算优势函数,估计每个状态-动作对的优势。
- 更新策略网络:使用PPO-Clip目标函数,通过裁剪新旧策略的概率比来限制策略变化幅度,并据此更新策略网络。
- 更新价值网络:通过最小化价值函数预测误差来更新价值网络。
- 训练结束:在达到设定的回合数后,结束训练过程。
import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
# --- PPO hyperparameters -------------------------------------------------
gamma = 0.99         # discount factor used in the TD residual and the GAE recursion
lambda_ = 0.95       # GAE decay coefficient, multiplied with gamma in the recursion
clip_ratio = 0.2     # probability ratios are clipped to [1 - clip_ratio, 1 + clip_ratio]
actor_lr = 3e-4      # Adam learning rate for the actor
critic_lr = 3e-4     # Adam learning rate for the critic
batch_size = 64      # not referenced by the code visible in this file
epochs = 10          # number of update passes performed after every episode
update_steps = 4000  # not referenced by the code visible in this file

# --- Environment -----------------------------------------------------------
env = gym.make('Pendulum-v1')
state_dim = env.observation_space.shape[0]   # length of one observation vector
action_dim = env.action_space.shape[0]       # length of one action vector
action_bound = env.action_space.high[0]      # upper limit of the first action component
def build_actor():
    """Build the policy (actor) network.

    The network is a fully connected stack: two 64-unit ReLU hidden layers
    followed by a tanh output layer with ``action_dim`` units, so every
    output component lies in [-1, 1]. The input width is ``state_dim``.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    actor_net = tf.keras.Sequential()
    actor_net.add(layers.Dense(64, activation='relu', input_dim=state_dim))
    actor_net.add(layers.Dense(64, activation='relu'))
    actor_net.add(layers.Dense(action_dim, activation='tanh'))
    return actor_net
def build_critic():
    """Build the state-value (critic) network.

    Two 64-unit ReLU hidden layers feed a single linear output unit, so the
    model maps a ``state_dim``-wide input to one unbounded scalar.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    first_hidden = layers.Dense(64, activation='relu', input_dim=state_dim)
    second_hidden = layers.Dense(64, activation='relu')
    value_head = layers.Dense(1)
    return tf.keras.Sequential([first_hidden, second_hidden, value_head])
def train_ppo(episodes):
    """Train a PPO-Clip agent on the module-level ``env``.

    Every episode is rolled out with a Gaussian policy whose mean is the
    actor output scaled to the action range and whose standard deviation is
    fixed at ``0.1 * action_bound`` (the magnitude of the exploration noise).
    After the rollout, GAE advantages are computed and the actor and critic
    are each updated ``epochs`` times on the whole episode.

    Relies on module-level names: ``env``, ``state_dim``, ``action_dim``,
    ``action_bound``, ``gamma``, ``lambda_``, ``clip_ratio``, ``actor_lr``,
    ``critic_lr``, ``epochs``, ``build_actor`` and ``build_critic``.

    Args:
        episodes: Number of episodes to run.

    Returns:
        None. The total reward of every episode is printed.
    """
    actor = build_actor()
    critic = build_critic()
    actor_optimizer = tf.keras.optimizers.Adam(learning_rate=actor_lr)
    critic_optimizer = tf.keras.optimizers.Adam(learning_rate=critic_lr)

    # Plain Python floats so TensorFlow keeps every expression in float32.
    max_action = float(action_bound)
    action_std = 0.1 * max_action
    log_prob_const = float(-np.log(action_std) - 0.5 * np.log(2.0 * np.pi))

    def policy_mean(states):
        """Return the policy mean: the tanh actor output scaled to the action range."""
        return actor(states) * max_action

    def gaussian_log_prob(mean, actions):
        """Return, per sample, log N(actions; mean, action_std^2) summed over action dims."""
        z = (mean - actions) / action_std
        return tf.reduce_sum(-0.5 * tf.square(z) + log_prob_const, axis=1)

    def compute_advantages(rewards, values, next_values, done):
        """Compute GAE advantages for one trajectory.

        Args:
            rewards: Per-step rewards, length T.
            values: Critic estimates for the visited states, T elements.
            next_values: Critic estimates for the successor states, T elements.
            done: Per-step flags; a truthy entry disables bootstrapping from
                the successor state and stops the GAE trace at that step.

        Returns:
            A float32 array of shape (T,).
        """
        rewards = np.asarray(rewards, dtype=np.float32).reshape(-1)
        # Flatten (T, 1) critic outputs so the arithmetic below stays 1-D.
        values = np.asarray(values, dtype=np.float32).reshape(-1)
        next_values = np.asarray(next_values, dtype=np.float32).reshape(-1)
        not_done = 1.0 - np.asarray(done, dtype=np.float32).reshape(-1)

        advantages = np.zeros_like(rewards)
        gae = 0.0
        for t in reversed(range(len(rewards))):
            delta = rewards[t] + gamma * next_values[t] * not_done[t] - values[t]
            gae = delta + gamma * lambda_ * not_done[t] * gae
            advantages[t] = gae
        return advantages

    def update_actor_and_critic(states, actions, advantages, old_log_probs, returns):
        """Apply one clipped-surrogate actor step and one MSE critic step.

        All arguments are expected to be float32 arrays with T rows;
        ``advantages``, ``old_log_probs`` and ``returns`` are 1-D.
        """
        with tf.GradientTape() as tape:
            new_log_probs = gaussian_log_prob(policy_mean(states), actions)
            # Per-sample probability ratio between the new and the old policy.
            ratio = tf.exp(new_log_probs - old_log_probs)
            clipped_ratio = tf.clip_by_value(ratio, 1 - clip_ratio, 1 + clip_ratio)
            surrogate = tf.minimum(ratio * advantages, clipped_ratio * advantages)
            actor_loss = -tf.reduce_mean(surrogate)
        actor_grads = tape.gradient(actor_loss, actor.trainable_variables)
        actor_optimizer.apply_gradients(zip(actor_grads, actor.trainable_variables))

        with tf.GradientTape() as tape:
            # Squeeze (T, 1) -> (T,) so the difference is element-wise rather
            # than broadcast to (T, T).
            values = tf.squeeze(critic(states), axis=1)
            critic_loss = tf.reduce_mean(tf.square(values - returns))
        critic_grads = tape.gradient(critic_loss, critic.trainable_variables)
        critic_optimizer.apply_gradients(zip(critic_grads, critic.trainable_variables))

    for episode in range(episodes):
        # Newer gym versions return (observation, info) from reset().
        reset_result = env.reset()
        state = reset_result[0] if isinstance(reset_result, tuple) else reset_result
        done = False
        total_reward = 0
        states, actions, rewards, next_states, terminals = [], [], [], [], []

        while not done:
            state = np.reshape(state, [1, state_dim]).astype(np.float32)
            # Call the model directly: predict() carries a large per-call
            # overhead when used on a single sample.
            mean = policy_mean(state).numpy()[0]
            action = mean + np.random.normal(0.0, action_std, size=action_dim)
            # Only the action sent to the environment is clipped; the raw
            # sample is stored so its log-probability matches the Gaussian.
            env_action = np.clip(action, -max_action, max_action).astype(np.float32)

            step_result = env.step(env_action)
            if len(step_result) == 5:
                next_state, reward, terminated, truncated, _ = step_result
                done = bool(terminated or truncated)
            else:
                next_state, reward, done, info = step_result
                # Assumes the TimeLimit wrapper of older gym versions, which
                # flags time-outs via info["TimeLimit.truncated"].
                terminated = done and not info.get("TimeLimit.truncated", False)
            next_state = np.reshape(next_state, [1, state_dim]).astype(np.float32)

            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            # A time-limit cut-off is not a real terminal state, so the
            # critic may still bootstrap from the successor state there.
            terminals.append(bool(terminated))

            state = next_state
            total_reward += reward

        states = np.vstack(states).astype(np.float32)
        actions = np.vstack(actions).astype(np.float32)
        rewards = np.asarray(rewards, dtype=np.float32).reshape(-1)
        next_states = np.vstack(next_states).astype(np.float32)
        terminals = np.asarray(terminals, dtype=np.float32)

        values = critic(states).numpy().reshape(-1)
        next_values = critic(next_states).numpy().reshape(-1)
        advantages = compute_advantages(rewards, values, next_values, terminals)
        # Critic targets use the raw advantages, before normalisation.
        returns = advantages + values
        advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8)
        # Log-probabilities under the data-collecting policy, frozen as numpy
        # so they stay constant across the update epochs.
        old_log_probs = gaussian_log_prob(policy_mean(states), actions).numpy()

        for _ in range(epochs):
            update_actor_and_critic(states, actions, advantages, old_log_probs, returns)

        print(f"Episode {episode + 1} - Total Reward: {total_reward}")
if __name__ == "__main__":
    # Start the 1000-episode training run only when executed as a script,
    # not as a side effect of importing this module.
    train_ppo(1000)