tme_6_ppo_clipped_lunarlander.py
import gym
import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

from agent import PPOAdaptive, PPOClipped
from experiment import Experiment
from logger import get_logger

number_of_episodes = 3000
optimize_every = 64  # Number of steps.
show_every = 100  # Number of episodes.

if __name__ == "__main__":
    env = gym.make("LunarLander-v2")

    # Create a new agent here.
    experiment = Experiment.create(
        base_name="ppo/ppo_clipped_LunarLander-v2",
        model_class=PPOClipped,
        hp={
            "observation_space": env.observation_space,
            "action_space": env.action_space,
            "learning_rate": 0.0001,
            "gamma": 0.98,
            "k": 4,
            "epsilon": 0.01,
        },
    )
    experiment.save()
    # Or load a previous one.
    # experiment = Experiment.load("...")

    logger = get_logger(experiment.name, file_path=experiment.log_path)
    writer = SummaryWriter(
        log_dir=experiment.writer_path, purge_step=experiment.episode
    )
    experiment.info(logger)

    while experiment.episode < number_of_episodes:
        experiment.episode += 1
        show = (experiment.episode + 1) % show_every == 0

        state = env.reset()
        episode_reward, episode_steps = 0, 0
        entropies, policy_losses, value_losses = [], [], []

        while True:
            # Draw an action and act on the environment.
            action, entropy = experiment.model.step(torch.from_numpy(state).float())
            end_state, reward, done, info = env.step(action)

            # Record the transition.
            experiment.model.add_transition(
                (
                    state,
                    action,
                    reward,
                    end_state,
                    False if info.get("TimeLimit.truncated") else done,
                )
            )
            state = end_state

            experiment.step += 1
            episode_steps += 1
            episode_reward += reward
            entropies.append(entropy)

            # Optimize if needed.
            if (experiment.step + 1) % optimize_every == 0:
                policy_loss, value_loss = experiment.model.optimize()
                policy_losses.append(policy_loss)
                value_losses.append(value_loss)

            # Show if needed.
            if show:
                env.render()

            if done:
                break

        # Log.
        if show:
            logger.info(f"Episode {experiment.episode}: reward = {episode_reward}.")
        writer.add_scalars(
            "train",
            {"reward": episode_reward, "steps": episode_steps},
            global_step=experiment.episode,
        )
        debug_scalars = {"entropy": np.mean(entropies)}
        if len(policy_losses) > 0:
            debug_scalars["policy_loss"] = np.mean(policy_losses)
            debug_scalars["value_loss"] = np.mean(value_losses)
        writer.add_scalars("debug", debug_scalars, global_step=experiment.episode)

    env.close()
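

# PPOClipped is defined in agent.py, which is not included here. As a rough
# sketch only (an assumption, not the project's actual implementation): the
# "clipped" variant refers to PPO's clipped surrogate objective
# (Schulman et al., 2017), which the agent's optimize() presumably applies
# using the epsilon clip range configured above. The function name and
# arguments below are illustrative and are not taken from agent.py.
def clipped_policy_loss(new_log_probs, old_log_probs, advantages, epsilon=0.01):
    # Probability ratio between the current policy and the policy that
    # collected the transitions.
    ratio = torch.exp(new_log_probs - old_log_probs)
    # Unclipped and clipped surrogate terms.
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - epsilon, 1.0 + epsilon) * advantages
    # Take the pessimistic (minimum) of the two and negate it for gradient descent.
    return -torch.min(unclipped, clipped).mean()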