ppo_s_train.py
import sys
import time
from stable_baselines import PPO2
from stable_baselines.common.policies import MlpPolicy
from stable_baselines.common.vec_env import SubprocVecEnv
import tensegrity_env_s_ppo2 as tes
import tensegrity_env_base as teb
import common as cm
MAX_CPU = 16  # cap per the current AWS instance type
#STEPS = 10000
STEPS = 50000
#STEPS = 100000
#STEPS = 500000
#STEPS = 2000000
#DIMENSION = 16
#DIMENSION = 32
#DIMENSION = 64
#DIMENSION = 128
def callback(lcl, glb):
    # Stable Baselines callback hook: receives the training loop's locals
    # and globals. Currently unused (see the note below).
    return False
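# Note (illustrative, not wired up in this script): in Stable Baselines 2
# the hook is enabled by passing it to learn(), e.g.
#     model.learn(total_timesteps=STEPS, callback=callback)
# and training stops early as soon as the callback returns False.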
def train(env, file):
    start = time.time()
    #env.setRender(False)
    # create the learning agent
    model = PPO2(
        #tensorboard_log=saver.data_dir,
        policy=MlpPolicy,
        #policy_kwargs=dict(net_arch=[dict(pi=[DIMENSION, DIMENSION], vf=[DIMENSION, DIMENSION])]),
        policy_kwargs=dict(net_arch=[8, 8]),  # two shared hidden layers of 8 units
        env=env,
        gamma=0.998,
        #n_steps=1000,
        n_steps=300,  # rollout length per environment per update
        ent_coef=0,
        learning_rate=1e-3,
        vf_coef=0.5,
        max_grad_norm=0.5,
        lam=0.95,
        nminibatches=10,
        noptepochs=10,
        cliprange=0.2,
        verbose=1,
    )
    # train the agent on the environment
    model.learn(
        #total_timesteps=10000000,
        total_timesteps=STEPS,
        log_interval=10,
        #log_dir=".",
        #record_video=False
    )
    # save the trained model
    model.save(teb.PATH_POLICY + file)
    print("Duration: %.1f min" % ((time.time() - start) / 60))
def main(n_cpu):
    # one environment instance per subprocess worker
    env = SubprocVecEnv([lambda: tes.TensegrityEnvSinePpo2(cm.TRAIN) for _ in range(n_cpu)])
    train(env, tes.NAME_POLICY)
if __name__ == '__main__':
    # optional CLI argument: number of worker processes, capped at MAX_CPU
    n_cpu = 2
    if len(sys.argv) > 1:
        n_cpu = min(int(sys.argv[1]), MAX_CPU)
    main(n_cpu)
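
For reference, a minimal sketch of how the saved policy could be loaded back and rolled out after training. It assumes TensegrityEnvSinePpo2 follows the standard gym API (reset/step) and reuses cm.TRAIN, the only mode referenced in this file; the 1000-step cap is an arbitrary choice for illustration.

import tensegrity_env_s_ppo2 as tes
import tensegrity_env_base as teb
import common as cm
from stable_baselines import PPO2

# load the policy saved by train() above
model = PPO2.load(teb.PATH_POLICY + tes.NAME_POLICY)
env = tes.TensegrityEnvSinePpo2(cm.TRAIN)

obs = env.reset()
total_reward = 0.0
for _ in range(1000):  # arbitrary step cap for the rollout
    action, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env.step(action)
    total_reward += reward
    if done:
        break
print("Episode reward: %.2f" % total_reward)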