dracm_trainer.py
# This class is responsible for the training process of the PPO algorithm
import tensorflow as tf
import numpy as np
import time
from utils import logger
from policies.random_migrate_policy import RandomMigratePolicy
from policies.always_migrate_policy import AlwaysMigratePolicy
from policies.random_solution import random_solution
from policies.always_migration_solution import always_migration_solution
from policies.optimal_solution import optimal_solution_for_batch_system_infos
from policies.no_migration_solution import no_migration_solution
from utils.logger import Logger
class Trainer(object):
    def __init__(self,
                 train_env,
                 eval_env,
                 algo,
                 sampler,
                 sample_processor,
                 update_batch_size,
                 policy,
                 n_itr,
                 save_interval,
                 save_path,
                 test_interval=0,
                 eval_sampler=None):
        self.train_env = train_env
        self.eval_env = eval_env
        self.sampler = sampler
        self.sample_processor = sample_processor
        self.policy = policy
        self.n_itr = n_itr
        self.save_interval = save_interval
        self.update_batch_size = update_batch_size
        self.algo = algo
        self.eval_sampler = eval_sampler
        self.test_interval = test_interval
        self.save_path = save_path

    def train(self, rnn_policy=False, is_test=True, is_save=True, is_log=True,
              std_reward=0.0,
              mean_reward=0.0,
              avg_random_rewards=0.0,
              avg_always_migrate_rewards=0.0,
              optimal_migrate_rewards=0.0,
              no_migrate_rewards=0.0):
        avg_ret = []
        avg_loss = []
        avg_latencies = []
        for itr in range(self.n_itr):
            itr_start_time = time.time()
            logger.log("\n ---------------- Iteration %d ----------------" % itr)

            # Collect trajectories from the training environment
            logger.log("Sampling trajectories from environment ...")
            start_time = time.time()
            paths = self.sampler.obtain_samples(is_rnn=rnn_policy, log=False, log_prefix='',
                                                reward_mean=mean_reward,
                                                reward_std=std_reward)
            end_time = time.time()
            logger.log("Sampling time: ", (end_time - start_time), "s")

            # Turn raw trajectories into training batches
            logger.log("Processing trajectories ...")
            start_time = time.time()
            samples_data = self.sample_processor.process_samples(paths)
            end_time = time.time()
            logger.log("Processing time: ", (end_time - start_time), "s")

            # Evaluate the baseline policies on the same sampled system states
            start_time = time.time()
            avg_random_rewards = random_solution(self.train_env, samples_data["system_info"])
            avg_always_migrate_rewards = always_migration_solution(self.train_env, samples_data["system_info"])
            no_migrate_rewards = no_migration_solution(self.train_env, samples_data["system_info"])
            end_time = time.time()
            logger.log("Baseline algorithms time: ", (end_time - start_time), "s")

            # Update the PPO policy and value networks
            logger.log("Updating policies ....")
            start_time = time.time()
            policy_losses, ent_losses, value_losses = self.algo.update_dracm(samples_data, self.update_batch_size)
            end_time = time.time()
            logger.log("Update time: ", (end_time - start_time), "s")

            """ ------------------- Logging Stuff --------------------------"""
            ret = np.sum(samples_data['un_norm_rewards'], axis=-1)
            avg_reward = np.mean(ret)
            if is_log:
                logger.logkv("Itr", itr)
                logger.logkv("policy loss: ", np.round(np.mean(policy_losses), 2))
                logger.logkv("value loss: ", np.round(np.mean(value_losses), 2))
                logger.logkv("entropy loss: ", np.round(np.mean(ent_losses), 2))
                logger.logkv("average reward: ", np.round(np.mean(avg_reward), 2))
                logger.logkv("average random reward: ", -np.round(np.mean(avg_random_rewards), 2))
                logger.logkv("average always migrate reward: ", -np.round(np.mean(avg_always_migrate_rewards), 2))
                logger.logkv("average never migrate reward: ", -np.round(np.mean(no_migrate_rewards), 2))
                logger.logkv("optimal migrate reward: ", -np.round(np.mean(optimal_migrate_rewards), 2))
                logger.dumpkvs()
            # Periodically evaluate the learned policy against the baselines
            # (guard against test_interval == 0, which would otherwise divide by zero)
            if is_test and self.test_interval > 0 and itr % self.test_interval == 0:
                avg_ppo_rewards = 0.0
                avg_random_rewards = 0.0
                avg_always_migrate_rewards = 0.0
                avg_optimal_rewards = 0.0
                avg_no_migration_rewards = 0.0
                num_iter = 4
                for i in range(num_iter):
                    reward_collects, system_info_collects = self.eval_sampler.obtain_samples(is_rnn=rnn_policy)
                    ppo_rewards = np.mean(np.sum(reward_collects, axis=-1))
                    random_rewards = random_solution(self.eval_sampler.env, system_info_collects)
                    always_migrate_rewards = always_migration_solution(self.eval_sampler.env, system_info_collects)
                    # optimal_rewards = optimal_solution_for_batch_system_infos(self.eval_sampler.env, system_info_collects)
                    no_migrate_rewards = no_migration_solution(self.eval_sampler.env, system_info_collects)
                    avg_ppo_rewards += ppo_rewards
                    avg_random_rewards += random_rewards
                    avg_always_migrate_rewards += always_migrate_rewards
                    # avg_optimal_rewards += optimal_rewards
                    avg_no_migration_rewards += no_migrate_rewards
                if is_log:
                    logger.logkv("eval reward", avg_ppo_rewards / num_iter)
                    logger.logkv("eval random reward", -(avg_random_rewards / num_iter))
                    logger.logkv("eval always migration reward", -(avg_always_migrate_rewards / num_iter))
                    logger.logkv("eval optimal reward", -(avg_optimal_rewards / num_iter))
                    logger.logkv("eval no migration reward", -(avg_no_migration_rewards / num_iter))
                    logger.dumpkvs()
            # Periodically checkpoint the policy weights
            if is_save and itr % self.save_interval == 0:
                logger.log("save model weights ... ")
                self.policy.save_weights(self.save_path + str(itr))
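

# --------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original training code).
# It shows how the Trainer above might be wired together. The factory helpers
# make_env, make_policy, make_algo, make_sampler, and make_sample_processor
# are hypothetical placeholders: substitute the project's actual environment,
# policy, PPO algorithm, sampler, and sample-processor classes, which must
# expose obtain_samples(), process_samples(), update_dracm(), and
# save_weights() as used in Trainer.train(). The numeric values below are
# illustrative, not recommended settings.
# --------------------------------------------------------------------------
if __name__ == "__main__":
    train_env = make_env(is_eval=False)                # hypothetical helper
    eval_env = make_env(is_eval=True)                  # hypothetical helper
    policy = make_policy(train_env)                    # hypothetical helper
    algo = make_algo(policy)                           # hypothetical helper
    sampler = make_sampler(train_env, policy)          # hypothetical helper
    eval_sampler = make_sampler(eval_env, policy)      # hypothetical helper
    sample_processor = make_sample_processor()         # hypothetical helper

    trainer = Trainer(train_env=train_env,
                      eval_env=eval_env,
                      algo=algo,
                      sampler=sampler,
                      sample_processor=sample_processor,
                      update_batch_size=64,            # illustrative value
                      policy=policy,
                      n_itr=100,                       # illustrative value
                      save_interval=10,                # illustrative value
                      save_path="./checkpoints/dracm_",
                      test_interval=5,                 # 0 skips periodic evaluation
                      eval_sampler=eval_sampler)
    trainer.train(rnn_policy=False, is_test=True, is_save=True, is_log=True)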