# bc_data_variation.py
import json
import random

import numpy as np
import torch

from evaluate import evaluate


def simulate_policy_bc_with_data_variation(env, policy, expert_data,
                                           data_ratios=[0.1, 0.3, 0.5, 0.7, 1.0],
                                           num_epochs=500, episode_length=50,
                                           batch_size=32):
"""
Train the BC agent using varying amounts of expert data and record performance.
Args:
env: The environment in which the policy will be trained.
policy: The policy model to be trained.
expert_data: Full expert data used to sample subsets for training.
data_ratios: List of data proportions (e.g., [0.1, 0.3, 0.5, 1.0]) to investigate.
num_epochs: Number of epochs for training each subset.
episode_length: The length of each episode in the environment.
batch_size: Batch size used for training.
Returns:
performance_scores: List of average performance scores for each data ratio.
"""
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    policy.to(device)
    performance_scores = []

    for ratio in data_ratios:
        # Sample a subset of the expert trajectories according to the current ratio
        sample_size = int(ratio * len(expert_data))
        sampled_expert_data = random.sample(expert_data, sample_size)
        idxs = np.array(range(len(sampled_expert_data)))
        num_batches = len(idxs) * episode_length // batch_size
        losses = []

        # Initialize a fresh optimizer for this subset. Note that the policy
        # parameters themselves are not reset here, so training continues from
        # the weights obtained with the previous data ratio.
        optimizer = torch.optim.Adam(list(policy.parameters()), lr=1e-4)
        # Train the policy with the current data subset
        for epoch in range(num_epochs):
            np.random.shuffle(idxs)
            running_loss = 0.0
            for i in range(num_batches):
                optimizer.zero_grad()

                # Sample a random (trajectory, timestep) pair for each element of the batch
                t1_idx = np.random.randint(len(sampled_expert_data), size=(batch_size,))
                t1_idx_pertraj = [np.random.randint(sampled_expert_data[c_idx]['observations'].shape[0])
                                  for c_idx in t1_idx]
                t1_states = np.concatenate([sampled_expert_data[c_idx]['observations'][t_idx][None]
                                            for (c_idx, t_idx) in zip(t1_idx, t1_idx_pertraj)])
                t1_actions = np.concatenate([sampled_expert_data[c_idx]['actions'][t_idx][None]
                                             for (c_idx, t_idx) in zip(t1_idx, t1_idx_pertraj)])
                t1_states = torch.Tensor(t1_states).float().to(device)
                t1_actions = torch.Tensor(t1_actions).float().to(device)

                # ========== TODO: start ==========
                # Behavior cloning: maximize the likelihood of the expert actions
                # under the policy, i.e. minimize the negative log-likelihood
                # loss = -E[log pi(a | s)].
                log_probs = policy.log_prob(t1_states, t1_actions)
                loss = -log_probs.mean()
                loss.backward()
                optimizer.step()
                running_loss += loss.item()
                # ========== TODO: end ==========

            # Print average loss every 10 epochs for monitoring
            if epoch % 10 == 0:
                print(f'[Data Ratio: {ratio * 100:.0f}%] Epoch [{epoch}/{num_epochs}], '
                      f'Loss: {running_loss / num_batches:.4f}')
        # Evaluate the trained policy
        success_rate = evaluate(env, policy, 'behavior_cloning', num_validation_runs=20,
                                episode_length=episode_length, env_name='reacher')
        print('success_rate =', success_rate)
        performance_scores.append(success_rate)

    # Save the performance score for each data ratio
    with open("performance_scores.json", "w") as f:
        json.dump(performance_scores, f)

    return performance_scores
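

if __name__ == "__main__":
    # Usage sketch (an addition for illustration, not part of the original
    # assignment interface): after `simulate_policy_bc_with_data_variation`
    # has been run from the homework's training script with its env, policy,
    # and expert dataset, this block reloads the saved scores and reports the
    # success rate obtained at each data ratio.
    data_ratios = [0.1, 0.3, 0.5, 0.7, 1.0]  # must match the ratios used for training
    with open("performance_scores.json", "r") as f:
        scores = json.load(f)
    for ratio, score in zip(data_ratios, scores):
        print(f"data ratio {ratio:.0%}: success rate {score:.3f}")

    # Optional: plot success rate against the fraction of expert data used.
    try:
        import matplotlib.pyplot as plt
        plt.plot(data_ratios, scores, marker="o")
        plt.xlabel("Fraction of expert data")
        plt.ylabel("Success rate")
        plt.title("Behavior cloning performance vs. amount of expert data")
        plt.savefig("performance_vs_data.png")
    except ImportError:
        pass  # plotting is optional; skip if matplotlib is unavailable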