From 9f9fb56d87b58e05f679c06ac970a00171f340e4 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Mon, 10 Apr 2023 15:17:14 +0800 Subject: [PATCH 1/8] add bcq --- ding/example/bcq.py | 45 +++ ding/model/template/__init__.py | 1 + ding/model/template/bcq.py | 108 +++++++ ding/policy/__init__.py | 3 + ding/policy/bcq.py | 290 ++++++++++++++++++ ding/policy/command_mode_policy_instance.py | 8 + .../config/halfcheetah_medium_bcq_config.py | 55 ++++ .../halfcheetah_medium_expert_bcq_config.py | 55 ++++ dizoo/d4rl/config/hopper_medium_bcq_config.py | 55 ++++ .../config/hopper_medium_expert_bcq_config.py | 55 ++++ dizoo/d4rl/entry/d4rl_bcq_main.py | 21 ++ 11 files changed, 696 insertions(+) create mode 100755 ding/example/bcq.py create mode 100755 ding/model/template/bcq.py create mode 100755 ding/policy/bcq.py create mode 100755 dizoo/d4rl/config/halfcheetah_medium_bcq_config.py create mode 100755 dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py create mode 100755 dizoo/d4rl/config/hopper_medium_bcq_config.py create mode 100755 dizoo/d4rl/config/hopper_medium_expert_bcq_config.py create mode 100755 dizoo/d4rl/entry/d4rl_bcq_main.py diff --git a/ding/example/bcq.py b/ding/example/bcq.py new file mode 100755 index 0000000000..0744388e3a --- /dev/null +++ b/ding/example/bcq.py @@ -0,0 +1,45 @@ +import gym +from ditk import logging +from ding.model import BCQ +from ding.policy import BCQPolicy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2 +from ding.data import create_dataset +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OfflineRLContext +from ding.framework.middleware import interaction_evaluator, trainer, CkptSaver, offline_data_fetcher, offline_logger +from ding.utils import set_pkg_seed +from dizoo.d4rl.envs import D4RLEnv +from dizoo.d4rl.config.halfcheetah_medium_bcq_config import main_config, create_config +# from dizoo.d4rl.config.halfcheetah_medium_expert_edac_config import main_config,create_config +# from dizoo.d4rl.config.hopper_medium_expert_edac_config import main_config,create_config +# from dizoo.d4rl.config.hopper_medium_edac_config import main_config,create_config + + +def main(): + # If you don't have offline data, you need to prepare if first and set the data_path in config + # For demostration, we also can train a RL policy (e.g. 
SAC) and collect some data + logging.getLogger().setLevel(logging.INFO) + cfg = compile_config(main_config, create_cfg=create_config, auto=True) + ding_init(cfg) + with task.start(async_mode=False, ctx=OfflineRLContext()): + evaluator_env = BaseEnvManagerV2( + env_fn=[lambda: D4RLEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager + ) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + dataset = create_dataset(cfg) + model = BCQ(**cfg.policy.model) + policy = BCQPolicy(cfg.policy, model=model) + + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(offline_data_fetcher(cfg, dataset)) + task.use(trainer(cfg, policy.learn_mode)) + task.use(CkptSaver(policy, cfg.exp_name, train_freq=10000000)) + task.use(offline_logger()) + task.run() + + +if __name__ == "__main__": + main() diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py index e994286ac3..b22ea61d6e 100644 --- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -23,3 +23,4 @@ from .vae import VanillaVAE from .decision_transformer import DecisionTransformer from .procedure_cloning import ProcedureCloningMCTS, ProcedureCloningBFS +from .bcq import BCQ \ No newline at end of file diff --git a/ding/model/template/bcq.py b/ding/model/template/bcq.py new file mode 100755 index 0000000000..58db5a3fbd --- /dev/null +++ b/ding/model/template/bcq.py @@ -0,0 +1,108 @@ +from typing import Union, Dict, Optional, List +from easydict import EasyDict +import numpy as np +import torch +import torch.nn as nn + +from ding.utils import SequenceType, squeeze, MODEL_REGISTRY +from ..common import RegressionHead, ReparameterizationHead +from .vae import VanillaVAE + + +@MODEL_REGISTRY.register('bcq') +class BCQ(nn.Module): + + mode = ['compute_actor', 'compute_critic', 'compute_vae', 'compute_eval'] + + def __init__( + self, + obs_shape: Union[int, SequenceType], + action_shape: Union[int, SequenceType, EasyDict], + actor_head_hidden_size: int = 64, + critic_head_hidden_size: int = 64, + activation: Optional[nn.Module] = nn.ReLU(), + norm_type: Optional[str] = None, + vae_hidden_dims: List = [750, 750], + phi: float = 0.05 + ) -> None: + super(BCQ, self).__init__() + obs_shape: int = squeeze(obs_shape) + action_shape = squeeze(action_shape) + self.action_shape = action_shape + self.input_size = obs_shape + self.phi = phi + + critic_input_size = self.input_size + action_shape + self.critic = nn.ModuleList() + for _ in range(2): + net = [] + d = critic_input_size + for dim in critic_head_hidden_size: + net.append(nn.Linear(d, dim)) + net.append(activation) + d = dim + net.append(nn.Linear(d, 1)) + self.critic.append(nn.Sequential(*net)) + + net = [] + d = critic_input_size + for dim in actor_head_hidden_size: + net.append(nn.Linear(d, dim)) + net.append(activation) + d = dim + net.append(nn.Linear(d, 1)) + self.actor = nn.Sequential(*net) + + self.vae = VanillaVAE(action_shape, obs_shape, action_shape * 2, vae_hidden_dims) + + def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]], mode: str) -> Dict[str, torch.Tensor]: + """ + Overview: + The unique execution (forward) method of QAC method, and one can indicate different modes to implement \ + different computation graph, including ``compute_actor`` and ``compute_critic`` in QAC. + Mode compute_actor: + Arguments: + - inputs (:obj:`torch.Tensor`): Observation data, defaults to tensor. 
+ Returns: + - output (:obj:`Dict`): Output dict data, including differnet key-values among distinct action_space. + Mode compute_critic: + Arguments: + - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. + Returns: + - output (:obj:`Dict`): Output dict data, including q_value tensor. + + .. note:: + For specific examples, one can refer to API doc of ``compute_actor`` and ``compute_critic`` respectively. + """ + assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) + return getattr(self, mode)(inputs) + + def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + obs, action = inputs['obs'], inputs['action'] + if len(action.shape) == 1: # (B, ) -> (B, 1) + action = action.unsqueeze(1) + x = torch.cat([obs, action], dim=-1) + x = [m(x).squeeze() for m in self.critic] + return {'q_value': x} + + def compute_actor(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]: + input = torch.cat([inputs['obs'], inputs['action']], -1) + x = self.actor(input) + action = self.phi * 1 * torch.tanh(x) + action = (action + inputs['action']).clamp(-1, 1) + return {'action': action} + + def compute_vae(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + return self.vae.forward(inputs) + + def compute_eval(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + obs = inputs['obs'] + obs_rep = obs.clone().unsqueeze(0).repeat_interleave(100, dim=0) + z = torch.randn((obs_rep.shape[0], obs_rep.shape[1], self.action_shape * 2)).to(obs.device).clamp(-0.5, 0.5) + sample_action = self.vae.decode_with_obs(z, obs_rep)['reconstruction_action'] + action = self.compute_actor({'obs': obs_rep, 'action': sample_action})['action'] + q = self.compute_critic({'obs': obs_rep, 'action': action})['q_value'][0] + idx = q.argmax(dim=0).unsqueeze(0).unsqueeze(-1) + idx = idx.repeat_interleave(action.shape[-1], dim=-1) + action = action.gather(0, idx).squeeze() + return {'action': action} diff --git a/ding/policy/__init__.py b/ding/policy/__init__.py index 15575c7d30..65f3f2757e 100644 --- a/ding/policy/__init__.py +++ b/ding/policy/__init__.py @@ -18,6 +18,7 @@ from .ppo import PPOPolicy, PPOPGPolicy, PPOOffPolicy from .sac import SACPolicy, SACDiscretePolicy, SQILSACPolicy from .cql import CQLPolicy, CQLDiscretePolicy +from .edac import EDACPolicy from .impala import IMPALAPolicy from .ngu import NGUPolicy from .r2d2 import R2D2Policy @@ -48,5 +49,7 @@ from .pc import ProcedureCloningBFSPolicy +from .bcq import BCQPolicy + # new-type policy from .ppof import PPOFPolicy diff --git a/ding/policy/bcq.py b/ding/policy/bcq.py new file mode 100755 index 0000000000..f0144566b4 --- /dev/null +++ b/ding/policy/bcq.py @@ -0,0 +1,290 @@ +from typing import List, Dict, Any, Tuple, Union +from collections import namedtuple +import copy +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ding.torch_utils import Adam, to_device +from ding.rl_utils import v_1step_td_data, v_1step_td_error, get_train_sample, get_nstep_return_data +from ding.model import model_wrap +from ding.policy import Policy +from ding.utils import POLICY_REGISTRY +from ding.utils.data import default_collate, default_decollate +from .common_utils import default_preprocess_learn + + +@POLICY_REGISTRY.register('bcq') +class BCQPolicy(Policy): + config = dict( + type='bcq', + # (bool) Whether to use cuda for network. 
+ cuda=False, + # (bool type) priority: Determine whether to use priority in buffer sample. + # Default False in SAC. + priority=False, + # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + priority_IS_weight=False, + # (int) Number of training samples(randomly collected) in replay buffer when training starts. + # Default 10000 in SAC. + random_collect_size=10000, + nstep=1, + model=dict( + # (List) Hidden list for actor network head. + actor_head_hidden_size=[400,400,300], + + + # (List) Hidden list for critic network head. + critic_head_hidden_size=[400,400,300], + # Max perturbation hyper-parameter for BCQ + phi=0.05, + ), + learn=dict( + + # How many updates(iterations) to train after collector's one collection. + # Bigger "update_per_collect" means bigger off-policy. + # collect data -> update policy-> collect data -> ... + update_per_collect=1, + # (int) Minibatch size for gradient descent. + batch_size=100, + + # (float type) learning_rate_q: Learning rate for soft q network. + # Default to 3e-4. + # Please set to 1e-3, when model.value_network is True. + learning_rate_q=3e-4, + # (float type) learning_rate_policy: Learning rate for policy network. + # Default to 3e-4. + # Please set to 1e-3, when model.value_network is True. + learning_rate_policy=3e-4, + # (float type) learning_rate_vae: Learning rate for vae network. + # `learning_rate_value` should be initialized, when model.vae_network is True. + # Please set to 3e-4, when model.vae_network is True. + learning_rate_vae=3e-4, + # (bool) Whether ignore done(usually for max step termination env. e.g. pendulum) + # Note: Gym wraps the MuJoCo envs by default with TimeLimit environment wrappers. + # These limit HalfCheetah, and several other MuJoCo envs, to max length of 1000. + # However, interaction with HalfCheetah always gets done with done is False, + # Since we inplace done==True with done==False to keep + # TD-error accurate computation(``gamma * (1 - done) * next_v + reward``), + # when the episode step is greater than max episode step. + ignore_done=False, + + # (float type) target_theta: Used for soft update of the target network, + # aka. Interpolation factor in polyak averaging for target networks. + # Default to 0.005. + target_theta=0.005, + # (float) discount factor for the discounted sum of rewards, aka. gamma. + discount_factor=0.99, + lmbda=0.75, + + # (float) Weight uniform initialization range in the last output layer + init_w=3e-3, + ), + collect=dict( + # (int) Cut trajectories into pieces with length "unroll_len". + unroll_len=1, + ), + eval=dict(), + other=dict( + replay_buffer=dict( + # (int type) replay_buffer_size: Max size of replay buffer. + replay_buffer_size=1000000, + # (int type) max_use: Max use times of one data in the buffer. + # Data will be removed once used for too many times. + # Default to infinite. + # max_use=256, + ), + ), + ) + + def default_model(self) -> Tuple[str, List[str]]: + return 'bcq', ['ding.model.template.bcq'] + + def _init_learn(self) -> None: + r""" + Overview: + Learn mode init method. Called by ``self.__init__``. + Init q, value and policy's optimizers, algorithm config, main and target models. 
+ """ + # Init + self._priority = self._cfg.priority + self._priority_IS_weight = self._cfg.priority_IS_weight + self.lmbda = self._cfg.learn.lmbda + self.latent_dim = self._cfg.model.action_shape * 2 + + # Optimizers + self._optimizer_q = Adam( + self._model.critic.parameters(), + lr=self._cfg.learn.learning_rate_q, + ) + self._optimizer_policy = Adam( + self._model.actor.parameters(), + lr=self._cfg.learn.learning_rate_policy, + ) + self._optimizer_vae = Adam( + self._model.vae.parameters(), + lr=self._cfg.learn.learning_rate_vae, + ) + + # Algorithm config + self._gamma = self._cfg.learn.discount_factor + + # Main and target models + self._target_model = copy.deepcopy(self._model) + self._target_model = model_wrap( + self._target_model, + wrapper_name='target', + update_type='momentum', + update_kwargs={'theta': self._cfg.learn.target_theta} + ) + self._learn_model = model_wrap(self._model, wrapper_name='base') + self._learn_model.reset() + self._target_model.reset() + + self._forward_learn_cnt = 0 + + def _forward_learn(self, data: dict) -> Dict[str, Any]: + loss_dict = {} + + data = default_preprocess_learn( + data, + use_priority=self._priority, + use_priority_IS_weight=self._cfg.priority_IS_weight, + ignore_done=self._cfg.learn.ignore_done, + use_nstep=False + ) + if len(data.get('action').shape) == 1: + data['action'] = data['action'].reshape(-1, 1) + + if self._cuda: + data = to_device(data, self._device) + + self._learn_model.train() + self._target_model.train() + obs = data['obs'] + next_obs = data['next_obs'] + reward = data['reward'] + done = data['done'] + batch_size = obs.shape[0] + + # train_vae + vae_out = self._model.forward(data, mode='compute_vae') + recon, mean, log_std = vae_out['recons_action'], vae_out['mu'], vae_out['log_var'] + recons_loss = F.mse_loss(recon, data['action']) + kld_loss = torch.mean(-0.5 * torch.sum(1 + log_std - mean ** 2 - log_std.exp(), dim=1), dim=0) + loss_dict['recons_loss'] = recons_loss + loss_dict['kld_loss'] = kld_loss + vae_loss = recons_loss + 0.5 * kld_loss + loss_dict['vae_loss'] = vae_loss + self._optimizer_vae.zero_grad() + vae_loss.backward() + self._optimizer_vae.step() + + # train_critic + q_value = self._learn_model.forward(data, mode='compute_critic')['q_value'] + + with torch.no_grad(): + next_obs_rep = torch.repeat_interleave(next_obs, 10, 0) + z = torch.randn((next_obs_rep.shape[0], self.latent_dim)).to(self._device).clamp(-0.5, 0.5) + vae_action = self._model.vae.decode_with_obs(z, next_obs_rep)['reconstruction_action'] + next_action = self._model.forward({ + 'obs': next_obs_rep, + 'action': vae_action + }, mode='compute_actor')['action'] + + next_data = {'obs': next_obs_rep, 'action': next_action} + target_q_value = self._target_model.forward(next_data, mode='compute_critic')['q_value'] + # the value of a policy according to the maximum entropy objective + # find min one as target q value + target_q_value = self.lmbda * torch.min(target_q_value[0],target_q_value[1]) \ + + (1 - self.lmbda) * torch.max(target_q_value[0],target_q_value[1]) + target_q_value = target_q_value.reshape(batch_size, -1).max(1)[0].reshape(-1, 1) + + q_data0 = v_1step_td_data(q_value[0], target_q_value, reward, done, data['weight']) + loss_dict['critic_loss'], td_error_per_sample0 = v_1step_td_error(q_data0, self._gamma) + q_data1 = v_1step_td_data(q_value[1], target_q_value, reward, done, data['weight']) + loss_dict['twin_critic_loss'], td_error_per_sample1 = v_1step_td_error(q_data1, self._gamma) + td_error_per_sample = (td_error_per_sample0 + 
td_error_per_sample1) / 2 + + self._optimizer_q.zero_grad() + (loss_dict['critic_loss'] + loss_dict['twin_critic_loss']).backward() + self._optimizer_q.step() + + # train_policy + z = torch.randn((obs.shape[0], self.latent_dim)).to(self._device).clamp(-0.5, 0.5) + sample_action = self._model.vae.decode_with_obs(z, obs)['reconstruction_action'] + input = {'obs': obs, 'action': sample_action} + perturbed_action = self._model.forward(input, mode='compute_actor')['action'] + q_input = {'obs': obs, 'action': perturbed_action} + q = self._learn_model.forward(q_input, mode='compute_critic')['q_value'][0] + loss_dict['actor_loss'] = -q.mean() + self._optimizer_policy.zero_grad() + loss_dict['actor_loss'].backward() + self._optimizer_policy.step() + self._forward_learn_cnt += 1 + self._target_model.update(self._learn_model.state_dict()) + return { + 'td_error': td_error_per_sample.detach().mean().item(), + 'target_q_value': target_q_value.detach().mean().item(), + **loss_dict + } + + def _monitor_vars_learn(self) -> List[str]: + return [ + 'td_error', 'target_q_value', 'critic_loss', 'twin_critic_loss', 'actor_loss', 'recons_loss', 'kld_loss', + 'vae_loss' + ] + + def _state_dict_learn(self) -> Dict[str, Any]: + ret = { + 'model': self._learn_model.state_dict(), + 'target_model': self._target_model.state_dict(), + 'optimizer_q': self._optimizer_q.state_dict(), + 'optimizer_policy': self._optimizer_policy.state_dict(), + 'optimizer_vae': self._optimizer_vae.state_dict(), + } + return ret + + def _init_eval(self): + self._eval_model = model_wrap(self._model, wrapper_name='base') + self._eval_model.reset() + + def _forward_eval(self, data: dict) -> Dict[str, Any]: + data_id = list(data.keys()) + data = default_collate(list(data.values())) + if self._cuda: + data = to_device(data, self._device) + data = {'obs': data} + self._eval_model.eval() + with torch.no_grad(): + output = self._eval_model.forward(data, mode='compute_eval') + if self._cuda: + output = to_device(output, 'cpu') + output = default_decollate(output) + return {i: d for i, d in zip(data_id, output)} + + def _init_collect(self) -> None: + self._unroll_len = self._cfg.collect.unroll_len + self._gamma = self._cfg.discount_factor # necessary for parallel + self._nstep = self._cfg.nstep # necessary for parallel + self._collect_model = model_wrap(self._model, wrapper_name='eps_greedy_sample') + self._collect_model.reset() + + def _forward_collect(self, data: dict, **kwargs) -> dict: + pass + + def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple) -> dict: + pass + + def _get_train_sample(self, data: list) -> Union[None, List[Any]]: + r""" + Overview: + Get the trajectory and the n step return data, then sample from the n_step return data + Arguments: + - data (:obj:`list`): The trajectory's cache + Returns: + - samples (:obj:`dict`): The training samples generated + """ + data = get_nstep_return_data(data, self._nstep, gamma=self._gamma) + return get_train_sample(data, self._unroll_len) diff --git a/ding/policy/command_mode_policy_instance.py b/ding/policy/command_mode_policy_instance.py index 36e8ba7185..726536cde9 100644 --- a/ding/policy/command_mode_policy_instance.py +++ b/ding/policy/command_mode_policy_instance.py @@ -47,6 +47,7 @@ from .sac import SQILSACPolicy from .madqn import MADQNPolicy from .bdq import BDQPolicy +from .bcq import BCQPolicy class EpsCommandModePolicy(CommandModePolicy): @@ -381,6 +382,13 @@ class IBCCommandModePolicy(IBCPolicy, DummyCommandModePolicy): pass + 
+@POLICY_REGISTRY.register('bcq_command') +class BCQCommandModelPolicy(BCQPolicy, DummyCommandModePolicy): + pass + + + @POLICY_REGISTRY.register('bc_command') class BCCommandModePolicy(BehaviourCloningPolicy, DummyCommandModePolicy): diff --git a/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py b/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py new file mode 100755 index 0000000000..1817fc91fd --- /dev/null +++ b/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + +main_config = dict( + exp_name="halfcheetah_medium_bcq_seed0", + env=dict( + env_id='halfcheetah-medium-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=7000, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=17, + action_shape=6, + actor_head_hidden_size=[400,400,300], + critic_head_hidden_size=[400,400,300], + phi=0.05, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=100, + learning_rate_q=3e-3, + learning_rate_policy=3e-3, + learning_rate_alpha=3e-3, + lmbda=0.75, + learner=dict(hook=dict(save_ckpt_after_iter=1000000000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 123, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='bcq', + import_names=['ding.policy.bcq'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py b/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py new file mode 100755 index 0000000000..610e3996d3 --- /dev/null +++ b/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + +main_config = dict( + exp_name="halfcheetah_medium_expert_bcq_seed0", + env=dict( + env_id='halfcheetah-medium-expert-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=12000, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=17, + action_shape=6, + actor_head_hidden_size=[400,400,300], + critic_head_hidden_size=[400,400,300], + phi=0.05, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=100, + learning_rate_q=3e-3, + learning_rate_policy=3e-3, + learning_rate_alpha=3e-3, + lmbda=0.75, + learner=dict(hook=dict(save_ckpt_after_iter=1000000000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 123, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='bcq', + import_names=['ding.policy.bcq'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/hopper_medium_bcq_config.py b/dizoo/d4rl/config/hopper_medium_bcq_config.py new file mode 100755 index 0000000000..db70368f08 --- /dev/null +++ b/dizoo/d4rl/config/hopper_medium_bcq_config.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + 
+main_config = dict( + exp_name="hopper_medium_bcq_seed0_43_v0", + env=dict( + env_id='hopper-medium-v0', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=3500, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=11, + action_shape=3, + actor_head_hidden_size=[400,400,300], + critic_head_hidden_size=[400,400,300], + phi=0.05, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=100, + learning_rate_q=3e-3, + learning_rate_policy=3e-3, + learning_rate_alpha=3e-3, + lmbda=0.75, + learner=dict(hook=dict(save_ckpt_after_iter=1000000000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 123, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='bcq', + import_names=['ding.policy.bcq'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py b/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py new file mode 100755 index 0000000000..eec47363dc --- /dev/null +++ b/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + +main_config = dict( + exp_name="hopper_medium_expert_bcq_seed0_43_v0", + env=dict( + env_id='hopper-medium-expert-v0', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=3800, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=11, + action_shape=3, + actor_head_hidden_size=[400,400,300], + critic_head_hidden_size=[400,400,300], + phi=0.05, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=100, + learning_rate_q=3e-3, + learning_rate_policy=3e-3, + learning_rate_alpha=3e-3, + lmbda=0.75, + learner=dict(hook=dict(save_ckpt_after_iter=1000000000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 123, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='bcq', + import_names=['ding.policy.bcq'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/entry/d4rl_bcq_main.py b/dizoo/d4rl/entry/d4rl_bcq_main.py new file mode 100755 index 0000000000..2e50f25100 --- /dev/null +++ b/dizoo/d4rl/entry/d4rl_bcq_main.py @@ -0,0 +1,21 @@ +from ding.entry import serial_pipeline_offline +from ding.config import read_config +from pathlib import Path + + +def train(args): + # launch from anywhere + config = Path(__file__).absolute().parent.parent / 'config' / args.config + config = read_config(str(config)) + config[0].exp_name = config[0].exp_name.replace('0', str(args.seed)) + serial_pipeline_offline(config, seed=args.seed) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('--seed', '-s', type=int, default=0) + parser.add_argument('--config', '-c', type=str, 
default='halfcheetah_medium_bcq_config.py') + args = parser.parse_args() + train(args) \ No newline at end of file From 985def776aece702d7f77b8a8fc026daf8e7dff1 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Mon, 10 Apr 2023 16:01:27 +0800 Subject: [PATCH 2/8] modif policy_init --- ding/policy/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ding/policy/__init__.py b/ding/policy/__init__.py index 65f3f2757e..cac683b3b2 100644 --- a/ding/policy/__init__.py +++ b/ding/policy/__init__.py @@ -18,7 +18,6 @@ from .ppo import PPOPolicy, PPOPGPolicy, PPOOffPolicy from .sac import SACPolicy, SACDiscretePolicy, SQILSACPolicy from .cql import CQLPolicy, CQLDiscretePolicy -from .edac import EDACPolicy from .impala import IMPALAPolicy from .ngu import NGUPolicy from .r2d2 import R2D2Policy From 7b7a99ea4918721c10c75076da44ea04112a43dc Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Wed, 12 Apr 2023 16:19:07 +0800 Subject: [PATCH 3/8] modify bcq --- ding/example/bcq.py | 42 +++ ding/model/template/__init__.py | 1 + ding/model/template/bcq.py | 132 ++++++++ ding/policy/__init__.py | 2 + ding/policy/bcq.py | 290 ++++++++++++++++++ ding/policy/command_mode_policy_instance.py | 8 + .../config/halfcheetah_medium_bcq_config.py | 55 ++++ .../halfcheetah_medium_expert_bcq_config.py | 55 ++++ dizoo/d4rl/config/hopper_medium_bcq_config.py | 55 ++++ .../config/hopper_medium_expert_bcq_config.py | 55 ++++ dizoo/d4rl/entry/d4rl_bcq_main.py | 21 ++ 11 files changed, 716 insertions(+) create mode 100755 ding/example/bcq.py create mode 100755 ding/model/template/bcq.py create mode 100755 ding/policy/bcq.py create mode 100755 dizoo/d4rl/config/halfcheetah_medium_bcq_config.py create mode 100755 dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py create mode 100755 dizoo/d4rl/config/hopper_medium_bcq_config.py create mode 100755 dizoo/d4rl/config/hopper_medium_expert_bcq_config.py create mode 100755 dizoo/d4rl/entry/d4rl_bcq_main.py diff --git a/ding/example/bcq.py b/ding/example/bcq.py new file mode 100755 index 0000000000..4bd1385c3f --- /dev/null +++ b/ding/example/bcq.py @@ -0,0 +1,42 @@ +import gym +from ditk import logging +from ding.model import BCQ +from ding.policy import BCQPolicy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2 +from ding.data import create_dataset +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OfflineRLContext +from ding.framework.middleware import interaction_evaluator, trainer, CkptSaver, offline_data_fetcher, offline_logger +from ding.utils import set_pkg_seed +from dizoo.d4rl.envs import D4RLEnv +from dizoo.d4rl.config.halfcheetah_medium_bcq_config import main_config, create_config + + +def main(): + # If you don't have offline data, you need to prepare if first and set the data_path in config + # For demostration, we also can train a RL policy (e.g. 
SAC) and collect some data + logging.getLogger().setLevel(logging.INFO) + cfg = compile_config(main_config, create_cfg=create_config, auto=True) + ding_init(cfg) + with task.start(async_mode=False, ctx=OfflineRLContext()): + evaluator_env = BaseEnvManagerV2( + env_fn=[lambda: D4RLEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager + ) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + dataset = create_dataset(cfg) + model = BCQ(**cfg.policy.model) + policy = BCQPolicy(cfg.policy, model=model) + + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(offline_data_fetcher(cfg, dataset)) + task.use(trainer(cfg, policy.learn_mode)) + task.use(CkptSaver(policy, cfg.exp_name, train_freq=10000000)) + task.use(offline_logger()) + task.run() + + +if __name__ == "__main__": + main() diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py index e994286ac3..b22ea61d6e 100644 --- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -23,3 +23,4 @@ from .vae import VanillaVAE from .decision_transformer import DecisionTransformer from .procedure_cloning import ProcedureCloningMCTS, ProcedureCloningBFS +from .bcq import BCQ \ No newline at end of file diff --git a/ding/model/template/bcq.py b/ding/model/template/bcq.py new file mode 100755 index 0000000000..8f78b955e3 --- /dev/null +++ b/ding/model/template/bcq.py @@ -0,0 +1,132 @@ +from typing import Union, Dict, Optional, List +from easydict import EasyDict +import numpy as np +import torch +import torch.nn as nn + +from ding.utils import SequenceType, squeeze, MODEL_REGISTRY +from ..common import RegressionHead, ReparameterizationHead +from .vae import VanillaVAE + + +@MODEL_REGISTRY.register('bcq') +class BCQ(nn.Module): + + mode = ['compute_actor', 'compute_critic', 'compute_vae', 'compute_eval'] + + def __init__( + self, + obs_shape: Union[int, SequenceType], + action_shape: Union[int, SequenceType, EasyDict], + actor_head_hidden_size: List = [400, 300], + critic_head_hidden_size: List = [400, 300], + activation: Optional[nn.Module] = nn.ReLU(), + vae_hidden_dims: List = [750, 750], + phi: float = 0.05 + ) -> None: + """ + Overview: + Initialize QMIX neural network, i.e. agent Q network and mixer. + Arguments: + - obs_shape (:obj:`int`): the dimension of observation state + - action_shape (:obj:`int`): the dimension of action shape + - actor_hidden_size (:obj:`list`): the list of hidden size of actor + - critic_hidden_size (:obj:'list'): the list of hidden size of critic + - activation (:obj:`nn.Module`): Activation function in network, defaults to nn.ReLU(). 
+ - vae_hidden_dims (:obj:`list`): the list of hidden size of vae + """ + super(BCQ, self).__init__() + obs_shape: int = squeeze(obs_shape) + action_shape = squeeze(action_shape) + self.action_shape = action_shape + self.input_size = obs_shape + self.phi = phi + + critic_input_size = self.input_size + action_shape + self.critic = nn.ModuleList() + for _ in range(2): + net = [] + d = critic_input_size + for dim in critic_head_hidden_size: + net.append(nn.Linear(d, dim)) + net.append(activation) + d = dim + net.append(nn.Linear(d, 1)) + self.critic.append(nn.Sequential(*net)) + + net = [] + d = critic_input_size + for dim in actor_head_hidden_size: + net.append(nn.Linear(d, dim)) + net.append(activation) + d = dim + net.append(nn.Linear(d, 1)) + self.actor = nn.Sequential(*net) + + self.vae = VanillaVAE(action_shape, obs_shape, action_shape * 2, vae_hidden_dims) + + def forward(self, inputs: Dict[str, torch.Tensor], mode: str) -> Dict[str, torch.Tensor]: + """ + Overview: + The unique execution (forward) method of QAC method, and one can indicate different modes to implement \ + different computation graph, including ``compute_actor`` and ``compute_critic`` in QAC. + Mode compute_actor: + Arguments: + - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. + Returns: + - output (:obj:`Dict`): Output dict data, including action tensor. + Mode compute_critic: + Arguments: + - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. + Returns: + - output (:obj:`Dict`): Output dict data, including q_value tensor. + Mode compute_vae: + Arguments: + - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. + Returns: + - outputs (:obj:`Dict`): Dict containing keywords ``recons_action`` \ + (:obj:`torch.Tensor`), ``prediction_residual`` (:obj:`torch.Tensor`), \ + ``input`` (:obj:`torch.Tensor`), ``mu`` (:obj:`torch.Tensor`), \ + ``log_var`` (:obj:`torch.Tensor`) and ``z`` (:obj:`torch.Tensor`). + Mode compute_eval: + Arguments: + - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. + Returns: + - output (:obj:`Dict`): Output dict data, including action tensor. + + + .. note:: + For specific examples, one can refer to API doc of ``compute_actor`` and ``compute_critic`` respectively. 
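+
+        Examples (illustrative sketch; batch size 32 and the HalfCheetah obs/action dims from the configs in this patch are assumed):
+            >>> import torch
+            >>> from ding.model import BCQ
+            >>> model = BCQ(obs_shape=17, action_shape=6)
+            >>> obs, act = torch.randn(32, 17), torch.randn(32, 6)
+            >>> # twin critic values: a list of two (32,)-shaped Q tensors
+            >>> q_value = model({'obs': obs, 'action': act}, mode='compute_critic')['q_value']
+            >>> # perturbed action, clipped to [-1, 1]
+            >>> perturbed = model({'obs': obs, 'action': act}, mode='compute_actor')['action']
+            >>> # evaluation action: argmax-Q over 100 VAE-sampled candidate actions per observation
+            >>> best = model({'obs': obs}, mode='compute_eval')['action']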
+ """ + assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) + return getattr(self, mode)(inputs) + + def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + obs, action = inputs['obs'], inputs['action'] + if len(action.shape) == 1: # (B, ) -> (B, 1) + action = action.unsqueeze(1) + x = torch.cat([obs, action], dim=-1) + x = [m(x).squeeze() for m in self.critic] + return {'q_value': x} + + def compute_actor(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]: + input = torch.cat([inputs['obs'], inputs['action']], -1) + x = self.actor(input) + action = self.phi * 1 * torch.tanh(x) + action = (action + inputs['action']).clamp(-1, 1) + return {'action': action} + + def compute_vae(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + return self.vae.forward(inputs) + + def compute_eval(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + obs = inputs['obs'] + obs_rep = obs.clone().unsqueeze(0).repeat_interleave(100, dim=0) + z = torch.randn((obs_rep.shape[0], obs_rep.shape[1], self.action_shape * 2)).to(obs.device).clamp(-0.5, 0.5) + sample_action = self.vae.decode_with_obs(z, obs_rep)['reconstruction_action'] + action = self.compute_actor({'obs': obs_rep, 'action': sample_action})['action'] + q = self.compute_critic({'obs': obs_rep, 'action': action})['q_value'][0] + idx = q.argmax(dim=0).unsqueeze(0).unsqueeze(-1) + idx = idx.repeat_interleave(action.shape[-1], dim=-1) + action = action.gather(0, idx).squeeze() + return {'action': action} diff --git a/ding/policy/__init__.py b/ding/policy/__init__.py index 15575c7d30..cac683b3b2 100644 --- a/ding/policy/__init__.py +++ b/ding/policy/__init__.py @@ -48,5 +48,7 @@ from .pc import ProcedureCloningBFSPolicy +from .bcq import BCQPolicy + # new-type policy from .ppof import PPOFPolicy diff --git a/ding/policy/bcq.py b/ding/policy/bcq.py new file mode 100755 index 0000000000..5dd517b6cd --- /dev/null +++ b/ding/policy/bcq.py @@ -0,0 +1,290 @@ +from typing import List, Dict, Any, Tuple, Union +from collections import namedtuple +import copy +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ding.torch_utils import Adam, to_device +from ding.rl_utils import v_1step_td_data, v_1step_td_error, get_train_sample, get_nstep_return_data +from ding.model import model_wrap +from ding.policy import Policy +from ding.utils import POLICY_REGISTRY +from ding.utils.data import default_collate, default_decollate +from .common_utils import default_preprocess_learn + + +@POLICY_REGISTRY.register('bcq') +class BCQPolicy(Policy): + config = dict( + type='bcq', + # (bool) Whether to use cuda for network. + cuda=False, + # (bool type) priority: Determine whether to use priority in buffer sample. + # Default False in SAC. + priority=False, + # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + priority_IS_weight=False, + # (int) Number of training samples(randomly collected) in replay buffer when training starts. + # Default 10000 in SAC. + random_collect_size=10000, + nstep=1, + model=dict( + # (List) Hidden list for actor network head. + actor_head_hidden_size=[400,300], + + + # (List) Hidden list for critic network head. + critic_head_hidden_size=[400,300], + # Max perturbation hyper-parameter for BCQ + phi=0.05, + ), + learn=dict( + + # How many updates(iterations) to train after collector's one collection. 
+ # Bigger "update_per_collect" means bigger off-policy. + # collect data -> update policy-> collect data -> ... + update_per_collect=1, + # (int) Minibatch size for gradient descent. + batch_size=100, + + # (float type) learning_rate_q: Learning rate for soft q network. + # Default to 3e-4. + # Please set to 1e-3, when model.value_network is True. + learning_rate_q=3e-4, + # (float type) learning_rate_policy: Learning rate for policy network. + # Default to 3e-4. + # Please set to 1e-3, when model.value_network is True. + learning_rate_policy=3e-4, + # (float type) learning_rate_vae: Learning rate for vae network. + # `learning_rate_value` should be initialized, when model.vae_network is True. + # Please set to 3e-4, when model.vae_network is True. + learning_rate_vae=3e-4, + # (bool) Whether ignore done(usually for max step termination env. e.g. pendulum) + # Note: Gym wraps the MuJoCo envs by default with TimeLimit environment wrappers. + # These limit HalfCheetah, and several other MuJoCo envs, to max length of 1000. + # However, interaction with HalfCheetah always gets done with done is False, + # Since we inplace done==True with done==False to keep + # TD-error accurate computation(``gamma * (1 - done) * next_v + reward``), + # when the episode step is greater than max episode step. + ignore_done=False, + + # (float type) target_theta: Used for soft update of the target network, + # aka. Interpolation factor in polyak averaging for target networks. + # Default to 0.005. + target_theta=0.005, + # (float) discount factor for the discounted sum of rewards, aka. gamma. + discount_factor=0.99, + lmbda=0.75, + + # (float) Weight uniform initialization range in the last output layer + init_w=3e-3, + ), + collect=dict( + # (int) Cut trajectories into pieces with length "unroll_len". + unroll_len=1, + ), + eval=dict(), + other=dict( + replay_buffer=dict( + # (int type) replay_buffer_size: Max size of replay buffer. + replay_buffer_size=1000000, + # (int type) max_use: Max use times of one data in the buffer. + # Data will be removed once used for too many times. + # Default to infinite. + # max_use=256, + ), + ), + ) + + def default_model(self) -> Tuple[str, List[str]]: + return 'bcq', ['ding.model.template.bcq'] + + def _init_learn(self) -> None: + r""" + Overview: + Learn mode init method. Called by ``self.__init__``. + Init q, value and policy's optimizers, algorithm config, main and target models. 
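+            Three optimizers are created because BCQ trains three separate modules: the twin Q critics,
+            the perturbation actor, and the generative VAE that models the behavior policy of the
+            offline dataset.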
+ """ + # Init + self._priority = self._cfg.priority + self._priority_IS_weight = self._cfg.priority_IS_weight + self.lmbda = self._cfg.learn.lmbda + self.latent_dim = self._cfg.model.action_shape * 2 + + # Optimizers + self._optimizer_q = Adam( + self._model.critic.parameters(), + lr=self._cfg.learn.learning_rate_q, + ) + self._optimizer_policy = Adam( + self._model.actor.parameters(), + lr=self._cfg.learn.learning_rate_policy, + ) + self._optimizer_vae = Adam( + self._model.vae.parameters(), + lr=self._cfg.learn.learning_rate_vae, + ) + + # Algorithm config + self._gamma = self._cfg.learn.discount_factor + + # Main and target models + self._target_model = copy.deepcopy(self._model) + self._target_model = model_wrap( + self._target_model, + wrapper_name='target', + update_type='momentum', + update_kwargs={'theta': self._cfg.learn.target_theta} + ) + self._learn_model = model_wrap(self._model, wrapper_name='base') + self._learn_model.reset() + self._target_model.reset() + + self._forward_learn_cnt = 0 + + def _forward_learn(self, data: dict) -> Dict[str, Any]: + loss_dict = {} + + data = default_preprocess_learn( + data, + use_priority=self._priority, + use_priority_IS_weight=self._cfg.priority_IS_weight, + ignore_done=self._cfg.learn.ignore_done, + use_nstep=False + ) + if len(data.get('action').shape) == 1: + data['action'] = data['action'].reshape(-1, 1) + + if self._cuda: + data = to_device(data, self._device) + + self._learn_model.train() + self._target_model.train() + obs = data['obs'] + next_obs = data['next_obs'] + reward = data['reward'] + done = data['done'] + batch_size = obs.shape[0] + + # train_vae + vae_out = self._model.forward(data, mode='compute_vae') + recon, mean, log_std = vae_out['recons_action'], vae_out['mu'], vae_out['log_var'] + recons_loss = F.mse_loss(recon, data['action']) + kld_loss = torch.mean(-0.5 * torch.sum(1 + log_std - mean ** 2 - log_std.exp(), dim=1), dim=0) + loss_dict['recons_loss'] = recons_loss + loss_dict['kld_loss'] = kld_loss + vae_loss = recons_loss + 0.5 * kld_loss + loss_dict['vae_loss'] = vae_loss + self._optimizer_vae.zero_grad() + vae_loss.backward() + self._optimizer_vae.step() + + # train_critic + q_value = self._learn_model.forward(data, mode='compute_critic')['q_value'] + + with torch.no_grad(): + next_obs_rep = torch.repeat_interleave(next_obs, 10, 0) + z = torch.randn((next_obs_rep.shape[0], self.latent_dim)).to(self._device).clamp(-0.5, 0.5) + vae_action = self._model.vae.decode_with_obs(z, next_obs_rep)['reconstruction_action'] + next_action = self._target_model.forward({ + 'obs': next_obs_rep, + 'action': vae_action + }, mode='compute_actor')['action'] + + next_data = {'obs': next_obs_rep, 'action': next_action} + target_q_value = self._target_model.forward(next_data, mode='compute_critic')['q_value'] + # the value of a policy according to the maximum entropy objective + # find min one as target q value + target_q_value = self.lmbda * torch.min(target_q_value[0],target_q_value[1]) \ + + (1 - self.lmbda) * torch.max(target_q_value[0],target_q_value[1]) + target_q_value = target_q_value.reshape(batch_size, -1).max(1)[0].reshape(-1, 1) + + q_data0 = v_1step_td_data(q_value[0], target_q_value, reward, done, data['weight']) + loss_dict['critic_loss'], td_error_per_sample0 = v_1step_td_error(q_data0, self._gamma) + q_data1 = v_1step_td_data(q_value[1], target_q_value, reward, done, data['weight']) + loss_dict['twin_critic_loss'], td_error_per_sample1 = v_1step_td_error(q_data1, self._gamma) + td_error_per_sample = (td_error_per_sample0 
+ td_error_per_sample1) / 2 + + self._optimizer_q.zero_grad() + (loss_dict['critic_loss'] + loss_dict['twin_critic_loss']).backward() + self._optimizer_q.step() + + # train_policy + z = torch.randn((obs.shape[0], self.latent_dim)).to(self._device).clamp(-0.5, 0.5) + sample_action = self._model.vae.decode_with_obs(z, obs)['reconstruction_action'] + input = {'obs': obs, 'action': sample_action} + perturbed_action = self._model.forward(input, mode='compute_actor')['action'] + q_input = {'obs': obs, 'action': perturbed_action} + q = self._learn_model.forward(q_input, mode='compute_critic')['q_value'][0] + loss_dict['actor_loss'] = -q.mean() + self._optimizer_policy.zero_grad() + loss_dict['actor_loss'].backward() + self._optimizer_policy.step() + self._forward_learn_cnt += 1 + self._target_model.update(self._learn_model.state_dict()) + return { + 'td_error': td_error_per_sample.detach().mean().item(), + 'target_q_value': target_q_value.detach().mean().item(), + **loss_dict + } + + def _monitor_vars_learn(self) -> List[str]: + return [ + 'td_error', 'target_q_value', 'critic_loss', 'twin_critic_loss', 'actor_loss', 'recons_loss', 'kld_loss', + 'vae_loss' + ] + + def _state_dict_learn(self) -> Dict[str, Any]: + ret = { + 'model': self._learn_model.state_dict(), + 'target_model': self._target_model.state_dict(), + 'optimizer_q': self._optimizer_q.state_dict(), + 'optimizer_policy': self._optimizer_policy.state_dict(), + 'optimizer_vae': self._optimizer_vae.state_dict(), + } + return ret + + def _init_eval(self): + self._eval_model = model_wrap(self._model, wrapper_name='base') + self._eval_model.reset() + + def _forward_eval(self, data: dict) -> Dict[str, Any]: + data_id = list(data.keys()) + data = default_collate(list(data.values())) + if self._cuda: + data = to_device(data, self._device) + data = {'obs': data} + self._eval_model.eval() + with torch.no_grad(): + output = self._eval_model.forward(data, mode='compute_eval') + if self._cuda: + output = to_device(output, 'cpu') + output = default_decollate(output) + return {i: d for i, d in zip(data_id, output)} + + def _init_collect(self) -> None: + self._unroll_len = self._cfg.collect.unroll_len + self._gamma = self._cfg.discount_factor # necessary for parallel + self._nstep = self._cfg.nstep # necessary for parallel + self._collect_model = model_wrap(self._model, wrapper_name='eps_greedy_sample') + self._collect_model.reset() + + def _forward_collect(self, data: dict, **kwargs) -> dict: + pass + + def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple) -> dict: + pass + + def _get_train_sample(self, data: list) -> Union[None, List[Any]]: + r""" + Overview: + Get the trajectory and the n step return data, then sample from the n_step return data + Arguments: + - data (:obj:`list`): The trajectory's cache + Returns: + - samples (:obj:`dict`): The training samples generated + """ + data = get_nstep_return_data(data, self._nstep, gamma=self._gamma) + return get_train_sample(data, self._unroll_len) diff --git a/ding/policy/command_mode_policy_instance.py b/ding/policy/command_mode_policy_instance.py index 36e8ba7185..726536cde9 100644 --- a/ding/policy/command_mode_policy_instance.py +++ b/ding/policy/command_mode_policy_instance.py @@ -47,6 +47,7 @@ from .sac import SQILSACPolicy from .madqn import MADQNPolicy from .bdq import BDQPolicy +from .bcq import BCQPolicy class EpsCommandModePolicy(CommandModePolicy): @@ -381,6 +382,13 @@ class IBCCommandModePolicy(IBCPolicy, DummyCommandModePolicy): pass + 
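+# BCQ is an offline RL policy, so it needs no command/exploration scheduling;
+# registering it with DummyCommandModePolicy below provides a no-op command mode.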
+@POLICY_REGISTRY.register('bcq_command') +class BCQCommandModelPolicy(BCQPolicy, DummyCommandModePolicy): + pass + + + @POLICY_REGISTRY.register('bc_command') class BCCommandModePolicy(BehaviourCloningPolicy, DummyCommandModePolicy): diff --git a/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py b/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py new file mode 100755 index 0000000000..cee9fc9f5a --- /dev/null +++ b/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + +main_config = dict( + exp_name="halfcheetah_medium_bcq_seed0", + env=dict( + env_id='halfcheetah-medium-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=7000, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=17, + action_shape=6, + actor_head_hidden_size=[400,300], + critic_head_hidden_size=[400,300], + phi=0.05, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=100, + learning_rate_q=3e-3, + learning_rate_policy=3e-3, + learning_rate_alpha=3e-3, + lmbda=0.75, + learner=dict(hook=dict(save_ckpt_after_iter=1000000000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 123, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='bcq', + import_names=['ding.policy.bcq'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py b/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py new file mode 100755 index 0000000000..219810d1d0 --- /dev/null +++ b/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + +main_config = dict( + exp_name="halfcheetah_medium_expert_bcq_seed0", + env=dict( + env_id='halfcheetah-medium-expert-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=12000, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=17, + action_shape=6, + actor_head_hidden_size=[400,300], + critic_head_hidden_size=[400,300], + phi=0.05, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=100, + learning_rate_q=3e-3, + learning_rate_policy=3e-3, + learning_rate_alpha=3e-3, + lmbda=0.75, + learner=dict(hook=dict(save_ckpt_after_iter=1000000000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 123, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='bcq', + import_names=['ding.policy.bcq'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/hopper_medium_bcq_config.py b/dizoo/d4rl/config/hopper_medium_bcq_config.py new file mode 100755 index 0000000000..df13e46f3b --- /dev/null +++ b/dizoo/d4rl/config/hopper_medium_bcq_config.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + +main_config = 
dict( + exp_name="hopper_medium_bcq_seed0_43_v0", + env=dict( + env_id='hopper-medium-v0', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=3500, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=11, + action_shape=3, + actor_head_hidden_size=[400,300], + critic_head_hidden_size=[400,300], + phi=0.05, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=100, + learning_rate_q=3e-3, + learning_rate_policy=3e-3, + learning_rate_alpha=3e-3, + lmbda=0.75, + learner=dict(hook=dict(save_ckpt_after_iter=1000000000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 123, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='bcq', + import_names=['ding.policy.bcq'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py b/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py new file mode 100755 index 0000000000..0d17f6ef04 --- /dev/null +++ b/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + +main_config = dict( + exp_name="hopper_medium_expert_bcq_seed0_43", + env=dict( + env_id='hopper-medium-expert-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=3800, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=11, + action_shape=3, + actor_head_hidden_size=[400,300], + critic_head_hidden_size=[400,300], + phi=0.05, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=100, + learning_rate_q=3e-3, + learning_rate_policy=3e-3, + learning_rate_alpha=3e-3, + lmbda=0.75, + learner=dict(hook=dict(save_ckpt_after_iter=1000000000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 123, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='bcq', + import_names=['ding.policy.bcq'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/entry/d4rl_bcq_main.py b/dizoo/d4rl/entry/d4rl_bcq_main.py new file mode 100755 index 0000000000..2e50f25100 --- /dev/null +++ b/dizoo/d4rl/entry/d4rl_bcq_main.py @@ -0,0 +1,21 @@ +from ding.entry import serial_pipeline_offline +from ding.config import read_config +from pathlib import Path + + +def train(args): + # launch from anywhere + config = Path(__file__).absolute().parent.parent / 'config' / args.config + config = read_config(str(config)) + config[0].exp_name = config[0].exp_name.replace('0', str(args.seed)) + serial_pipeline_offline(config, seed=args.seed) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('--seed', '-s', type=int, default=0) + parser.add_argument('--config', '-c', type=str, default='halfcheetah_medium_bcq_config.py') + args = 
parser.parse_args() + train(args) \ No newline at end of file From 1b8454af3b5e9e054aa8a75d4a4cb654071b4925 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Thu, 13 Apr 2023 12:09:07 +0800 Subject: [PATCH 4/8] modify default config --- ding/policy/bcq.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/ding/policy/bcq.py b/ding/policy/bcq.py index 071970d778..5dd517b6cd 100755 --- a/ding/policy/bcq.py +++ b/ding/policy/bcq.py @@ -32,19 +32,11 @@ class BCQPolicy(Policy): nstep=1, model=dict( # (List) Hidden list for actor network head. -<<<<<<< HEAD actor_head_hidden_size=[400,300], # (List) Hidden list for critic network head. critic_head_hidden_size=[400,300], -======= - actor_head_hidden_size=[400,400,300], - - - # (List) Hidden list for critic network head. - critic_head_hidden_size=[400,400,300], ->>>>>>> 985def776aece702d7f77b8a8fc026daf8e7dff1 # Max perturbation hyper-parameter for BCQ phi=0.05, ), From d8ac3f3fea17c970b623acdfa81b2babd6d78bc0 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Sun, 23 Apr 2023 21:47:34 +0800 Subject: [PATCH 5/8] format --- ding/model/template/bcq.py | 4 ++-- ding/policy/bcq.py | 5 ++--- ding/policy/command_mode_policy_instance.py | 2 -- dizoo/d4rl/config/halfcheetah_medium_bcq_config.py | 8 ++++---- dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py | 8 ++++---- dizoo/d4rl/config/hopper_medium_bcq_config.py | 8 ++++---- dizoo/d4rl/config/hopper_medium_expert_bcq_config.py | 8 ++++---- dizoo/d4rl/entry/d4rl_bcq_main.py | 4 ++-- 8 files changed, 22 insertions(+), 25 deletions(-) diff --git a/ding/model/template/bcq.py b/ding/model/template/bcq.py index 8f78b955e3..ee76c68697 100755 --- a/ding/model/template/bcq.py +++ b/ding/model/template/bcq.py @@ -68,8 +68,8 @@ def __init__( def forward(self, inputs: Dict[str, torch.Tensor], mode: str) -> Dict[str, torch.Tensor]: """ Overview: - The unique execution (forward) method of QAC method, and one can indicate different modes to implement \ - different computation graph, including ``compute_actor`` and ``compute_critic`` in QAC. + The unique execution (forward) method of BCQ method, and one can indicate different modes to implement \ + different computation graph, including ``compute_actor`` and ``compute_critic`` in BCQ. Mode compute_actor: Arguments: - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. diff --git a/ding/policy/bcq.py b/ding/policy/bcq.py index 5dd517b6cd..1cf7997ff1 100755 --- a/ding/policy/bcq.py +++ b/ding/policy/bcq.py @@ -32,11 +32,10 @@ class BCQPolicy(Policy): nstep=1, model=dict( # (List) Hidden list for actor network head. - actor_head_hidden_size=[400,300], - + actor_head_hidden_size=[400, 300], # (List) Hidden list for critic network head. 
- critic_head_hidden_size=[400,300], + critic_head_hidden_size=[400, 300], # Max perturbation hyper-parameter for BCQ phi=0.05, ), diff --git a/ding/policy/command_mode_policy_instance.py b/ding/policy/command_mode_policy_instance.py index 726536cde9..da732aa9e1 100644 --- a/ding/policy/command_mode_policy_instance.py +++ b/ding/policy/command_mode_policy_instance.py @@ -382,13 +382,11 @@ class IBCCommandModePolicy(IBCPolicy, DummyCommandModePolicy): pass - @POLICY_REGISTRY.register('bcq_command') class BCQCommandModelPolicy(BCQPolicy, DummyCommandModePolicy): pass - @POLICY_REGISTRY.register('bc_command') class BCCommandModePolicy(BehaviourCloningPolicy, DummyCommandModePolicy): diff --git a/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py b/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py index cee9fc9f5a..c0199dcb09 100755 --- a/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py @@ -15,8 +15,8 @@ model=dict( obs_shape=17, action_shape=6, - actor_head_hidden_size=[400,300], - critic_head_hidden_size=[400,300], + actor_head_hidden_size=[400, 300], + critic_head_hidden_size=[400, 300], phi=0.05, ), learn=dict( @@ -33,7 +33,7 @@ eval=dict(evaluator=dict(eval_freq=500, )), other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), ), - seed = 123, + seed=123, ) main_config = EasyDict(main_config) @@ -52,4 +52,4 @@ replay_buffer=dict(type='naive', ), ) create_config = EasyDict(create_config) -create_config = create_config \ No newline at end of file +create_config = create_config diff --git a/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py b/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py index 219810d1d0..6c3ac39c18 100755 --- a/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py @@ -15,8 +15,8 @@ model=dict( obs_shape=17, action_shape=6, - actor_head_hidden_size=[400,300], - critic_head_hidden_size=[400,300], + actor_head_hidden_size=[400, 300], + critic_head_hidden_size=[400, 300], phi=0.05, ), learn=dict( @@ -33,7 +33,7 @@ eval=dict(evaluator=dict(eval_freq=500, )), other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), ), - seed = 123, + seed=123, ) main_config = EasyDict(main_config) @@ -52,4 +52,4 @@ replay_buffer=dict(type='naive', ), ) create_config = EasyDict(create_config) -create_config = create_config \ No newline at end of file +create_config = create_config diff --git a/dizoo/d4rl/config/hopper_medium_bcq_config.py b/dizoo/d4rl/config/hopper_medium_bcq_config.py index df13e46f3b..06282d1680 100755 --- a/dizoo/d4rl/config/hopper_medium_bcq_config.py +++ b/dizoo/d4rl/config/hopper_medium_bcq_config.py @@ -15,8 +15,8 @@ model=dict( obs_shape=11, action_shape=3, - actor_head_hidden_size=[400,300], - critic_head_hidden_size=[400,300], + actor_head_hidden_size=[400, 300], + critic_head_hidden_size=[400, 300], phi=0.05, ), learn=dict( @@ -33,7 +33,7 @@ eval=dict(evaluator=dict(eval_freq=500, )), other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), ), - seed = 123, + seed=123, ) main_config = EasyDict(main_config) @@ -52,4 +52,4 @@ replay_buffer=dict(type='naive', ), ) create_config = EasyDict(create_config) -create_config = create_config \ No newline at end of file +create_config = create_config diff --git a/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py b/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py index a230a9704b..ac48ee4847 100755 --- a/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py +++ 
b/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py @@ -15,8 +15,8 @@ model=dict( obs_shape=11, action_shape=3, - actor_head_hidden_size=[400,300], - critic_head_hidden_size=[400,300], + actor_head_hidden_size=[400, 300], + critic_head_hidden_size=[400, 300], phi=0.05, ), learn=dict( @@ -33,7 +33,7 @@ eval=dict(evaluator=dict(eval_freq=500, )), other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), ), - seed = 123, + seed=123, ) main_config = EasyDict(main_config) @@ -52,4 +52,4 @@ replay_buffer=dict(type='naive', ), ) create_config = EasyDict(create_config) -create_config = create_config \ No newline at end of file +create_config = create_config diff --git a/dizoo/d4rl/entry/d4rl_bcq_main.py b/dizoo/d4rl/entry/d4rl_bcq_main.py index 2e50f25100..099f6e025b 100755 --- a/dizoo/d4rl/entry/d4rl_bcq_main.py +++ b/dizoo/d4rl/entry/d4rl_bcq_main.py @@ -5,7 +5,7 @@ def train(args): # launch from anywhere - config = Path(__file__).absolute().parent.parent / 'config' / args.config + config = Path(__file__).absolute().parent.parent / 'config' / args.config config = read_config(str(config)) config[0].exp_name = config[0].exp_name.replace('0', str(args.seed)) serial_pipeline_offline(config, seed=args.seed) @@ -18,4 +18,4 @@ def train(args): parser.add_argument('--seed', '-s', type=int, default=0) parser.add_argument('--config', '-c', type=str, default='halfcheetah_medium_bcq_config.py') args = parser.parse_args() - train(args) \ No newline at end of file + train(args) From 9c08f7cc5b729dc57ee4ad3eaffff77adbf42923 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Sun, 23 Apr 2023 22:23:58 +0800 Subject: [PATCH 6/8] format --- ding/model/template/__init__.py | 2 +- ding/model/template/bcq.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py index b22ea61d6e..694dcd732c 100644 --- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -23,4 +23,4 @@ from .vae import VanillaVAE from .decision_transformer import DecisionTransformer from .procedure_cloning import ProcedureCloningMCTS, ProcedureCloningBFS -from .bcq import BCQ \ No newline at end of file +from .bcq import BCQ diff --git a/ding/model/template/bcq.py b/ding/model/template/bcq.py index ee76c68697..7b8d013e9e 100755 --- a/ding/model/template/bcq.py +++ b/ding/model/template/bcq.py @@ -26,7 +26,7 @@ def __init__( ) -> None: """ Overview: - Initialize QMIX neural network, i.e. agent Q network and mixer. + Initialize neural network, i.e. agent Q network and actor. 
Arguments: - obs_shape (:obj:`int`): the dimension of observation state - action_shape (:obj:`int`): the dimension of action shape From d594000ea716569914162d64c40cb13f0d3b8e07 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Mon, 24 Apr 2023 09:22:33 +0800 Subject: [PATCH 7/8] format --- ding/policy/bcq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/policy/bcq.py b/ding/policy/bcq.py index 1cf7997ff1..9a8388b00f 100755 --- a/ding/policy/bcq.py +++ b/ding/policy/bcq.py @@ -196,8 +196,8 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: target_q_value = self._target_model.forward(next_data, mode='compute_critic')['q_value'] # the value of a policy according to the maximum entropy objective # find min one as target q value - target_q_value = self.lmbda * torch.min(target_q_value[0],target_q_value[1]) \ - + (1 - self.lmbda) * torch.max(target_q_value[0],target_q_value[1]) + target_q_value = self.lmbda * torch.min(target_q_value[0], target_q_value[1]) \ + + (1 - self.lmbda) * torch.max(target_q_value[0], target_q_value[1]) target_q_value = target_q_value.reshape(batch_size, -1).max(1)[0].reshape(-1, 1) q_data0 = v_1step_td_data(q_value[0], target_q_value, reward, done, data['weight']) From 4d4c9977f5643181f9d89c8ca53bc746eba5b13b Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Tue, 30 May 2023 11:34:50 +0800 Subject: [PATCH 8/8] modify format --- ding/policy/command_mode_policy_instance.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ding/policy/command_mode_policy_instance.py b/ding/policy/command_mode_policy_instance.py index c131b596a6..8b6123c063 100755 --- a/ding/policy/command_mode_policy_instance.py +++ b/ding/policy/command_mode_policy_instance.py @@ -385,6 +385,9 @@ class IBCCommandModePolicy(IBCPolicy, DummyCommandModePolicy): @POLICY_REGISTRY.register('bcq_command') class BCQCommandModelPolicy(BCQPolicy, DummyCommandModePolicy): + pass + + @POLICY_REGISTRY.register('edac_command') class EDACCommandModelPolicy(EDACPolicy, DummyCommandModePolicy): pass
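
Editor's note on the PATCH 7/8 hunk above: it only reformats an existing expression, but the expression is the core of BCQ's critic target, so it is worth spelling out. BCQ mixes the two target critics with a lambda-weighted "soft" clipped double Q, then keeps the best of the candidate next actions that were sampled from the VAE and perturbed by the actor. Below is a minimal, self-contained sketch of that step; the helper name bcq_target, the candidate count, and the tensor shapes are illustrative assumptions, not the exact code in ding/policy/bcq.py.

    import torch

    def bcq_target(q1: torch.Tensor, q2: torch.Tensor, lmbda: float = 0.75) -> torch.Tensor:
        # Soft clipped double Q: weight the element-wise minimum of the two target
        # critics by lmbda and the maximum by (1 - lmbda), curbing overestimation
        # while retaining some optimism.
        return lmbda * torch.min(q1, q2) + (1 - lmbda) * torch.max(q1, q2)

    # Hypothetical shapes: each state gets n_candidates perturbed actions from the
    # VAE + actor, so each critic returns (batch_size * n_candidates, 1) values.
    batch_size, n_candidates = 4, 10
    q1 = torch.randn(batch_size * n_candidates, 1)
    q2 = torch.randn(batch_size * n_candidates, 1)

    q = bcq_target(q1, q2, lmbda=0.75)
    # Keep the best candidate action per state, mirroring
    # target_q_value.reshape(batch_size, -1).max(1)[0].reshape(-1, 1) in the hunk.
    target_q = q.reshape(batch_size, -1).max(1)[0].reshape(-1, 1)
    print(target_q.shape)  # torch.Size([4, 1])

The default lmbda=0.75 matches the lmbda entry in the d4rl configs added earlier in this series; setting lmbda=1.0 reduces the mix to the plain clipped double-Q minimum.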
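
For reference, a launch sketch built from the entry script added in this series (dizoo/d4rl/entry/d4rl_bcq_main.py). The config path and seed are illustrative, and it assumes the corresponding D4RL dataset is available locally; neither assumption is part of the patch itself.

    from ding.entry import serial_pipeline_offline
    from ding.config import read_config

    # read_config returns the (main_config, create_config) pair that
    # serial_pipeline_offline consumes, just as train() does above.
    cfg = read_config('dizoo/d4rl/config/hopper_medium_bcq_config.py')
    serial_pipeline_offline(cfg, seed=0)

One caveat when reusing train() directly: exp_name.replace('0', str(args.seed)) substitutes every '0' in names such as hopper_medium_bcq_seed0_43_v0, not only the seed digit, so a more targeted substitution (e.g. on the 'seed0' substring) may be preferable.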