From 9f9fb56d87b58e05f679c06ac970a00171f340e4 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Mon, 10 Apr 2023 15:17:14 +0800 Subject: [PATCH 1/8] add bcq --- ding/example/bcq.py | 45 +++ ding/model/template/__init__.py | 1 + ding/model/template/bcq.py | 108 +++++++ ding/policy/__init__.py | 3 + ding/policy/bcq.py | 290 ++++++++++++++++++ ding/policy/command_mode_policy_instance.py | 8 + .../config/halfcheetah_medium_bcq_config.py | 55 ++++ .../halfcheetah_medium_expert_bcq_config.py | 55 ++++ dizoo/d4rl/config/hopper_medium_bcq_config.py | 55 ++++ .../config/hopper_medium_expert_bcq_config.py | 55 ++++ dizoo/d4rl/entry/d4rl_bcq_main.py | 21 ++ 11 files changed, 696 insertions(+) create mode 100755 ding/example/bcq.py create mode 100755 ding/model/template/bcq.py create mode 100755 ding/policy/bcq.py create mode 100755 dizoo/d4rl/config/halfcheetah_medium_bcq_config.py create mode 100755 dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py create mode 100755 dizoo/d4rl/config/hopper_medium_bcq_config.py create mode 100755 dizoo/d4rl/config/hopper_medium_expert_bcq_config.py create mode 100755 dizoo/d4rl/entry/d4rl_bcq_main.py diff --git a/ding/example/bcq.py b/ding/example/bcq.py new file mode 100755 index 0000000000..0744388e3a --- /dev/null +++ b/ding/example/bcq.py @@ -0,0 +1,45 @@ +import gym +from ditk import logging +from ding.model import BCQ +from ding.policy import BCQPolicy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2 +from ding.data import create_dataset +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OfflineRLContext +from ding.framework.middleware import interaction_evaluator, trainer, CkptSaver, offline_data_fetcher, offline_logger +from ding.utils import set_pkg_seed +from dizoo.d4rl.envs import D4RLEnv +from dizoo.d4rl.config.halfcheetah_medium_bcq_config import main_config, create_config +# from dizoo.d4rl.config.halfcheetah_medium_expert_edac_config import main_config,create_config +# from dizoo.d4rl.config.hopper_medium_expert_edac_config import main_config,create_config +# from dizoo.d4rl.config.hopper_medium_edac_config import main_config,create_config + + +def main(): + # If you don't have offline data, you need to prepare if first and set the data_path in config + # For demostration, we also can train a RL policy (e.g. 
SAC) and collect some data + logging.getLogger().setLevel(logging.INFO) + cfg = compile_config(main_config, create_cfg=create_config, auto=True) + ding_init(cfg) + with task.start(async_mode=False, ctx=OfflineRLContext()): + evaluator_env = BaseEnvManagerV2( + env_fn=[lambda: D4RLEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager + ) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + dataset = create_dataset(cfg) + model = BCQ(**cfg.policy.model) + policy = BCQPolicy(cfg.policy, model=model) + + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(offline_data_fetcher(cfg, dataset)) + task.use(trainer(cfg, policy.learn_mode)) + task.use(CkptSaver(policy, cfg.exp_name, train_freq=10000000)) + task.use(offline_logger()) + task.run() + + +if __name__ == "__main__": + main() diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py index e994286ac3..b22ea61d6e 100644 --- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -23,3 +23,4 @@ from .vae import VanillaVAE from .decision_transformer import DecisionTransformer from .procedure_cloning import ProcedureCloningMCTS, ProcedureCloningBFS +from .bcq import BCQ \ No newline at end of file diff --git a/ding/model/template/bcq.py b/ding/model/template/bcq.py new file mode 100755 index 0000000000..58db5a3fbd --- /dev/null +++ b/ding/model/template/bcq.py @@ -0,0 +1,108 @@ +from typing import Union, Dict, Optional, List +from easydict import EasyDict +import numpy as np +import torch +import torch.nn as nn + +from ding.utils import SequenceType, squeeze, MODEL_REGISTRY +from ..common import RegressionHead, ReparameterizationHead +from .vae import VanillaVAE + + +@MODEL_REGISTRY.register('bcq') +class BCQ(nn.Module): + + mode = ['compute_actor', 'compute_critic', 'compute_vae', 'compute_eval'] + + def __init__( + self, + obs_shape: Union[int, SequenceType], + action_shape: Union[int, SequenceType, EasyDict], + actor_head_hidden_size: int = 64, + critic_head_hidden_size: int = 64, + activation: Optional[nn.Module] = nn.ReLU(), + norm_type: Optional[str] = None, + vae_hidden_dims: List = [750, 750], + phi: float = 0.05 + ) -> None: + super(BCQ, self).__init__() + obs_shape: int = squeeze(obs_shape) + action_shape = squeeze(action_shape) + self.action_shape = action_shape + self.input_size = obs_shape + self.phi = phi + + critic_input_size = self.input_size + action_shape + self.critic = nn.ModuleList() + for _ in range(2): + net = [] + d = critic_input_size + for dim in critic_head_hidden_size: + net.append(nn.Linear(d, dim)) + net.append(activation) + d = dim + net.append(nn.Linear(d, 1)) + self.critic.append(nn.Sequential(*net)) + + net = [] + d = critic_input_size + for dim in actor_head_hidden_size: + net.append(nn.Linear(d, dim)) + net.append(activation) + d = dim + net.append(nn.Linear(d, 1)) + self.actor = nn.Sequential(*net) + + self.vae = VanillaVAE(action_shape, obs_shape, action_shape * 2, vae_hidden_dims) + + def forward(self, inputs: Union[torch.Tensor, Dict[str, torch.Tensor]], mode: str) -> Dict[str, torch.Tensor]: + """ + Overview: + The unique execution (forward) method of QAC method, and one can indicate different modes to implement \ + different computation graph, including ``compute_actor`` and ``compute_critic`` in QAC. + Mode compute_actor: + Arguments: + - inputs (:obj:`torch.Tensor`): Observation data, defaults to tensor. 
+ Returns: + - output (:obj:`Dict`): Output dict data, including differnet key-values among distinct action_space. + Mode compute_critic: + Arguments: + - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. + Returns: + - output (:obj:`Dict`): Output dict data, including q_value tensor. + + .. note:: + For specific examples, one can refer to API doc of ``compute_actor`` and ``compute_critic`` respectively. + """ + assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) + return getattr(self, mode)(inputs) + + def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + obs, action = inputs['obs'], inputs['action'] + if len(action.shape) == 1: # (B, ) -> (B, 1) + action = action.unsqueeze(1) + x = torch.cat([obs, action], dim=-1) + x = [m(x).squeeze() for m in self.critic] + return {'q_value': x} + + def compute_actor(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]: + input = torch.cat([inputs['obs'], inputs['action']], -1) + x = self.actor(input) + action = self.phi * 1 * torch.tanh(x) + action = (action + inputs['action']).clamp(-1, 1) + return {'action': action} + + def compute_vae(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + return self.vae.forward(inputs) + + def compute_eval(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + obs = inputs['obs'] + obs_rep = obs.clone().unsqueeze(0).repeat_interleave(100, dim=0) + z = torch.randn((obs_rep.shape[0], obs_rep.shape[1], self.action_shape * 2)).to(obs.device).clamp(-0.5, 0.5) + sample_action = self.vae.decode_with_obs(z, obs_rep)['reconstruction_action'] + action = self.compute_actor({'obs': obs_rep, 'action': sample_action})['action'] + q = self.compute_critic({'obs': obs_rep, 'action': action})['q_value'][0] + idx = q.argmax(dim=0).unsqueeze(0).unsqueeze(-1) + idx = idx.repeat_interleave(action.shape[-1], dim=-1) + action = action.gather(0, idx).squeeze() + return {'action': action} diff --git a/ding/policy/__init__.py b/ding/policy/__init__.py index 15575c7d30..65f3f2757e 100644 --- a/ding/policy/__init__.py +++ b/ding/policy/__init__.py @@ -18,6 +18,7 @@ from .ppo import PPOPolicy, PPOPGPolicy, PPOOffPolicy from .sac import SACPolicy, SACDiscretePolicy, SQILSACPolicy from .cql import CQLPolicy, CQLDiscretePolicy +from .edac import EDACPolicy from .impala import IMPALAPolicy from .ngu import NGUPolicy from .r2d2 import R2D2Policy @@ -48,5 +49,7 @@ from .pc import ProcedureCloningBFSPolicy +from .bcq import BCQPolicy + # new-type policy from .ppof import PPOFPolicy diff --git a/ding/policy/bcq.py b/ding/policy/bcq.py new file mode 100755 index 0000000000..f0144566b4 --- /dev/null +++ b/ding/policy/bcq.py @@ -0,0 +1,290 @@ +from typing import List, Dict, Any, Tuple, Union +from collections import namedtuple +import copy +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ding.torch_utils import Adam, to_device +from ding.rl_utils import v_1step_td_data, v_1step_td_error, get_train_sample, get_nstep_return_data +from ding.model import model_wrap +from ding.policy import Policy +from ding.utils import POLICY_REGISTRY +from ding.utils.data import default_collate, default_decollate +from .common_utils import default_preprocess_learn + + +@POLICY_REGISTRY.register('bcq') +class BCQPolicy(Policy): + config = dict( + type='bcq', + # (bool) Whether to use cuda for network. 
+ cuda=False, + # (bool type) priority: Determine whether to use priority in buffer sample. + # Default False in SAC. + priority=False, + # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + priority_IS_weight=False, + # (int) Number of training samples(randomly collected) in replay buffer when training starts. + # Default 10000 in SAC. + random_collect_size=10000, + nstep=1, + model=dict( + # (List) Hidden list for actor network head. + actor_head_hidden_size=[400,400,300], + + + # (List) Hidden list for critic network head. + critic_head_hidden_size=[400,400,300], + # Max perturbation hyper-parameter for BCQ + phi=0.05, + ), + learn=dict( + + # How many updates(iterations) to train after collector's one collection. + # Bigger "update_per_collect" means bigger off-policy. + # collect data -> update policy-> collect data -> ... + update_per_collect=1, + # (int) Minibatch size for gradient descent. + batch_size=100, + + # (float type) learning_rate_q: Learning rate for soft q network. + # Default to 3e-4. + # Please set to 1e-3, when model.value_network is True. + learning_rate_q=3e-4, + # (float type) learning_rate_policy: Learning rate for policy network. + # Default to 3e-4. + # Please set to 1e-3, when model.value_network is True. + learning_rate_policy=3e-4, + # (float type) learning_rate_vae: Learning rate for vae network. + # `learning_rate_value` should be initialized, when model.vae_network is True. + # Please set to 3e-4, when model.vae_network is True. + learning_rate_vae=3e-4, + # (bool) Whether ignore done(usually for max step termination env. e.g. pendulum) + # Note: Gym wraps the MuJoCo envs by default with TimeLimit environment wrappers. + # These limit HalfCheetah, and several other MuJoCo envs, to max length of 1000. + # However, interaction with HalfCheetah always gets done with done is False, + # Since we inplace done==True with done==False to keep + # TD-error accurate computation(``gamma * (1 - done) * next_v + reward``), + # when the episode step is greater than max episode step. + ignore_done=False, + + # (float type) target_theta: Used for soft update of the target network, + # aka. Interpolation factor in polyak averaging for target networks. + # Default to 0.005. + target_theta=0.005, + # (float) discount factor for the discounted sum of rewards, aka. gamma. + discount_factor=0.99, + lmbda=0.75, + + # (float) Weight uniform initialization range in the last output layer + init_w=3e-3, + ), + collect=dict( + # (int) Cut trajectories into pieces with length "unroll_len". + unroll_len=1, + ), + eval=dict(), + other=dict( + replay_buffer=dict( + # (int type) replay_buffer_size: Max size of replay buffer. + replay_buffer_size=1000000, + # (int type) max_use: Max use times of one data in the buffer. + # Data will be removed once used for too many times. + # Default to infinite. + # max_use=256, + ), + ), + ) + + def default_model(self) -> Tuple[str, List[str]]: + return 'bcq', ['ding.model.template.bcq'] + + def _init_learn(self) -> None: + r""" + Overview: + Learn mode init method. Called by ``self.__init__``. + Init q, value and policy's optimizers, algorithm config, main and target models. 
+ """ + # Init + self._priority = self._cfg.priority + self._priority_IS_weight = self._cfg.priority_IS_weight + self.lmbda = self._cfg.learn.lmbda + self.latent_dim = self._cfg.model.action_shape * 2 + + # Optimizers + self._optimizer_q = Adam( + self._model.critic.parameters(), + lr=self._cfg.learn.learning_rate_q, + ) + self._optimizer_policy = Adam( + self._model.actor.parameters(), + lr=self._cfg.learn.learning_rate_policy, + ) + self._optimizer_vae = Adam( + self._model.vae.parameters(), + lr=self._cfg.learn.learning_rate_vae, + ) + + # Algorithm config + self._gamma = self._cfg.learn.discount_factor + + # Main and target models + self._target_model = copy.deepcopy(self._model) + self._target_model = model_wrap( + self._target_model, + wrapper_name='target', + update_type='momentum', + update_kwargs={'theta': self._cfg.learn.target_theta} + ) + self._learn_model = model_wrap(self._model, wrapper_name='base') + self._learn_model.reset() + self._target_model.reset() + + self._forward_learn_cnt = 0 + + def _forward_learn(self, data: dict) -> Dict[str, Any]: + loss_dict = {} + + data = default_preprocess_learn( + data, + use_priority=self._priority, + use_priority_IS_weight=self._cfg.priority_IS_weight, + ignore_done=self._cfg.learn.ignore_done, + use_nstep=False + ) + if len(data.get('action').shape) == 1: + data['action'] = data['action'].reshape(-1, 1) + + if self._cuda: + data = to_device(data, self._device) + + self._learn_model.train() + self._target_model.train() + obs = data['obs'] + next_obs = data['next_obs'] + reward = data['reward'] + done = data['done'] + batch_size = obs.shape[0] + + # train_vae + vae_out = self._model.forward(data, mode='compute_vae') + recon, mean, log_std = vae_out['recons_action'], vae_out['mu'], vae_out['log_var'] + recons_loss = F.mse_loss(recon, data['action']) + kld_loss = torch.mean(-0.5 * torch.sum(1 + log_std - mean ** 2 - log_std.exp(), dim=1), dim=0) + loss_dict['recons_loss'] = recons_loss + loss_dict['kld_loss'] = kld_loss + vae_loss = recons_loss + 0.5 * kld_loss + loss_dict['vae_loss'] = vae_loss + self._optimizer_vae.zero_grad() + vae_loss.backward() + self._optimizer_vae.step() + + # train_critic + q_value = self._learn_model.forward(data, mode='compute_critic')['q_value'] + + with torch.no_grad(): + next_obs_rep = torch.repeat_interleave(next_obs, 10, 0) + z = torch.randn((next_obs_rep.shape[0], self.latent_dim)).to(self._device).clamp(-0.5, 0.5) + vae_action = self._model.vae.decode_with_obs(z, next_obs_rep)['reconstruction_action'] + next_action = self._model.forward({ + 'obs': next_obs_rep, + 'action': vae_action + }, mode='compute_actor')['action'] + + next_data = {'obs': next_obs_rep, 'action': next_action} + target_q_value = self._target_model.forward(next_data, mode='compute_critic')['q_value'] + # the value of a policy according to the maximum entropy objective + # find min one as target q value + target_q_value = self.lmbda * torch.min(target_q_value[0],target_q_value[1]) \ + + (1 - self.lmbda) * torch.max(target_q_value[0],target_q_value[1]) + target_q_value = target_q_value.reshape(batch_size, -1).max(1)[0].reshape(-1, 1) + + q_data0 = v_1step_td_data(q_value[0], target_q_value, reward, done, data['weight']) + loss_dict['critic_loss'], td_error_per_sample0 = v_1step_td_error(q_data0, self._gamma) + q_data1 = v_1step_td_data(q_value[1], target_q_value, reward, done, data['weight']) + loss_dict['twin_critic_loss'], td_error_per_sample1 = v_1step_td_error(q_data1, self._gamma) + td_error_per_sample = (td_error_per_sample0 + 
td_error_per_sample1) / 2 + + self._optimizer_q.zero_grad() + (loss_dict['critic_loss'] + loss_dict['twin_critic_loss']).backward() + self._optimizer_q.step() + + # train_policy + z = torch.randn((obs.shape[0], self.latent_dim)).to(self._device).clamp(-0.5, 0.5) + sample_action = self._model.vae.decode_with_obs(z, obs)['reconstruction_action'] + input = {'obs': obs, 'action': sample_action} + perturbed_action = self._model.forward(input, mode='compute_actor')['action'] + q_input = {'obs': obs, 'action': perturbed_action} + q = self._learn_model.forward(q_input, mode='compute_critic')['q_value'][0] + loss_dict['actor_loss'] = -q.mean() + self._optimizer_policy.zero_grad() + loss_dict['actor_loss'].backward() + self._optimizer_policy.step() + self._forward_learn_cnt += 1 + self._target_model.update(self._learn_model.state_dict()) + return { + 'td_error': td_error_per_sample.detach().mean().item(), + 'target_q_value': target_q_value.detach().mean().item(), + **loss_dict + } + + def _monitor_vars_learn(self) -> List[str]: + return [ + 'td_error', 'target_q_value', 'critic_loss', 'twin_critic_loss', 'actor_loss', 'recons_loss', 'kld_loss', + 'vae_loss' + ] + + def _state_dict_learn(self) -> Dict[str, Any]: + ret = { + 'model': self._learn_model.state_dict(), + 'target_model': self._target_model.state_dict(), + 'optimizer_q': self._optimizer_q.state_dict(), + 'optimizer_policy': self._optimizer_policy.state_dict(), + 'optimizer_vae': self._optimizer_vae.state_dict(), + } + return ret + + def _init_eval(self): + self._eval_model = model_wrap(self._model, wrapper_name='base') + self._eval_model.reset() + + def _forward_eval(self, data: dict) -> Dict[str, Any]: + data_id = list(data.keys()) + data = default_collate(list(data.values())) + if self._cuda: + data = to_device(data, self._device) + data = {'obs': data} + self._eval_model.eval() + with torch.no_grad(): + output = self._eval_model.forward(data, mode='compute_eval') + if self._cuda: + output = to_device(output, 'cpu') + output = default_decollate(output) + return {i: d for i, d in zip(data_id, output)} + + def _init_collect(self) -> None: + self._unroll_len = self._cfg.collect.unroll_len + self._gamma = self._cfg.discount_factor # necessary for parallel + self._nstep = self._cfg.nstep # necessary for parallel + self._collect_model = model_wrap(self._model, wrapper_name='eps_greedy_sample') + self._collect_model.reset() + + def _forward_collect(self, data: dict, **kwargs) -> dict: + pass + + def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple) -> dict: + pass + + def _get_train_sample(self, data: list) -> Union[None, List[Any]]: + r""" + Overview: + Get the trajectory and the n step return data, then sample from the n_step return data + Arguments: + - data (:obj:`list`): The trajectory's cache + Returns: + - samples (:obj:`dict`): The training samples generated + """ + data = get_nstep_return_data(data, self._nstep, gamma=self._gamma) + return get_train_sample(data, self._unroll_len) diff --git a/ding/policy/command_mode_policy_instance.py b/ding/policy/command_mode_policy_instance.py index 36e8ba7185..726536cde9 100644 --- a/ding/policy/command_mode_policy_instance.py +++ b/ding/policy/command_mode_policy_instance.py @@ -47,6 +47,7 @@ from .sac import SQILSACPolicy from .madqn import MADQNPolicy from .bdq import BDQPolicy +from .bcq import BCQPolicy class EpsCommandModePolicy(CommandModePolicy): @@ -381,6 +382,13 @@ class IBCCommandModePolicy(IBCPolicy, DummyCommandModePolicy): pass + 
+@POLICY_REGISTRY.register('bcq_command') +class BCQCommandModelPolicy(BCQPolicy, DummyCommandModePolicy): + pass + + + @POLICY_REGISTRY.register('bc_command') class BCCommandModePolicy(BehaviourCloningPolicy, DummyCommandModePolicy): diff --git a/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py b/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py new file mode 100755 index 0000000000..1817fc91fd --- /dev/null +++ b/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + +main_config = dict( + exp_name="halfcheetah_medium_bcq_seed0", + env=dict( + env_id='halfcheetah-medium-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=7000, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=17, + action_shape=6, + actor_head_hidden_size=[400,400,300], + critic_head_hidden_size=[400,400,300], + phi=0.05, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=100, + learning_rate_q=3e-3, + learning_rate_policy=3e-3, + learning_rate_alpha=3e-3, + lmbda=0.75, + learner=dict(hook=dict(save_ckpt_after_iter=1000000000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 123, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='bcq', + import_names=['ding.policy.bcq'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py b/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py new file mode 100755 index 0000000000..610e3996d3 --- /dev/null +++ b/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + +main_config = dict( + exp_name="halfcheetah_medium_expert_bcq_seed0", + env=dict( + env_id='halfcheetah-medium-expert-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=12000, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=17, + action_shape=6, + actor_head_hidden_size=[400,400,300], + critic_head_hidden_size=[400,400,300], + phi=0.05, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=100, + learning_rate_q=3e-3, + learning_rate_policy=3e-3, + learning_rate_alpha=3e-3, + lmbda=0.75, + learner=dict(hook=dict(save_ckpt_after_iter=1000000000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 123, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='bcq', + import_names=['ding.policy.bcq'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/hopper_medium_bcq_config.py b/dizoo/d4rl/config/hopper_medium_bcq_config.py new file mode 100755 index 0000000000..db70368f08 --- /dev/null +++ b/dizoo/d4rl/config/hopper_medium_bcq_config.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + 
+main_config = dict( + exp_name="hopper_medium_bcq_seed0_43_v0", + env=dict( + env_id='hopper-medium-v0', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=3500, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=11, + action_shape=3, + actor_head_hidden_size=[400,400,300], + critic_head_hidden_size=[400,400,300], + phi=0.05, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=100, + learning_rate_q=3e-3, + learning_rate_policy=3e-3, + learning_rate_alpha=3e-3, + lmbda=0.75, + learner=dict(hook=dict(save_ckpt_after_iter=1000000000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 123, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='bcq', + import_names=['ding.policy.bcq'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py b/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py new file mode 100755 index 0000000000..eec47363dc --- /dev/null +++ b/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + +main_config = dict( + exp_name="hopper_medium_expert_bcq_seed0_43_v0", + env=dict( + env_id='hopper-medium-expert-v0', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=3800, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=11, + action_shape=3, + actor_head_hidden_size=[400,400,300], + critic_head_hidden_size=[400,400,300], + phi=0.05, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=100, + learning_rate_q=3e-3, + learning_rate_policy=3e-3, + learning_rate_alpha=3e-3, + lmbda=0.75, + learner=dict(hook=dict(save_ckpt_after_iter=1000000000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 123, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='bcq', + import_names=['ding.policy.bcq'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/entry/d4rl_bcq_main.py b/dizoo/d4rl/entry/d4rl_bcq_main.py new file mode 100755 index 0000000000..2e50f25100 --- /dev/null +++ b/dizoo/d4rl/entry/d4rl_bcq_main.py @@ -0,0 +1,21 @@ +from ding.entry import serial_pipeline_offline +from ding.config import read_config +from pathlib import Path + + +def train(args): + # launch from anywhere + config = Path(__file__).absolute().parent.parent / 'config' / args.config + config = read_config(str(config)) + config[0].exp_name = config[0].exp_name.replace('0', str(args.seed)) + serial_pipeline_offline(config, seed=args.seed) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('--seed', '-s', type=int, default=0) + parser.add_argument('--config', '-c', type=str, 
default='halfcheetah_medium_bcq_config.py') + args = parser.parse_args() + train(args) \ No newline at end of file From 985def776aece702d7f77b8a8fc026daf8e7dff1 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Mon, 10 Apr 2023 16:01:27 +0800 Subject: [PATCH 2/8] modif policy_init --- ding/policy/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ding/policy/__init__.py b/ding/policy/__init__.py index 65f3f2757e..cac683b3b2 100644 --- a/ding/policy/__init__.py +++ b/ding/policy/__init__.py @@ -18,7 +18,6 @@ from .ppo import PPOPolicy, PPOPGPolicy, PPOOffPolicy from .sac import SACPolicy, SACDiscretePolicy, SQILSACPolicy from .cql import CQLPolicy, CQLDiscretePolicy -from .edac import EDACPolicy from .impala import IMPALAPolicy from .ngu import NGUPolicy from .r2d2 import R2D2Policy From 7b7a99ea4918721c10c75076da44ea04112a43dc Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Wed, 12 Apr 2023 16:19:07 +0800 Subject: [PATCH 3/8] modify bcq --- ding/example/bcq.py | 42 +++ ding/model/template/__init__.py | 1 + ding/model/template/bcq.py | 132 ++++++++ ding/policy/__init__.py | 2 + ding/policy/bcq.py | 290 ++++++++++++++++++ ding/policy/command_mode_policy_instance.py | 8 + .../config/halfcheetah_medium_bcq_config.py | 55 ++++ .../halfcheetah_medium_expert_bcq_config.py | 55 ++++ dizoo/d4rl/config/hopper_medium_bcq_config.py | 55 ++++ .../config/hopper_medium_expert_bcq_config.py | 55 ++++ dizoo/d4rl/entry/d4rl_bcq_main.py | 21 ++ 11 files changed, 716 insertions(+) create mode 100755 ding/example/bcq.py create mode 100755 ding/model/template/bcq.py create mode 100755 ding/policy/bcq.py create mode 100755 dizoo/d4rl/config/halfcheetah_medium_bcq_config.py create mode 100755 dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py create mode 100755 dizoo/d4rl/config/hopper_medium_bcq_config.py create mode 100755 dizoo/d4rl/config/hopper_medium_expert_bcq_config.py create mode 100755 dizoo/d4rl/entry/d4rl_bcq_main.py diff --git a/ding/example/bcq.py b/ding/example/bcq.py new file mode 100755 index 0000000000..4bd1385c3f --- /dev/null +++ b/ding/example/bcq.py @@ -0,0 +1,42 @@ +import gym +from ditk import logging +from ding.model import BCQ +from ding.policy import BCQPolicy +from ding.envs import DingEnvWrapper, BaseEnvManagerV2 +from ding.data import create_dataset +from ding.config import compile_config +from ding.framework import task, ding_init +from ding.framework.context import OfflineRLContext +from ding.framework.middleware import interaction_evaluator, trainer, CkptSaver, offline_data_fetcher, offline_logger +from ding.utils import set_pkg_seed +from dizoo.d4rl.envs import D4RLEnv +from dizoo.d4rl.config.halfcheetah_medium_bcq_config import main_config, create_config + + +def main(): + # If you don't have offline data, you need to prepare if first and set the data_path in config + # For demostration, we also can train a RL policy (e.g. 
SAC) and collect some data + logging.getLogger().setLevel(logging.INFO) + cfg = compile_config(main_config, create_cfg=create_config, auto=True) + ding_init(cfg) + with task.start(async_mode=False, ctx=OfflineRLContext()): + evaluator_env = BaseEnvManagerV2( + env_fn=[lambda: D4RLEnv(cfg.env) for _ in range(cfg.env.evaluator_env_num)], cfg=cfg.env.manager + ) + + set_pkg_seed(cfg.seed, use_cuda=cfg.policy.cuda) + + dataset = create_dataset(cfg) + model = BCQ(**cfg.policy.model) + policy = BCQPolicy(cfg.policy, model=model) + + task.use(interaction_evaluator(cfg, policy.eval_mode, evaluator_env)) + task.use(offline_data_fetcher(cfg, dataset)) + task.use(trainer(cfg, policy.learn_mode)) + task.use(CkptSaver(policy, cfg.exp_name, train_freq=10000000)) + task.use(offline_logger()) + task.run() + + +if __name__ == "__main__": + main() diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py index e994286ac3..b22ea61d6e 100644 --- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -23,3 +23,4 @@ from .vae import VanillaVAE from .decision_transformer import DecisionTransformer from .procedure_cloning import ProcedureCloningMCTS, ProcedureCloningBFS +from .bcq import BCQ \ No newline at end of file diff --git a/ding/model/template/bcq.py b/ding/model/template/bcq.py new file mode 100755 index 0000000000..8f78b955e3 --- /dev/null +++ b/ding/model/template/bcq.py @@ -0,0 +1,132 @@ +from typing import Union, Dict, Optional, List +from easydict import EasyDict +import numpy as np +import torch +import torch.nn as nn + +from ding.utils import SequenceType, squeeze, MODEL_REGISTRY +from ..common import RegressionHead, ReparameterizationHead +from .vae import VanillaVAE + + +@MODEL_REGISTRY.register('bcq') +class BCQ(nn.Module): + + mode = ['compute_actor', 'compute_critic', 'compute_vae', 'compute_eval'] + + def __init__( + self, + obs_shape: Union[int, SequenceType], + action_shape: Union[int, SequenceType, EasyDict], + actor_head_hidden_size: List = [400, 300], + critic_head_hidden_size: List = [400, 300], + activation: Optional[nn.Module] = nn.ReLU(), + vae_hidden_dims: List = [750, 750], + phi: float = 0.05 + ) -> None: + """ + Overview: + Initialize QMIX neural network, i.e. agent Q network and mixer. + Arguments: + - obs_shape (:obj:`int`): the dimension of observation state + - action_shape (:obj:`int`): the dimension of action shape + - actor_hidden_size (:obj:`list`): the list of hidden size of actor + - critic_hidden_size (:obj:'list'): the list of hidden size of critic + - activation (:obj:`nn.Module`): Activation function in network, defaults to nn.ReLU(). 
+ - vae_hidden_dims (:obj:`list`): the list of hidden size of vae + """ + super(BCQ, self).__init__() + obs_shape: int = squeeze(obs_shape) + action_shape = squeeze(action_shape) + self.action_shape = action_shape + self.input_size = obs_shape + self.phi = phi + + critic_input_size = self.input_size + action_shape + self.critic = nn.ModuleList() + for _ in range(2): + net = [] + d = critic_input_size + for dim in critic_head_hidden_size: + net.append(nn.Linear(d, dim)) + net.append(activation) + d = dim + net.append(nn.Linear(d, 1)) + self.critic.append(nn.Sequential(*net)) + + net = [] + d = critic_input_size + for dim in actor_head_hidden_size: + net.append(nn.Linear(d, dim)) + net.append(activation) + d = dim + net.append(nn.Linear(d, 1)) + self.actor = nn.Sequential(*net) + + self.vae = VanillaVAE(action_shape, obs_shape, action_shape * 2, vae_hidden_dims) + + def forward(self, inputs: Dict[str, torch.Tensor], mode: str) -> Dict[str, torch.Tensor]: + """ + Overview: + The unique execution (forward) method of QAC method, and one can indicate different modes to implement \ + different computation graph, including ``compute_actor`` and ``compute_critic`` in QAC. + Mode compute_actor: + Arguments: + - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. + Returns: + - output (:obj:`Dict`): Output dict data, including action tensor. + Mode compute_critic: + Arguments: + - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. + Returns: + - output (:obj:`Dict`): Output dict data, including q_value tensor. + Mode compute_vae: + Arguments: + - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. + Returns: + - outputs (:obj:`Dict`): Dict containing keywords ``recons_action`` \ + (:obj:`torch.Tensor`), ``prediction_residual`` (:obj:`torch.Tensor`), \ + ``input`` (:obj:`torch.Tensor`), ``mu`` (:obj:`torch.Tensor`), \ + ``log_var`` (:obj:`torch.Tensor`) and ``z`` (:obj:`torch.Tensor`). + Mode compute_eval: + Arguments: + - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. + Returns: + - output (:obj:`Dict`): Output dict data, including action tensor. + + + .. note:: + For specific examples, one can refer to API doc of ``compute_actor`` and ``compute_critic`` respectively. 
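+
+        Examples (illustrative sketch; batch size 32 and the HalfCheetah obs/action dims from the configs in this patch are assumed):
+            >>> import torch
+            >>> from ding.model import BCQ
+            >>> model = BCQ(obs_shape=17, action_shape=6)
+            >>> obs, act = torch.randn(32, 17), torch.randn(32, 6)
+            >>> # twin critic values: a list of two (32,)-shaped Q tensors
+            >>> q_value = model({'obs': obs, 'action': act}, mode='compute_critic')['q_value']
+            >>> # perturbed action, clipped to [-1, 1]
+            >>> perturbed = model({'obs': obs, 'action': act}, mode='compute_actor')['action']
+            >>> # evaluation action: argmax-Q over 100 VAE-sampled candidate actions per observation
+            >>> best = model({'obs': obs}, mode='compute_eval')['action']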
+ """ + assert mode in self.mode, "not support forward mode: {}/{}".format(mode, self.mode) + return getattr(self, mode)(inputs) + + def compute_critic(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + obs, action = inputs['obs'], inputs['action'] + if len(action.shape) == 1: # (B, ) -> (B, 1) + action = action.unsqueeze(1) + x = torch.cat([obs, action], dim=-1) + x = [m(x).squeeze() for m in self.critic] + return {'q_value': x} + + def compute_actor(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, Union[torch.Tensor, Dict[str, torch.Tensor]]]: + input = torch.cat([inputs['obs'], inputs['action']], -1) + x = self.actor(input) + action = self.phi * 1 * torch.tanh(x) + action = (action + inputs['action']).clamp(-1, 1) + return {'action': action} + + def compute_vae(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + return self.vae.forward(inputs) + + def compute_eval(self, inputs: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: + obs = inputs['obs'] + obs_rep = obs.clone().unsqueeze(0).repeat_interleave(100, dim=0) + z = torch.randn((obs_rep.shape[0], obs_rep.shape[1], self.action_shape * 2)).to(obs.device).clamp(-0.5, 0.5) + sample_action = self.vae.decode_with_obs(z, obs_rep)['reconstruction_action'] + action = self.compute_actor({'obs': obs_rep, 'action': sample_action})['action'] + q = self.compute_critic({'obs': obs_rep, 'action': action})['q_value'][0] + idx = q.argmax(dim=0).unsqueeze(0).unsqueeze(-1) + idx = idx.repeat_interleave(action.shape[-1], dim=-1) + action = action.gather(0, idx).squeeze() + return {'action': action} diff --git a/ding/policy/__init__.py b/ding/policy/__init__.py index 15575c7d30..cac683b3b2 100644 --- a/ding/policy/__init__.py +++ b/ding/policy/__init__.py @@ -48,5 +48,7 @@ from .pc import ProcedureCloningBFSPolicy +from .bcq import BCQPolicy + # new-type policy from .ppof import PPOFPolicy diff --git a/ding/policy/bcq.py b/ding/policy/bcq.py new file mode 100755 index 0000000000..5dd517b6cd --- /dev/null +++ b/ding/policy/bcq.py @@ -0,0 +1,290 @@ +from typing import List, Dict, Any, Tuple, Union +from collections import namedtuple +import copy +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ding.torch_utils import Adam, to_device +from ding.rl_utils import v_1step_td_data, v_1step_td_error, get_train_sample, get_nstep_return_data +from ding.model import model_wrap +from ding.policy import Policy +from ding.utils import POLICY_REGISTRY +from ding.utils.data import default_collate, default_decollate +from .common_utils import default_preprocess_learn + + +@POLICY_REGISTRY.register('bcq') +class BCQPolicy(Policy): + config = dict( + type='bcq', + # (bool) Whether to use cuda for network. + cuda=False, + # (bool type) priority: Determine whether to use priority in buffer sample. + # Default False in SAC. + priority=False, + # (bool) Whether use Importance Sampling Weight to correct biased update. If True, priority must be True. + priority_IS_weight=False, + # (int) Number of training samples(randomly collected) in replay buffer when training starts. + # Default 10000 in SAC. + random_collect_size=10000, + nstep=1, + model=dict( + # (List) Hidden list for actor network head. + actor_head_hidden_size=[400,300], + + + # (List) Hidden list for critic network head. + critic_head_hidden_size=[400,300], + # Max perturbation hyper-parameter for BCQ + phi=0.05, + ), + learn=dict( + + # How many updates(iterations) to train after collector's one collection. 
+ # Bigger "update_per_collect" means bigger off-policy. + # collect data -> update policy-> collect data -> ... + update_per_collect=1, + # (int) Minibatch size for gradient descent. + batch_size=100, + + # (float type) learning_rate_q: Learning rate for soft q network. + # Default to 3e-4. + # Please set to 1e-3, when model.value_network is True. + learning_rate_q=3e-4, + # (float type) learning_rate_policy: Learning rate for policy network. + # Default to 3e-4. + # Please set to 1e-3, when model.value_network is True. + learning_rate_policy=3e-4, + # (float type) learning_rate_vae: Learning rate for vae network. + # `learning_rate_value` should be initialized, when model.vae_network is True. + # Please set to 3e-4, when model.vae_network is True. + learning_rate_vae=3e-4, + # (bool) Whether ignore done(usually for max step termination env. e.g. pendulum) + # Note: Gym wraps the MuJoCo envs by default with TimeLimit environment wrappers. + # These limit HalfCheetah, and several other MuJoCo envs, to max length of 1000. + # However, interaction with HalfCheetah always gets done with done is False, + # Since we inplace done==True with done==False to keep + # TD-error accurate computation(``gamma * (1 - done) * next_v + reward``), + # when the episode step is greater than max episode step. + ignore_done=False, + + # (float type) target_theta: Used for soft update of the target network, + # aka. Interpolation factor in polyak averaging for target networks. + # Default to 0.005. + target_theta=0.005, + # (float) discount factor for the discounted sum of rewards, aka. gamma. + discount_factor=0.99, + lmbda=0.75, + + # (float) Weight uniform initialization range in the last output layer + init_w=3e-3, + ), + collect=dict( + # (int) Cut trajectories into pieces with length "unroll_len". + unroll_len=1, + ), + eval=dict(), + other=dict( + replay_buffer=dict( + # (int type) replay_buffer_size: Max size of replay buffer. + replay_buffer_size=1000000, + # (int type) max_use: Max use times of one data in the buffer. + # Data will be removed once used for too many times. + # Default to infinite. + # max_use=256, + ), + ), + ) + + def default_model(self) -> Tuple[str, List[str]]: + return 'bcq', ['ding.model.template.bcq'] + + def _init_learn(self) -> None: + r""" + Overview: + Learn mode init method. Called by ``self.__init__``. + Init q, value and policy's optimizers, algorithm config, main and target models. 
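+            Three optimizers are created because BCQ trains three separate modules: the twin Q critics,
+            the perturbation actor, and the generative VAE that models the behavior policy of the
+            offline dataset.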
+ """ + # Init + self._priority = self._cfg.priority + self._priority_IS_weight = self._cfg.priority_IS_weight + self.lmbda = self._cfg.learn.lmbda + self.latent_dim = self._cfg.model.action_shape * 2 + + # Optimizers + self._optimizer_q = Adam( + self._model.critic.parameters(), + lr=self._cfg.learn.learning_rate_q, + ) + self._optimizer_policy = Adam( + self._model.actor.parameters(), + lr=self._cfg.learn.learning_rate_policy, + ) + self._optimizer_vae = Adam( + self._model.vae.parameters(), + lr=self._cfg.learn.learning_rate_vae, + ) + + # Algorithm config + self._gamma = self._cfg.learn.discount_factor + + # Main and target models + self._target_model = copy.deepcopy(self._model) + self._target_model = model_wrap( + self._target_model, + wrapper_name='target', + update_type='momentum', + update_kwargs={'theta': self._cfg.learn.target_theta} + ) + self._learn_model = model_wrap(self._model, wrapper_name='base') + self._learn_model.reset() + self._target_model.reset() + + self._forward_learn_cnt = 0 + + def _forward_learn(self, data: dict) -> Dict[str, Any]: + loss_dict = {} + + data = default_preprocess_learn( + data, + use_priority=self._priority, + use_priority_IS_weight=self._cfg.priority_IS_weight, + ignore_done=self._cfg.learn.ignore_done, + use_nstep=False + ) + if len(data.get('action').shape) == 1: + data['action'] = data['action'].reshape(-1, 1) + + if self._cuda: + data = to_device(data, self._device) + + self._learn_model.train() + self._target_model.train() + obs = data['obs'] + next_obs = data['next_obs'] + reward = data['reward'] + done = data['done'] + batch_size = obs.shape[0] + + # train_vae + vae_out = self._model.forward(data, mode='compute_vae') + recon, mean, log_std = vae_out['recons_action'], vae_out['mu'], vae_out['log_var'] + recons_loss = F.mse_loss(recon, data['action']) + kld_loss = torch.mean(-0.5 * torch.sum(1 + log_std - mean ** 2 - log_std.exp(), dim=1), dim=0) + loss_dict['recons_loss'] = recons_loss + loss_dict['kld_loss'] = kld_loss + vae_loss = recons_loss + 0.5 * kld_loss + loss_dict['vae_loss'] = vae_loss + self._optimizer_vae.zero_grad() + vae_loss.backward() + self._optimizer_vae.step() + + # train_critic + q_value = self._learn_model.forward(data, mode='compute_critic')['q_value'] + + with torch.no_grad(): + next_obs_rep = torch.repeat_interleave(next_obs, 10, 0) + z = torch.randn((next_obs_rep.shape[0], self.latent_dim)).to(self._device).clamp(-0.5, 0.5) + vae_action = self._model.vae.decode_with_obs(z, next_obs_rep)['reconstruction_action'] + next_action = self._target_model.forward({ + 'obs': next_obs_rep, + 'action': vae_action + }, mode='compute_actor')['action'] + + next_data = {'obs': next_obs_rep, 'action': next_action} + target_q_value = self._target_model.forward(next_data, mode='compute_critic')['q_value'] + # the value of a policy according to the maximum entropy objective + # find min one as target q value + target_q_value = self.lmbda * torch.min(target_q_value[0],target_q_value[1]) \ + + (1 - self.lmbda) * torch.max(target_q_value[0],target_q_value[1]) + target_q_value = target_q_value.reshape(batch_size, -1).max(1)[0].reshape(-1, 1) + + q_data0 = v_1step_td_data(q_value[0], target_q_value, reward, done, data['weight']) + loss_dict['critic_loss'], td_error_per_sample0 = v_1step_td_error(q_data0, self._gamma) + q_data1 = v_1step_td_data(q_value[1], target_q_value, reward, done, data['weight']) + loss_dict['twin_critic_loss'], td_error_per_sample1 = v_1step_td_error(q_data1, self._gamma) + td_error_per_sample = (td_error_per_sample0 
+ td_error_per_sample1) / 2 + + self._optimizer_q.zero_grad() + (loss_dict['critic_loss'] + loss_dict['twin_critic_loss']).backward() + self._optimizer_q.step() + + # train_policy + z = torch.randn((obs.shape[0], self.latent_dim)).to(self._device).clamp(-0.5, 0.5) + sample_action = self._model.vae.decode_with_obs(z, obs)['reconstruction_action'] + input = {'obs': obs, 'action': sample_action} + perturbed_action = self._model.forward(input, mode='compute_actor')['action'] + q_input = {'obs': obs, 'action': perturbed_action} + q = self._learn_model.forward(q_input, mode='compute_critic')['q_value'][0] + loss_dict['actor_loss'] = -q.mean() + self._optimizer_policy.zero_grad() + loss_dict['actor_loss'].backward() + self._optimizer_policy.step() + self._forward_learn_cnt += 1 + self._target_model.update(self._learn_model.state_dict()) + return { + 'td_error': td_error_per_sample.detach().mean().item(), + 'target_q_value': target_q_value.detach().mean().item(), + **loss_dict + } + + def _monitor_vars_learn(self) -> List[str]: + return [ + 'td_error', 'target_q_value', 'critic_loss', 'twin_critic_loss', 'actor_loss', 'recons_loss', 'kld_loss', + 'vae_loss' + ] + + def _state_dict_learn(self) -> Dict[str, Any]: + ret = { + 'model': self._learn_model.state_dict(), + 'target_model': self._target_model.state_dict(), + 'optimizer_q': self._optimizer_q.state_dict(), + 'optimizer_policy': self._optimizer_policy.state_dict(), + 'optimizer_vae': self._optimizer_vae.state_dict(), + } + return ret + + def _init_eval(self): + self._eval_model = model_wrap(self._model, wrapper_name='base') + self._eval_model.reset() + + def _forward_eval(self, data: dict) -> Dict[str, Any]: + data_id = list(data.keys()) + data = default_collate(list(data.values())) + if self._cuda: + data = to_device(data, self._device) + data = {'obs': data} + self._eval_model.eval() + with torch.no_grad(): + output = self._eval_model.forward(data, mode='compute_eval') + if self._cuda: + output = to_device(output, 'cpu') + output = default_decollate(output) + return {i: d for i, d in zip(data_id, output)} + + def _init_collect(self) -> None: + self._unroll_len = self._cfg.collect.unroll_len + self._gamma = self._cfg.discount_factor # necessary for parallel + self._nstep = self._cfg.nstep # necessary for parallel + self._collect_model = model_wrap(self._model, wrapper_name='eps_greedy_sample') + self._collect_model.reset() + + def _forward_collect(self, data: dict, **kwargs) -> dict: + pass + + def _process_transition(self, obs: Any, model_output: dict, timestep: namedtuple) -> dict: + pass + + def _get_train_sample(self, data: list) -> Union[None, List[Any]]: + r""" + Overview: + Get the trajectory and the n step return data, then sample from the n_step return data + Arguments: + - data (:obj:`list`): The trajectory's cache + Returns: + - samples (:obj:`dict`): The training samples generated + """ + data = get_nstep_return_data(data, self._nstep, gamma=self._gamma) + return get_train_sample(data, self._unroll_len) diff --git a/ding/policy/command_mode_policy_instance.py b/ding/policy/command_mode_policy_instance.py index 36e8ba7185..726536cde9 100644 --- a/ding/policy/command_mode_policy_instance.py +++ b/ding/policy/command_mode_policy_instance.py @@ -47,6 +47,7 @@ from .sac import SQILSACPolicy from .madqn import MADQNPolicy from .bdq import BDQPolicy +from .bcq import BCQPolicy class EpsCommandModePolicy(CommandModePolicy): @@ -381,6 +382,13 @@ class IBCCommandModePolicy(IBCPolicy, DummyCommandModePolicy): pass + 
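+# BCQ is an offline RL policy, so it needs no command/exploration scheduling;
+# registering it with DummyCommandModePolicy below provides a no-op command mode.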
+@POLICY_REGISTRY.register('bcq_command') +class BCQCommandModelPolicy(BCQPolicy, DummyCommandModePolicy): + pass + + + @POLICY_REGISTRY.register('bc_command') class BCCommandModePolicy(BehaviourCloningPolicy, DummyCommandModePolicy): diff --git a/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py b/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py new file mode 100755 index 0000000000..cee9fc9f5a --- /dev/null +++ b/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + +main_config = dict( + exp_name="halfcheetah_medium_bcq_seed0", + env=dict( + env_id='halfcheetah-medium-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=7000, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=17, + action_shape=6, + actor_head_hidden_size=[400,300], + critic_head_hidden_size=[400,300], + phi=0.05, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=100, + learning_rate_q=3e-3, + learning_rate_policy=3e-3, + learning_rate_alpha=3e-3, + lmbda=0.75, + learner=dict(hook=dict(save_ckpt_after_iter=1000000000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 123, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='bcq', + import_names=['ding.policy.bcq'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py b/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py new file mode 100755 index 0000000000..219810d1d0 --- /dev/null +++ b/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + +main_config = dict( + exp_name="halfcheetah_medium_expert_bcq_seed0", + env=dict( + env_id='halfcheetah-medium-expert-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=12000, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=17, + action_shape=6, + actor_head_hidden_size=[400,300], + critic_head_hidden_size=[400,300], + phi=0.05, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=100, + learning_rate_q=3e-3, + learning_rate_policy=3e-3, + learning_rate_alpha=3e-3, + lmbda=0.75, + learner=dict(hook=dict(save_ckpt_after_iter=1000000000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 123, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='bcq', + import_names=['ding.policy.bcq'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/hopper_medium_bcq_config.py b/dizoo/d4rl/config/hopper_medium_bcq_config.py new file mode 100755 index 0000000000..df13e46f3b --- /dev/null +++ b/dizoo/d4rl/config/hopper_medium_bcq_config.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + +main_config = 
dict( + exp_name="hopper_medium_bcq_seed0_43_v0", + env=dict( + env_id='hopper-medium-v0', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=3500, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=11, + action_shape=3, + actor_head_hidden_size=[400,300], + critic_head_hidden_size=[400,300], + phi=0.05, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=100, + learning_rate_q=3e-3, + learning_rate_policy=3e-3, + learning_rate_alpha=3e-3, + lmbda=0.75, + learner=dict(hook=dict(save_ckpt_after_iter=1000000000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 123, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='bcq', + import_names=['ding.policy.bcq'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py b/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py new file mode 100755 index 0000000000..0d17f6ef04 --- /dev/null +++ b/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py @@ -0,0 +1,55 @@ +from easydict import EasyDict + +main_config = dict( + exp_name="hopper_medium_expert_bcq_seed0_43", + env=dict( + env_id='hopper-medium-expert-v2', + collector_env_num=1, + evaluator_env_num=8, + use_act_scale=True, + n_evaluator_episode=8, + stop_value=3800, + ), + policy=dict( + cuda=True, + model=dict( + obs_shape=11, + action_shape=3, + actor_head_hidden_size=[400,300], + critic_head_hidden_size=[400,300], + phi=0.05, + ), + learn=dict( + data_path=None, + train_epoch=30000, + batch_size=100, + learning_rate_q=3e-3, + learning_rate_policy=3e-3, + learning_rate_alpha=3e-3, + lmbda=0.75, + learner=dict(hook=dict(save_ckpt_after_iter=1000000000, )), + ), + collect=dict(data_type='d4rl', ), + eval=dict(evaluator=dict(eval_freq=500, )), + other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), + ), + seed = 123, +) + +main_config = EasyDict(main_config) +main_config = main_config + +create_config = dict( + env=dict( + type='d4rl', + import_names=['dizoo.d4rl.envs.d4rl_env'], + ), + env_manager=dict(type='base'), + policy=dict( + type='bcq', + import_names=['ding.policy.bcq'], + ), + replay_buffer=dict(type='naive', ), +) +create_config = EasyDict(create_config) +create_config = create_config \ No newline at end of file diff --git a/dizoo/d4rl/entry/d4rl_bcq_main.py b/dizoo/d4rl/entry/d4rl_bcq_main.py new file mode 100755 index 0000000000..2e50f25100 --- /dev/null +++ b/dizoo/d4rl/entry/d4rl_bcq_main.py @@ -0,0 +1,21 @@ +from ding.entry import serial_pipeline_offline +from ding.config import read_config +from pathlib import Path + + +def train(args): + # launch from anywhere + config = Path(__file__).absolute().parent.parent / 'config' / args.config + config = read_config(str(config)) + config[0].exp_name = config[0].exp_name.replace('0', str(args.seed)) + serial_pipeline_offline(config, seed=args.seed) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument('--seed', '-s', type=int, default=0) + parser.add_argument('--config', '-c', type=str, default='halfcheetah_medium_bcq_config.py') + args = 
parser.parse_args() + train(args) \ No newline at end of file From 1b8454af3b5e9e054aa8a75d4a4cb654071b4925 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Thu, 13 Apr 2023 12:09:07 +0800 Subject: [PATCH 4/8] modify default config --- ding/policy/bcq.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/ding/policy/bcq.py b/ding/policy/bcq.py index 071970d778..5dd517b6cd 100755 --- a/ding/policy/bcq.py +++ b/ding/policy/bcq.py @@ -32,19 +32,11 @@ class BCQPolicy(Policy): nstep=1, model=dict( # (List) Hidden list for actor network head. -<<<<<<< HEAD actor_head_hidden_size=[400,300], # (List) Hidden list for critic network head. critic_head_hidden_size=[400,300], -======= - actor_head_hidden_size=[400,400,300], - - - # (List) Hidden list for critic network head. - critic_head_hidden_size=[400,400,300], ->>>>>>> 985def776aece702d7f77b8a8fc026daf8e7dff1 # Max perturbation hyper-parameter for BCQ phi=0.05, ), From d8ac3f3fea17c970b623acdfa81b2babd6d78bc0 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Sun, 23 Apr 2023 21:47:34 +0800 Subject: [PATCH 5/8] format --- ding/model/template/bcq.py | 4 ++-- ding/policy/bcq.py | 5 ++--- ding/policy/command_mode_policy_instance.py | 2 -- dizoo/d4rl/config/halfcheetah_medium_bcq_config.py | 8 ++++---- dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py | 8 ++++---- dizoo/d4rl/config/hopper_medium_bcq_config.py | 8 ++++---- dizoo/d4rl/config/hopper_medium_expert_bcq_config.py | 8 ++++---- dizoo/d4rl/entry/d4rl_bcq_main.py | 4 ++-- 8 files changed, 22 insertions(+), 25 deletions(-) diff --git a/ding/model/template/bcq.py b/ding/model/template/bcq.py index 8f78b955e3..ee76c68697 100755 --- a/ding/model/template/bcq.py +++ b/ding/model/template/bcq.py @@ -68,8 +68,8 @@ def __init__( def forward(self, inputs: Dict[str, torch.Tensor], mode: str) -> Dict[str, torch.Tensor]: """ Overview: - The unique execution (forward) method of QAC method, and one can indicate different modes to implement \ - different computation graph, including ``compute_actor`` and ``compute_critic`` in QAC. + The unique execution (forward) method of BCQ method, and one can indicate different modes to implement \ + different computation graph, including ``compute_actor`` and ``compute_critic`` in BCQ. Mode compute_actor: Arguments: - inputs (:obj:`Dict`): Input dict data, including obs and action tensor. diff --git a/ding/policy/bcq.py b/ding/policy/bcq.py index 5dd517b6cd..1cf7997ff1 100755 --- a/ding/policy/bcq.py +++ b/ding/policy/bcq.py @@ -32,11 +32,10 @@ class BCQPolicy(Policy): nstep=1, model=dict( # (List) Hidden list for actor network head. - actor_head_hidden_size=[400,300], - + actor_head_hidden_size=[400, 300], # (List) Hidden list for critic network head. 
- critic_head_hidden_size=[400,300], + critic_head_hidden_size=[400, 300], # Max perturbation hyper-parameter for BCQ phi=0.05, ), diff --git a/ding/policy/command_mode_policy_instance.py b/ding/policy/command_mode_policy_instance.py index 726536cde9..da732aa9e1 100644 --- a/ding/policy/command_mode_policy_instance.py +++ b/ding/policy/command_mode_policy_instance.py @@ -382,13 +382,11 @@ class IBCCommandModePolicy(IBCPolicy, DummyCommandModePolicy): pass - @POLICY_REGISTRY.register('bcq_command') class BCQCommandModelPolicy(BCQPolicy, DummyCommandModePolicy): pass - @POLICY_REGISTRY.register('bc_command') class BCCommandModePolicy(BehaviourCloningPolicy, DummyCommandModePolicy): diff --git a/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py b/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py index cee9fc9f5a..c0199dcb09 100755 --- a/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_bcq_config.py @@ -15,8 +15,8 @@ model=dict( obs_shape=17, action_shape=6, - actor_head_hidden_size=[400,300], - critic_head_hidden_size=[400,300], + actor_head_hidden_size=[400, 300], + critic_head_hidden_size=[400, 300], phi=0.05, ), learn=dict( @@ -33,7 +33,7 @@ eval=dict(evaluator=dict(eval_freq=500, )), other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), ), - seed = 123, + seed=123, ) main_config = EasyDict(main_config) @@ -52,4 +52,4 @@ replay_buffer=dict(type='naive', ), ) create_config = EasyDict(create_config) -create_config = create_config \ No newline at end of file +create_config = create_config diff --git a/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py b/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py index 219810d1d0..6c3ac39c18 100755 --- a/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py +++ b/dizoo/d4rl/config/halfcheetah_medium_expert_bcq_config.py @@ -15,8 +15,8 @@ model=dict( obs_shape=17, action_shape=6, - actor_head_hidden_size=[400,300], - critic_head_hidden_size=[400,300], + actor_head_hidden_size=[400, 300], + critic_head_hidden_size=[400, 300], phi=0.05, ), learn=dict( @@ -33,7 +33,7 @@ eval=dict(evaluator=dict(eval_freq=500, )), other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), ), - seed = 123, + seed=123, ) main_config = EasyDict(main_config) @@ -52,4 +52,4 @@ replay_buffer=dict(type='naive', ), ) create_config = EasyDict(create_config) -create_config = create_config \ No newline at end of file +create_config = create_config diff --git a/dizoo/d4rl/config/hopper_medium_bcq_config.py b/dizoo/d4rl/config/hopper_medium_bcq_config.py index df13e46f3b..06282d1680 100755 --- a/dizoo/d4rl/config/hopper_medium_bcq_config.py +++ b/dizoo/d4rl/config/hopper_medium_bcq_config.py @@ -15,8 +15,8 @@ model=dict( obs_shape=11, action_shape=3, - actor_head_hidden_size=[400,300], - critic_head_hidden_size=[400,300], + actor_head_hidden_size=[400, 300], + critic_head_hidden_size=[400, 300], phi=0.05, ), learn=dict( @@ -33,7 +33,7 @@ eval=dict(evaluator=dict(eval_freq=500, )), other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), ), - seed = 123, + seed=123, ) main_config = EasyDict(main_config) @@ -52,4 +52,4 @@ replay_buffer=dict(type='naive', ), ) create_config = EasyDict(create_config) -create_config = create_config \ No newline at end of file +create_config = create_config diff --git a/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py b/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py index a230a9704b..ac48ee4847 100755 --- a/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py +++ 
b/dizoo/d4rl/config/hopper_medium_expert_bcq_config.py @@ -15,8 +15,8 @@ model=dict( obs_shape=11, action_shape=3, - actor_head_hidden_size=[400,300], - critic_head_hidden_size=[400,300], + actor_head_hidden_size=[400, 300], + critic_head_hidden_size=[400, 300], phi=0.05, ), learn=dict( @@ -33,7 +33,7 @@ eval=dict(evaluator=dict(eval_freq=500, )), other=dict(replay_buffer=dict(replay_buffer_size=2000000, ), ), ), - seed = 123, + seed=123, ) main_config = EasyDict(main_config) @@ -52,4 +52,4 @@ replay_buffer=dict(type='naive', ), ) create_config = EasyDict(create_config) -create_config = create_config \ No newline at end of file +create_config = create_config diff --git a/dizoo/d4rl/entry/d4rl_bcq_main.py b/dizoo/d4rl/entry/d4rl_bcq_main.py index 2e50f25100..099f6e025b 100755 --- a/dizoo/d4rl/entry/d4rl_bcq_main.py +++ b/dizoo/d4rl/entry/d4rl_bcq_main.py @@ -5,7 +5,7 @@ def train(args): # launch from anywhere - config = Path(__file__).absolute().parent.parent / 'config' / args.config + config = Path(__file__).absolute().parent.parent / 'config' / args.config config = read_config(str(config)) config[0].exp_name = config[0].exp_name.replace('0', str(args.seed)) serial_pipeline_offline(config, seed=args.seed) @@ -18,4 +18,4 @@ def train(args): parser.add_argument('--seed', '-s', type=int, default=0) parser.add_argument('--config', '-c', type=str, default='halfcheetah_medium_bcq_config.py') args = parser.parse_args() - train(args) \ No newline at end of file + train(args) From 9c08f7cc5b729dc57ee4ad3eaffff77adbf42923 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Sun, 23 Apr 2023 22:23:58 +0800 Subject: [PATCH 6/8] format --- ding/model/template/__init__.py | 2 +- ding/model/template/bcq.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/model/template/__init__.py b/ding/model/template/__init__.py index b22ea61d6e..694dcd732c 100644 --- a/ding/model/template/__init__.py +++ b/ding/model/template/__init__.py @@ -23,4 +23,4 @@ from .vae import VanillaVAE from .decision_transformer import DecisionTransformer from .procedure_cloning import ProcedureCloningMCTS, ProcedureCloningBFS -from .bcq import BCQ \ No newline at end of file +from .bcq import BCQ diff --git a/ding/model/template/bcq.py b/ding/model/template/bcq.py index ee76c68697..7b8d013e9e 100755 --- a/ding/model/template/bcq.py +++ b/ding/model/template/bcq.py @@ -26,7 +26,7 @@ def __init__( ) -> None: """ Overview: - Initialize QMIX neural network, i.e. agent Q network and mixer. + Initialize neural network, i.e. agent Q network and actor. 
Arguments: - obs_shape (:obj:`int`): the dimension of observation state - action_shape (:obj:`int`): the dimension of action shape From d594000ea716569914162d64c40cb13f0d3b8e07 Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Mon, 24 Apr 2023 09:22:33 +0800 Subject: [PATCH 7/8] format --- ding/policy/bcq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ding/policy/bcq.py b/ding/policy/bcq.py index 1cf7997ff1..9a8388b00f 100755 --- a/ding/policy/bcq.py +++ b/ding/policy/bcq.py @@ -196,8 +196,8 @@ def _forward_learn(self, data: dict) -> Dict[str, Any]: target_q_value = self._target_model.forward(next_data, mode='compute_critic')['q_value'] # the value of a policy according to the maximum entropy objective # find min one as target q value - target_q_value = self.lmbda * torch.min(target_q_value[0],target_q_value[1]) \ - + (1 - self.lmbda) * torch.max(target_q_value[0],target_q_value[1]) + target_q_value = self.lmbda * torch.min(target_q_value[0], target_q_value[1]) \ + + (1 - self.lmbda) * torch.max(target_q_value[0], target_q_value[1]) target_q_value = target_q_value.reshape(batch_size, -1).max(1)[0].reshape(-1, 1) q_data0 = v_1step_td_data(q_value[0], target_q_value, reward, done, data['weight']) From 4d4c9977f5643181f9d89c8ca53bc746eba5b13b Mon Sep 17 00:00:00 2001 From: Super1ce <278042904@qq.com> Date: Tue, 30 May 2023 11:34:50 +0800 Subject: [PATCH 8/8] modify format --- ding/policy/command_mode_policy_instance.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ding/policy/command_mode_policy_instance.py b/ding/policy/command_mode_policy_instance.py index c131b596a6..8b6123c063 100755 --- a/ding/policy/command_mode_policy_instance.py +++ b/ding/policy/command_mode_policy_instance.py @@ -385,6 +385,9 @@ class IBCCommandModePolicy(IBCPolicy, DummyCommandModePolicy): @POLICY_REGISTRY.register('bcq_command') class BCQCommandModelPolicy(BCQPolicy, DummyCommandModePolicy): + pass + + @POLICY_REGISTRY.register('edac_command') class EDACCommandModelPolicy(EDACPolicy, DummyCommandModePolicy): pass
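
Editor's note on the PATCH 7/8 hunk above: it only reformats an existing expression, but the expression is the core of BCQ's critic target, so it is worth spelling out. BCQ mixes the two target critics with a lambda-weighted "soft" clipped double Q, then keeps the best of the candidate next actions that were sampled from the VAE and perturbed by the actor. Below is a minimal, self-contained sketch of that step; the helper name bcq_target, the candidate count, and the tensor shapes are illustrative assumptions, not the exact code in ding/policy/bcq.py.

    import torch

    def bcq_target(q1: torch.Tensor, q2: torch.Tensor, lmbda: float = 0.75) -> torch.Tensor:
        # Soft clipped double Q: weight the element-wise minimum of the two target
        # critics by lmbda and the maximum by (1 - lmbda), curbing overestimation
        # while retaining some optimism.
        return lmbda * torch.min(q1, q2) + (1 - lmbda) * torch.max(q1, q2)

    # Hypothetical shapes: each state gets n_candidates perturbed actions from the
    # VAE + actor, so each critic returns (batch_size * n_candidates, 1) values.
    batch_size, n_candidates = 4, 10
    q1 = torch.randn(batch_size * n_candidates, 1)
    q2 = torch.randn(batch_size * n_candidates, 1)

    q = bcq_target(q1, q2, lmbda=0.75)
    # Keep the best candidate action per state, mirroring
    # target_q_value.reshape(batch_size, -1).max(1)[0].reshape(-1, 1) in the hunk.
    target_q = q.reshape(batch_size, -1).max(1)[0].reshape(-1, 1)
    print(target_q.shape)  # torch.Size([4, 1])

The default lmbda=0.75 matches the lmbda entry in the d4rl configs added earlier in this series; setting lmbda=1.0 reduces the mix to the plain clipped double-Q minimum.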
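
For reference, a launch sketch built from the entry script added in this series (dizoo/d4rl/entry/d4rl_bcq_main.py). The config path and seed are illustrative, and it assumes the corresponding D4RL dataset is available locally; neither assumption is part of the patch itself.

    from ding.entry import serial_pipeline_offline
    from ding.config import read_config

    # read_config returns the (main_config, create_config) pair that
    # serial_pipeline_offline consumes, just as train() does above.
    cfg = read_config('dizoo/d4rl/config/hopper_medium_bcq_config.py')
    serial_pipeline_offline(cfg, seed=0)

One caveat when reusing train() directly: exp_name.replace('0', str(args.seed)) substitutes every '0' in names such as hopper_medium_bcq_seed0_43_v0, not only the seed digit, so a more targeted substitution (e.g. on the 'seed0' substring) may be preferable.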