From 48a529826bcaedfbdfcd15b20d3238c54a6234a0 Mon Sep 17 00:00:00 2001
From: zjowowen <zjowowen@outlook.com>
Date: Mon, 18 Sep 2023 14:34:56 +0000
Subject: [PATCH 1/2] polish ppof code

---
 ding/bonus/config.py                          | 136 +++++++++---------
 ding/bonus/ppof.py                            |  76 ++++++----
 .../framework/middleware/functional/logger.py |  50 +++++--
 3 files changed, 153 insertions(+), 109 deletions(-)

diff --git a/ding/bonus/config.py b/ding/bonus/config.py
index 676474c28a..c449c3fdc5 100644
--- a/ding/bonus/config.py
+++ b/ding/bonus/config.py
@@ -7,24 +7,24 @@
 from ding.policy import PPOFPolicy
 
 
-def get_instance_config(env: str, algorithm: str) -> EasyDict:
+def get_instance_config(env_id: str, algorithm: str) -> EasyDict:
     if algorithm == 'PPOF':
         cfg = PPOFPolicy.default_config()
-        if env == 'lunarlander_discrete':
+        if env_id == 'LunarLander-v2':
             cfg.n_sample = 512
             cfg.value_norm = 'popart'
             cfg.entropy_weight = 1e-3
-        elif env == 'lunarlander_continuous':
+        elif env_id == 'LunarLanderContinuous-v2':
             cfg.action_space = 'continuous'
             cfg.n_sample = 400
-        elif env == 'bipedalwalker':
+        elif env_id == 'BipedalWalker-v3':
             cfg.learning_rate = 1e-3
             cfg.action_space = 'continuous'
             cfg.n_sample = 1024
-        elif env == 'acrobot':
+        elif env_id == 'acrobot':
             cfg.learning_rate = 1e-4
             cfg.n_sample = 400
-        elif env == 'rocket_landing':
+        elif env_id == 'rocket_landing':
             cfg.n_sample = 2048
             cfg.adv_norm = False
             cfg.model = dict(
@@ -32,13 +32,13 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 actor_head_hidden_size=128,
                 critic_head_hidden_size=128,
             )
-        elif env == 'drone_fly':
+        elif env_id == 'drone_fly':
             cfg.action_space = 'continuous'
             cfg.adv_norm = False
             cfg.epoch_per_collect = 5
             cfg.learning_rate = 5e-5
             cfg.n_sample = 640
-        elif env == 'hybrid_moving':
+        elif env_id == 'hybrid_moving':
             cfg.action_space = 'hybrid'
             cfg.n_sample = 3200
             cfg.entropy_weight = 0.03
@@ -50,13 +50,13 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 fixed_sigma_value=0.3,
                 bound_type='tanh',
             )
-        elif env == 'evogym_carrier':
+        elif env_id == 'evogym_carrier':
             cfg.action_space = 'continuous'
             cfg.n_sample = 2048
             cfg.batch_size = 256
             cfg.epoch_per_collect = 10
             cfg.learning_rate = 3e-3
-        elif env == 'mario':
+        elif env_id == 'mario':
             cfg.n_sample = 256
             cfg.batch_size = 64
             cfg.epoch_per_collect = 2
@@ -66,14 +66,14 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 critic_head_hidden_size=128,
                 actor_head_hidden_size=128,
             )
-        elif env == 'di_sheep':
+        elif env_id == 'di_sheep':
             cfg.n_sample = 3200
             cfg.batch_size = 320
             cfg.epoch_per_collect = 10
             cfg.learning_rate = 3e-4
             cfg.adv_norm = False
             cfg.entropy_weight = 0.001
-        elif env == 'procgen_bigfish':
+        elif env_id == 'procgen_bigfish':
             cfg.n_sample = 16384
             cfg.batch_size = 16384
             cfg.epoch_per_collect = 10
@@ -83,7 +83,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 critic_head_hidden_size=256,
                 actor_head_hidden_size=256,
             )
-        elif env in ['atari_qbert', 'atari_kangaroo', 'atari_bowling']:
+        elif env_id in ['KangarooNoFrameskip-v4', 'BowlingNoFrameskip-v4']:
             cfg.n_sample = 1024
             cfg.batch_size = 128
             cfg.epoch_per_collect = 10
@@ -94,7 +94,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 critic_head_hidden_size=128,
                 critic_head_layer_num=2,
             )
-        elif env == 'PongNoFrameskip':
+        elif env_id == 'PongNoFrameskip-v4':
             cfg.n_sample = 3200
             cfg.batch_size = 320
             cfg.epoch_per_collect = 10
@@ -104,7 +104,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 actor_head_hidden_size=128,
                 critic_head_hidden_size=128,
             )
-        elif env == 'SpaceInvadersNoFrameskip':
+        elif env_id == 'SpaceInvadersNoFrameskip-v4':
             cfg.n_sample = 320
             cfg.batch_size = 320
             cfg.epoch_per_collect = 1
@@ -116,7 +116,7 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 actor_head_hidden_size=128,
                 critic_head_hidden_size=128,
             )
-        elif env == 'QbertNoFrameskip':
+        elif env_id == 'QbertNoFrameskip-v4':
             cfg.n_sample = 3200
             cfg.batch_size = 320
             cfg.epoch_per_collect = 10
@@ -127,13 +127,13 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 actor_head_hidden_size=128,
                 critic_head_hidden_size=128,
             )
-        elif env == 'minigrid_fourroom':
+        elif env_id == 'minigrid_fourroom':
             cfg.n_sample = 3200
             cfg.batch_size = 320
             cfg.learning_rate = 3e-4
             cfg.epoch_per_collect = 10
             cfg.entropy_weight = 0.001
-        elif env == 'metadrive':
+        elif env_id == 'metadrive':
             cfg.learning_rate = 3e-4
             cfg.action_space = 'continuous'
             cfg.entropy_weight = 0.001
@@ -146,49 +146,61 @@ def get_instance_config(env: str, algorithm: str) -> EasyDict:
                 critic_head_hidden_size=128,
                 critic_head_layer_num=2,
             )
-        elif env in ['hopper']:
+        elif env_id == 'Hopper-v3':
+            cfg.action_space = "continuous"
+            cfg.n_sample = 3200
+            cfg.batch_size = 320
+            cfg.epoch_per_collect = 10
+            cfg.learning_rate = 3e-4
+        elif env_id == 'HalfCheetah-v3':
+            cfg.action_space = "continuous"
+            cfg.n_sample = 3200
+            cfg.batch_size = 320
+            cfg.epoch_per_collect = 10
+            cfg.learning_rate = 3e-4
+        elif env_id == 'Walker2d-v3':
             cfg.action_space = "continuous"
             cfg.n_sample = 3200
             cfg.batch_size = 320
             cfg.epoch_per_collect = 10
             cfg.learning_rate = 3e-4
         else:
-            raise KeyError("not supported env type: {}".format(env))
+            raise KeyError("not supported env type: {}".format(env_id))
     else:
         raise KeyError("not supported algorithm type: {}".format(algorithm))
 
     return cfg
 
 
-def get_instance_env(env: str) -> BaseEnv:
-    if env == 'lunarlander_discrete':
+def get_instance_env(env_id: str) -> BaseEnv:
+    if env_id == 'LunarLander-v2':
         return DingEnvWrapper(gym.make('LunarLander-v2'))
-    elif env == 'lunarlander_continuous':
-        return DingEnvWrapper(gym.make('LunarLander-v2', continuous=True))
-    elif env == 'bipedalwalker':
+    elif env_id == 'LunarLanderContinuous-v2':
+        return DingEnvWrapper(gym.make('LunarLanderContinuous-v2', continuous=True))
+    elif env_id == 'BipedalWalker-v3':
         return DingEnvWrapper(gym.make('BipedalWalker-v3'), cfg={'act_scale': True, 'rew_clip': True})
-    elif env == 'pendulum':
+    elif env_id == 'Pendulum-v1':
         return DingEnvWrapper(gym.make('Pendulum-v1'), cfg={'act_scale': True})
-    elif env == 'acrobot':
+    elif env_id == 'acrobot':
         return DingEnvWrapper(gym.make('Acrobot-v1'))
-    elif env == 'rocket_landing':
+    elif env_id == 'rocket_landing':
         from dizoo.rocket.envs import RocketEnv
         cfg = EasyDict({
             'task': 'landing',
             'max_steps': 800,
         })
         return RocketEnv(cfg)
-    elif env == 'drone_fly':
+    elif env_id == 'drone_fly':
         from dizoo.gym_pybullet_drones.envs import GymPybulletDronesEnv
         cfg = EasyDict({
             'env_id': 'flythrugate-aviary-v0',
             'action_type': 'VEL',
         })
         return GymPybulletDronesEnv(cfg)
-    elif env == 'hybrid_moving':
+    elif env_id == 'hybrid_moving':
         import gym_hybrid
         return DingEnvWrapper(gym.make('Moving-v0'))
-    elif env == 'evogym_carrier':
+    elif env_id == 'evogym_carrier':
         import evogym.envs
         from evogym import sample_robot, WorldObject
         path = os.path.join(os.path.dirname(__file__), '../../dizoo/evogym/envs/world_data/carry_bot.json')
@@ -203,7 +215,7 @@ def get_instance_env(env: str) -> BaseEnv:
                 ]
             }
         )
-    elif env == 'mario':
+    elif env_id == 'mario':
         import gym_super_mario_bros
         from nes_py.wrappers import JoypadSpace
         return DingEnvWrapper(
@@ -219,10 +231,10 @@ def get_instance_env(env: str) -> BaseEnv:
                 ]
             }
         )
-    elif env == 'di_sheep':
+    elif env_id == 'di_sheep':
         from sheep_env import SheepEnv
         return DingEnvWrapper(SheepEnv(level=9))
-    elif env == 'procgen_bigfish':
+    elif env_id == 'procgen_bigfish':
         return DingEnvWrapper(
             gym.make('procgen:procgen-bigfish-v0', start_level=0, num_levels=1),
             cfg={
@@ -234,7 +246,7 @@ def get_instance_env(env: str) -> BaseEnv:
             },
             seed_api=False,
         )
-    elif env == 'hopper':
+    elif env_id == 'Hopper-v3':
         cfg = EasyDict(
             env_id='Hopper-v3',
             env_wrapper='mujoco_default',
@@ -242,7 +254,7 @@ def get_instance_env(env: str) -> BaseEnv:
             rew_clip=True,
         )
         return DingEnvWrapper(gym.make('Hopper-v3'), cfg=cfg)
-    elif env == 'HalfCheetah':
+    elif env_id == 'HalfCheetah-v3':
         cfg = EasyDict(
             env_id='HalfCheetah-v3',
             env_wrapper='mujoco_default',
@@ -250,7 +262,7 @@ def get_instance_env(env: str) -> BaseEnv:
             rew_clip=True,
         )
         return DingEnvWrapper(gym.make('HalfCheetah-v3'), cfg=cfg)
-    elif env == 'Walker2d':
+    elif env_id == 'Walker2d-v3':
         cfg = EasyDict(
             env_id='Walker2d-v3',
             env_wrapper='mujoco_default',
@@ -258,42 +270,24 @@ def get_instance_env(env: str) -> BaseEnv:
             rew_clip=True,
         )
         return DingEnvWrapper(gym.make('Walker2d-v3'), cfg=cfg)
-    elif env == "SpaceInvadersNoFrameskip":
-        cfg = EasyDict({
-            'env_id': "SpaceInvadersNoFrameskip-v4",
-            'env_wrapper': 'atari_default',
-        })
-        return DingEnvWrapper(gym.make("SpaceInvadersNoFrameskip-v4"), cfg=cfg)
-    elif env == "PongNoFrameskip":
-        cfg = EasyDict({
-            'env_id': "PongNoFrameskip-v4",
-            'env_wrapper': 'atari_default',
-        })
-        return DingEnvWrapper(gym.make("PongNoFrameskip-v4"), cfg=cfg)
-    elif env == "QbertNoFrameskip":
-        cfg = EasyDict({
-            'env_id': "QbertNoFrameskip-v4",
-            'env_wrapper': 'atari_default',
-        })
-        return DingEnvWrapper(gym.make("QbertNoFrameskip-v4"), cfg=cfg)
-    elif env in ['atari_qbert', 'atari_kangaroo', 'atari_bowling', 'atari_breakout', 'atari_spaceinvader',
-                 'atari_gopher']:
-        from dizoo.atari.envs.atari_env import AtariEnv
-        atari_env_list = {
-            'atari_qbert': 'QbertNoFrameskip-v4',
-            'atari_kangaroo': 'KangarooNoFrameskip-v4',
-            'atari_bowling': 'BowlingNoFrameskip-v4',
-            'atari_breakout': 'BreakoutNoFrameskip-v4',
-            'atari_spaceinvader': 'SpaceInvadersNoFrameskip-v4',
-            'atari_gopher': 'GopherNoFrameskip-v4'
-        }
+
+    elif env_id in [
+            'BowlingNoFrameskip-v4',
+            'BreakoutNoFrameskip-v4',
+            'GopherNoFrameskip-v4',
+            'KangarooNoFrameskip-v4',
+            'PongNoFrameskip-v4',
+            'QbertNoFrameskip-v4',
+            'SpaceInvadersNoFrameskip-v4',
+    ]:
+
         cfg = EasyDict({
-            'env_id': atari_env_list[env],
+            'env_id': env_id,
             'env_wrapper': 'atari_default',
         })
-        ding_env_atari = DingEnvWrapper(gym.make(atari_env_list[env]), cfg=cfg)
+        ding_env_atari = DingEnvWrapper(gym.make(env_id), cfg=cfg)
         return ding_env_atari
-    elif env == 'minigrid_fourroom':
+    elif env_id == 'minigrid_fourroom':
         import gymnasium
         return DingEnvWrapper(
             gymnasium.make('MiniGrid-FourRooms-v0'),
@@ -306,7 +300,7 @@ def get_instance_env(env: str) -> BaseEnv:
                 ]
             }
         )
-    elif env == 'metadrive':
+    elif env_id == 'metadrive':
         from dizoo.metadrive.env.drive_env import MetaDrivePPOOriginEnv
         from dizoo.metadrive.env.drive_wrapper import DriveEnvWrapper
         cfg = dict(
@@ -319,7 +313,7 @@ def get_instance_env(env: str) -> BaseEnv:
         cfg = EasyDict(cfg)
         return DriveEnvWrapper(MetaDrivePPOOriginEnv(cfg))
     else:
-        raise KeyError("not supported env type: {}".format(env))
+        raise KeyError("not supported env type: {}".format(env_id))
 
 
 def get_hybrid_shape(action_space) -> EasyDict:
diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py
index 3948f726ce..4109f62590 100644
--- a/ding/bonus/ppof.py
+++ b/ding/bonus/ppof.py
@@ -21,9 +21,9 @@
 class PPOF:
     supported_env_list = [
         # common
-        'lunarlander_discrete',
-        'lunarlander_continuous',
-        'bipedalwalker',
+        'LunarLander-v2',
+        'LunarLanderContinuous-v2',
+        'BipedalWalker-v3',
         'acrobot',
         # ch2: action
         'rocket_landing',
@@ -38,42 +38,64 @@ class PPOF:
         'minigrid_fourroom',
         'metadrive',
         # atari
-        'atari_qbert',
-        'atari_kangaroo',
-        'atari_bowling',
-        'PongNoFrameskip',
-        'SpaceInvadersNoFrameskip',
-        'QbertNoFrameskip',
+        'BowlingNoFrameskip-v4',
+        'BreakoutNoFrameskip-v4',
+        'GopherNoFrameskip-v4',
+        'KangarooNoFrameskip-v4',
+        'PongNoFrameskip-v4',
+        'QbertNoFrameskip-v4',
+        'SpaceInvadersNoFrameskip-v4',
         # mujoco
-        'hopper',
+        'Hopper-v3',
+        'HalfCheetah-v3',
+        'Walker2d-v3',
     ]
 
     def __init__(
             self,
-            env: Union[str, BaseEnv],
+            env_id: str = None,
+            env: BaseEnv = None,
             seed: int = 0,
-            exp_name: str = 'default_experiment',
+            exp_name: str = None,
             model: Optional[torch.nn.Module] = None,
-            cfg: Optional[EasyDict] = None,
-            policy_state_dict: str = None,
+            cfg: Optional[Union[EasyDict, dict]] = None,
+            policy_state_dict: str = None
     ) -> None:
-        if isinstance(env, str):
-            assert env in PPOF.supported_env_list, "Please use supported envs: {}".format(PPOF.supported_env_list)
-            self.env = get_instance_env(env)
+        assert env_id is not None or cfg is not None, "Please specify env_id or cfg."
+
+        if cfg is not None and not isinstance(cfg, EasyDict):
+            cfg = EasyDict(cfg)
+
+        if env_id is not None:
+            assert env_id in PPOF.supported_env_list, "Please use supported envs: {}".format(PPOF.supported_env_list)
             if cfg is None:
-                # 'It should be default env tuned config'
-                self.cfg = get_instance_config(env, algorithm="PPO")
-            else:
-                self.cfg = cfg
-        elif isinstance(env, BaseEnv):
-            self.cfg = cfg
-            raise NotImplementedError
+                cfg = get_instance_config(env_id, algorithm="PPOF")
+
+            if not hasattr(cfg, "env_id"):
+                cfg.env_id = env_id
+            assert cfg.env_id == env_id, "env_id in cfg should be the same as env_id in args."
         else:
-            raise TypeError("not support env type: {}, only strings and instances of `BaseEnv` now".format(type(env)))
+            assert hasattr(cfg, "env_id"), "Please specify env_id in cfg."
+            assert cfg.env_id in PPOF.supported_env_list, "Please use supported envs: {}".format(
+                PPOF.supported_env_list
+            )
+
+        if exp_name is not None:
+            cfg.exp_name = exp_name
+        elif not hasattr(cfg, "exp_name"):
+            cfg.exp_name = "{}-{}".format(cfg.env_id, "PPO")
+        self.cfg = cfg
+        self.exp_name = self.cfg.exp_name
+
+        if env is None:
+            self.env = get_instance_env(self.cfg.env_id)
+        else:
+            self.env = env
+
         logging.getLogger().setLevel(logging.INFO)
         self.seed = seed
-        set_pkg_seed(self.seed)
-        self.exp_name = exp_name
+        set_pkg_seed(self.seed, use_cuda=self.cfg.cuda)
+
         if not os.path.exists(self.exp_name):
             os.makedirs(self.exp_name)
         save_config_py(self.cfg, os.path.join(self.exp_name, 'policy_config.py'))
diff --git a/ding/framework/middleware/functional/logger.py b/ding/framework/middleware/functional/logger.py
index eba569cfbc..9f62e2f429 100644
--- a/ding/framework/middleware/functional/logger.py
+++ b/ding/framework/middleware/functional/logger.py
@@ -303,16 +303,30 @@ def _plot(ctx: "OnlineRLContext"):
             )
 
         if ctx.eval_value != -np.inf:
-            info_for_logging.update(
-                {
+            if hasattr(ctx, "eval_value_min"):
+                info_for_logging.update({
                     "episode return min": ctx.eval_value_min,
+                })
+            if hasattr(ctx, "eval_value_max"):
+                info_for_logging.update({
                     "episode return max": ctx.eval_value_max,
-                    "episode return mean": ctx.eval_value,
+                })
+            if hasattr(ctx, "eval_value_std"):
+                info_for_logging.update({
                     "episode return std": ctx.eval_value_std,
+                })
+            if hasattr(ctx, "eval_value"):
+                info_for_logging.update({
+                    "episode return mean": ctx.eval_value,
+                })
+            if hasattr(ctx, "train_iter"):
+                info_for_logging.update({
                     "train iter": ctx.train_iter,
-                    "env step": ctx.env_step
-                }
-            )
+                })
+            if hasattr(ctx, "env_step"):
+                info_for_logging.update({
+                    "env step": ctx.env_step,
+                })
 
             eval_output = ctx.eval_output['output']
             episode_return = ctx.eval_output['episode_return']
@@ -597,16 +611,30 @@ def _plot(ctx: "OfflineRLContext"):
             )
 
         if ctx.eval_value != -np.inf:
-            info_for_logging.update(
-                {
+            if hasattr(ctx, "eval_value_min"):
+                info_for_logging.update({
                     "episode return min": ctx.eval_value_min,
+                })
+            if hasattr(ctx, "eval_value_max"):
+                info_for_logging.update({
                     "episode return max": ctx.eval_value_max,
-                    "episode return mean": ctx.eval_value,
+                })
+            if hasattr(ctx, "eval_value_std"):
+                info_for_logging.update({
                     "episode return std": ctx.eval_value_std,
+                })
+            if hasattr(ctx, "eval_value"):
+                info_for_logging.update({
+                    "episode return mean": ctx.eval_value,
+                })
+            if hasattr(ctx, "train_iter"):
+                info_for_logging.update({
                     "train iter": ctx.train_iter,
+                })
+            if hasattr(ctx, "train_epoch"):
+                info_for_logging.update({
                     "train_epoch": ctx.train_epoch,
-                }
-            )
+                })
 
             eval_output = ctx.eval_output['output']
             episode_return = ctx.eval_output['episode_return']

From 16d1e2844c6bc6e9f7e19b64a29789967e40f397 Mon Sep 17 00:00:00 2001
From: zjowowen <zjowowen@outlook.com>
Date: Wed, 20 Sep 2023 04:38:09 +0000
Subject: [PATCH 2/2] polish ppof code

---
 ding/bonus/ppof.py | 64 +++++++++++++++++++++++++++++++++-------------
 1 file changed, 46 insertions(+), 18 deletions(-)

diff --git a/ding/bonus/ppof.py b/ding/bonus/ppof.py
index 4109f62590..149b42c3c1 100644
--- a/ding/bonus/ppof.py
+++ b/ding/bonus/ppof.py
@@ -5,6 +5,7 @@
 import os
 import gym
 import gymnasium
+import numpy as np
 import torch
 from ding.framework import task, OnlineRLContext
 from ding.framework.middleware import interaction_evaluator_ttorch, PPOFStepCollector, multistep_trainer, CkptSaver, \
@@ -12,6 +13,7 @@
 from ding.envs import BaseEnv, BaseEnvManagerV2, SubprocessEnvManagerV2
 from ding.policy import PPOFPolicy, single_env_forward_wrapper_ttorch
 from ding.utils import set_pkg_seed
+from ding.utils import get_env_fps, render
 from ding.config import save_config_py
 from .model import PPOFModel
 from .config import get_instance_config, get_instance_env, get_hybrid_shape
@@ -174,19 +176,36 @@ def train(
 
         return TrainingReturn(wandb_url=task.ctx.wandb_url)
 
-    def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None, debug: bool = False) -> float:
+    def deploy(
+            self,
+            enable_save_replay: bool = False,
+            concatenate_all_replay: bool = False,
+            replay_save_path: str = None,
+            seed: Optional[Union[int, List]] = None,
+            debug: bool = False
+    ) -> EvalReturn:
         if debug:
             logging.getLogger().setLevel(logging.DEBUG)
         # define env and policy
         env = self.env.clone(caller='evaluator')
-        env.seed(self.seed, dynamic_seed=False)
 
-        if enable_save_replay and replay_save_path:
+        if seed is not None and isinstance(seed, int):
+            seeds = [seed]
+        elif seed is not None and isinstance(seed, list):
+            seeds = seed
+        else:
+            seeds = [self.seed]
+
+        returns = []
+        images = []
+        if enable_save_replay:
+            replay_save_path = os.path.join(self.exp_name, 'videos') if replay_save_path is None else replay_save_path
             env.enable_save_replay(replay_path=replay_save_path)
-        elif enable_save_replay:
-            env.enable_save_replay(replay_path=os.path.join(self.exp_name, 'videos'))
         else:
             logging.warning('No video would be generated during the deploy.')
+            if concatenate_all_replay:
+                logging.warning('concatenate_all_replay is set to False because enable_save_replay is False.')
+                concatenate_all_replay = False
 
         forward_fn = single_env_forward_wrapper_ttorch(self.policy.eval, self.cfg.cuda)
 
@@ -194,22 +213,31 @@ def deploy(self, enable_save_replay: bool = False, replay_save_path: str = None,
         # env will be reset again in the main loop
         env.reset()
 
-        # main loop
-        return_ = 0.
-        step = 0
-        obs = env.reset()
-        while True:
-            action = forward_fn(obs)
-            obs, rew, done, info = env.step(action)
-            return_ += rew
-            step += 1
-            if done:
-                break
-        logging.info(f'PPOF deploy is finished, final episode return with {step} steps is: {return_}')
+        for seed in seeds:
+            env.seed(seed, dynamic_seed=False)
+            return_ = 0.
+            step = 0
+            obs = env.reset()
+            images.append(render(env)[None]) if concatenate_all_replay else None
+            while True:
+                action = forward_fn(obs)
+                obs, rew, done, info = env.step(action)
+                images.append(render(env)[None]) if concatenate_all_replay else None
+                return_ += rew
+                step += 1
+                if done:
+                    break
+            logging.info(f'PPOF deploy is finished, final episode return with {step} steps is: {return_}')
+            returns.append(return_)
 
         env.close()
 
-        return return_
+        if concatenate_all_replay:
+            images = np.concatenate(images, axis=0)
+            import imageio
+            imageio.mimwrite(os.path.join(replay_save_path, 'deploy.mp4'), images, fps=get_env_fps(env))
+
+        return EvalReturn(eval_value=np.mean(returns), eval_value_std=np.std(returns))
 
     def collect_data(
             self,