DLR-RM · carlosluis · Feb 19, 2022 · Feb 19, 2022 · Feb 19, 2022 · Feb 21, 2022
diff --git a/.github/ISSUE_TEMPLATE/custom_env.md b/.github/ISSUE_TEMPLATE/custom_env.md
@@ -44,19 +44,20 @@ from stable_baselines3.common.env_checker import check_env
 class CustomEnv(gym.Env):
 
   def __init__(self):
-    super(CustomEnv, self).__init__()
+    super().__init__()
     self.observation_space = gym.spaces.Box(low=-np.inf, high=np.inf, shape=(14,))
     self.action_space = gym.spaces.Box(low=-1, high=1, shape=(6,))
 
   def reset(self):
-    return self.observation_space.sample()
+    return self.observation_space.sample(), {}
 
   def step(self, action):
     obs = self.observation_space.sample()
     reward = 1.0
     done = False
+    truncated = False
     info = {}
-    return obs, reward, done, info
+    return obs, reward, done, truncated, info
 
 env = CustomEnv()
 check_env(env)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -1,4 +1,4 @@
-image: stablebaselines/stable-baselines3-cpu:1.4.1a0
+image: stablebaselines/stable-baselines3-cpu:1.5.1a6
 
 type-check:
   script:

diff --git a/Dockerfile b/Dockerfile
@@ -3,6 +3,9 @@ FROM $PARENT_IMAGE
 ARG PYTORCH_DEPS=cpuonly
 ARG PYTHON_VERSION=3.7
 
+# for tzdata
+ENV DEBIAN_FRONTEND="noninteractive" TZ="Europe/Paris"
+
 RUN apt-get update && apt-get install -y --no-install-recommends \
          build-essential \
          cmake \
@@ -20,7 +23,7 @@ RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest
      ~/miniconda.sh -b -p /opt/conda && \
      rm ~/miniconda.sh && \
      /opt/conda/bin/conda install -y python=$PYTHON_VERSION numpy pyyaml scipy ipython mkl mkl-include && \
-     /opt/conda/bin/conda install -y pytorch $PYTORCH_DEPS -c pytorch && \
+     /opt/conda/bin/conda install -y pytorch=1.11 $PYTORCH_DEPS -c pytorch && \
      /opt/conda/bin/conda clean -ya
 ENV PATH /opt/conda/bin:$PATH
 

diff --git a/Makefile b/Makefile
@@ -29,7 +29,8 @@ check-codestyle:
 commit-checks: format type lint
 
 doc:
-	cd docs && make html
+	# Force the C++ protobuf implementation to prevent a protobuf-related error when building the docs
+	cd docs && PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=cpp make html
 
 spelling:
 	cd docs && make spelling

diff --git a/README.md b/README.md
@@ -124,12 +124,12 @@ env = gym.make("CartPole-v1")
 model = PPO("MlpPolicy", env, verbose=1)
 model.learn(total_timesteps=10_000)
 
-obs = env.reset()
+obs, info = env.reset()
 for i in range(1000):
     action, _states = model.predict(obs, deterministic=True)
-    obs, reward, done, info = env.step(action)
+    obs, reward, done, truncated, info = env.step(action)
     env.render()
-    if done:
+    if done or truncated:
-      obs = env.reset()
+      obs, info = env.reset()
 
 env.close()

diff --git a/docs/conda_env.yml b/docs/conda_env.yml
@@ -4,11 +4,11 @@ channels:
   - defaults
 dependencies:
   - cpuonly=1.0=0
-  - pip=21.1
+  - pip=22.1.1
   - python=3.7
-  - pytorch=1.11=py3.7_cpu_0
+  - pytorch=1.11.0=py3.7_cpu_0
   - pip:
-    - gym==0.21
+    - gym==0.26
     - cloudpickle
     - opencv-python-headless
     - pandas

diff --git a/docs/guide/examples.rst b/docs/guide/examples.rst
@@ -94,11 +94,12 @@ In the following example, we will train, save and load a DQN model on the Lunar
   mean_reward, std_reward = evaluate_policy(model, model.get_env(), n_eval_episodes=10)
 
   # Enjoy trained agent
-  obs = env.reset()
+  vec_env = model.get_env()
+  obs = vec_env.reset()
   for i in range(1000):
       action, _states = model.predict(obs, deterministic=True)
-      obs, rewards, dones, info = env.step(action)
-      env.render()
+      obs, rewards, dones, info = vec_env.step(action)
+      vec_env.render()
 
 
 Multiprocessing: Unleashing the Power of Vectorized Environments
@@ -470,19 +471,19 @@ The parking env is a goal-conditioned continuous control task, in which the vehi
   # HER must be loaded with the env
   model = SAC.load("her_sac_highway", env=env)
 
-  obs = env.reset()
+  obs, info = env.reset()
 
   # Evaluate the agent
   episode_reward = 0
   for _ in range(100):
       action, _ = model.predict(obs, deterministic=True)
-      obs, reward, done, info = env.step(action)
+      obs, reward, done, truncated, info = env.step(action)
       env.render()
       episode_reward += reward
-      if done or info.get("is_success", False):
+      if done or truncated or info.get("is_success", False):
           print("Reward:", episode_reward, "Success?", info.get("is_success", False))
           episode_reward = 0.0
-          obs = env.reset()
+          obs, info = env.reset()
 
 
 Learning Rate Schedule

diff --git a/docs/guide/quickstart.rst b/docs/guide/quickstart.rst
@@ -14,18 +14,24 @@ Here is a quick example of how to train and run A2C on a CartPole environment:
 
   from stable_baselines3 import A2C
 
-  env = gym.make('CartPole-v1')
+  env = gym.make("CartPole-v1")
 
-  model = A2C('MlpPolicy', env, verbose=1)
+  model = A2C("MlpPolicy", env, verbose=1)
   model.learn(total_timesteps=10000)
 
-  obs = env.reset()
+  # Note: Gym 0.26+ reset() returns a tuple (obs, info),
+  # whereas SB3 VecEnv reset() only returns an observation
+  obs, info = env.reset()
   for i in range(1000):
       action, _state = model.predict(obs, deterministic=True)
-      obs, reward, done, info = env.step(action)
+      # Note: Gym 0.26+ step() returns an additional boolean
+      # "truncated", whereas SB3 stores truncation information
+      # in info["TimeLimit.truncated"]
+      obs, reward, done, truncated, info = env.step(action)
       env.render()
-      if done:
-        obs = env.reset()
+      # Note: reset is automated in SB3 VecEnv
+      if done or truncated:
+        obs, info = env.reset()
 
 .. note::
 
@@ -40,4 +46,4 @@ the policy is registered:
 
     from stable_baselines3 import A2C
 
-    model = A2C('MlpPolicy', 'CartPole-v1').learn(10000)
+    model = A2C("MlpPolicy", "CartPole-v1").learn(10000)
diff --git a/docs/misc/changelog.rst b/docs/misc/changelog.rst
@@ -65,6 +65,7 @@ Release 1.6.0 (2022-07-11)
 
 Breaking Changes:
 ^^^^^^^^^^^^^^^^^
+- Switched minimum Gym version to 0.26 (@carlosluis)
 - Changed the way policy "aliases" are handled ("MlpPolicy", "CnnPolicy", ...), removing the former
   ``register_policy`` helper, ``policy_base`` parameter and using ``policy_aliases`` static attributes instead (@Gregwar)
 - SB3 now requires PyTorch >= 1.11
@@ -73,6 +74,7 @@ Breaking Changes:
 
 New Features:
 ^^^^^^^^^^^^^
+- ``noop_max`` and ``frame_skip`` are now allowed to be equal to zero when using ``AtariWrapper``
 
 SB3-Contrib
 ^^^^^^^^^^^
@@ -98,6 +100,7 @@ Deprecations:
 Others:
 ^^^^^^^
 - Upgraded to Python 3.7+ syntax using ``pyupgrade``
+- Updated docker base image to Ubuntu 20.04 and cuda 11.3
 - Removed redundant double-check for nested observations from ``BaseAlgorithm._wrap_env`` (@TibiGG)
 
 Documentation:
@@ -107,6 +110,7 @@ Documentation:
 - Added link to PPO ICLR blog post
 - Added remark about breaking Markov assumption and timeout handling
 - Added doc about MLFlow integration via custom logger (@git-thor)
+- Updated tutorials to work with Gym 0.23 (@arjun-kg)
 - Updated Huggingface integration doc
 - Added copy button for code snippets
 - Added doc about EnvPool and Isaac Gym support
@@ -119,7 +123,7 @@ Release 1.5.0 (2022-03-25)
 
 Breaking Changes:
 ^^^^^^^^^^^^^^^^^
-- Switched minimum Gym version to 0.21.0.
+- Switched minimum Gym version to 0.21.0
 
 New Features:
 ^^^^^^^^^^^^^
@@ -1043,5 +1047,6 @@ And all the contributors:
 @eleurent @ac-93 @cove9988 @theDebugger811 @hsuehch @Demetrio92 @thomasgubler @IperGiove @ScheiklP
 @simoninithomas @armandpl @manuel-delverme @Gautam-J @gianlucadecola @buoyancy99 @caburu @xy9485
 @Gregwar @ycheng517 @quantitative-technologies @bcollazo @git-thor @TibiGG @cool-RR @MWeltevrede
+@carlosluis @arjun-kg @tlpss
 @Melanol @qgallouedec @francescoluciano @jlp-ue @burakdmb @timothe-chaumont @honglu2875
 @anand-bala @hughperkins @sidney-tio @AlexPasqua @dominicgkerr @Akhilez @Rocamonde
diff --git a/docs/modules/her.rst b/docs/modules/her.rst
@@ -22,7 +22,10 @@ It creates "virtual" transitions by relabeling transitions (changing the desired
 
 .. warning::
 
-    HER requires the environment to inherits from `gym.GoalEnv <https://github.com/openai/gym/blob/3394e245727c1ae6851b504a50ba77c73cd4c65b/gym/core.py#L160>`_
+    HER requires the environment to follow the legacy `gym.GoalEnv interface <https://github.com/openai/gym/blob/3394e245727c1ae6851b504a50ba77c73cd4c65b/gym/core.py#L160>`_.
+    In short, the ``gym.Env`` must have
+    a vectorized implementation of ``compute_reward()`` and
+    a dictionary observation space with three keys: ``observation``, ``achieved_goal`` and ``desired_goal``.
 
 
 .. warning::

diff --git a/scripts/build_docker.sh b/scripts/build_docker.sh
@@ -1,14 +1,14 @@
 #!/bin/bash
 
-CPU_PARENT=ubuntu:18.04
-GPU_PARENT=nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
+CPU_PARENT=ubuntu:20.04
+GPU_PARENT=nvidia/cuda:11.3.1-base-ubuntu20.04
 
 TAG=stablebaselines/stable-baselines3
 VERSION=$(cat ./stable_baselines3/version.txt)
 
 if [[ ${USE_GPU} == "True" ]]; then
   PARENT=${GPU_PARENT}
-  PYTORCH_DEPS="cudatoolkit=10.1"
+  PYTORCH_DEPS="cudatoolkit=11.3"
 else
   PARENT=${CPU_PARENT}
   PYTORCH_DEPS="cpuonly"

diff --git a/setup.cfg b/setup.cfg
@@ -10,11 +10,12 @@ filterwarnings =
 		# Tensorboard warnings
 		ignore::DeprecationWarning:tensorboard
 		# Gym warnings
-		ignore:Parameters to load are deprecated.:DeprecationWarning
-		ignore:the imp module is deprecated in favour of importlib:PendingDeprecationWarning
+		; ignore:Parameters to load are deprecated.:DeprecationWarning
+		; ignore:the imp module is deprecated in favour of importlib:PendingDeprecationWarning
 		ignore::UserWarning:gym
-		ignore:SelectableGroups dict interface is deprecated.:DeprecationWarning
-		ignore:`np.bool` is a deprecated alias for the builtin `bool`:DeprecationWarning
+		; ignore:SelectableGroups dict interface is deprecated.:DeprecationWarning
+		; ignore:`np.bool` is a deprecated alias for the builtin `bool`:DeprecationWarning
+		ignore:.*step API:DeprecationWarning:gym
 markers =
     expensive: marks tests as expensive (deselect with '-m "not expensive"')
 

diff --git a/setup.py b/setup.py
@@ -48,13 +48,13 @@
 model = PPO("MlpPolicy", env, verbose=1)
 model.learn(total_timesteps=10_000)
 
-obs = env.reset()
+obs, info = env.reset()
 for i in range(1000):
     action, _states = model.predict(obs, deterministic=True)
-    obs, reward, done, info = env.step(action)
+    obs, reward, done, truncated, info = env.step(action)
     env.render()
-    if done:
-        obs = env.reset()
+    if done or truncated:
+        obs, info = env.reset()
 ```
 
 Or just train a model with a one liner if [the environment is registered in Gym](https://www.gymlibrary.ml/content/environment_creation/) and if [the policy is registered](https://stable-baselines3.readthedocs.io/en/master/guide/custom_policy.html):
@@ -73,7 +73,7 @@
     packages=[package for package in find_packages() if package.startswith("stable_baselines3")],
     package_data={"stable_baselines3": ["py.typed", "version.txt"]},
     install_requires=[
-        "gym==0.21",  # Fixed version due to breaking changes in 0.22
+        "gym==0.26",
         "numpy",
         "torch>=1.11",
         # For saving models
@@ -100,11 +100,9 @@
             "isort>=5.0",
             # Reformat
             "black",
-            # For toy text Gym envs
-            "scipy>=1.4.1",
         ],
         "docs": [
-            "sphinx",
+            "sphinx~=4.5.0",
             "sphinx-autobuild",
             "sphinx-rtd-theme",
             # For spelling
@@ -117,8 +115,9 @@
         "extra": [
             # For render
             "opencv-python",
+            "pygame",
             # For atari games,
-            "ale-py==0.7.4",
+            "ale-py~=0.8.0",
             "autorom[accept-rom-license]~=0.4.2",
             "pillow",
             # Tensorboard support

diff --git a/stable_baselines3/__init__.py b/stable_baselines3/__init__.py
@@ -1,4 +1,5 @@
 import os
+import warnings
 
 from stable_baselines3.a2c import A2C
 from stable_baselines3.common.utils import get_system_info
@@ -14,6 +15,9 @@
 with open(version_file) as file_handler:
     __version__ = file_handler.read().strip()
 
+# Silence Gym warnings due to new API
+warnings.filterwarnings("ignore", message=r".*step API", module="gym")
+
 
 def HER(*args, **kwargs):
     raise ImportError(