Mujoco Env #1511

Merged
merged 40 commits into from
Dec 3, 2024
Changes from all commits

Commits (40)
c74ef36
add first iteration nao model
schmidma Oct 16, 2024
9abac0c
add uv venv
schluis Oct 20, 2024
f639bc0
record videos
schluis Oct 20, 2024
2493343
fix dependencies
schluis Oct 21, 2024
8e571b9
change solver, add more options to naoxml
schluis Oct 23, 2024
6d1a76c
fix dependencies (really)
schluis Oct 25, 2024
8631789
add nao_standup env
schluis Oct 25, 2024
a75a86c
add test script
schluis Oct 25, 2024
aa0b0fe
fix dependencies (really2)
schluis Oct 25, 2024
394cd83
improve nao env
schluis Oct 25, 2024
7cccd5e
use nao in scripts
schluis Oct 25, 2024
869662a
use face-down keyframe for standup
schluis Oct 27, 2024
6ada11c
improve naming
schluis Oct 27, 2024
f065f23
tune motors
schluis Oct 27, 2024
65cbff3
add interactive script, fix joint ranges
schluis Oct 27, 2024
5b9cad4
limit arms
schluis Oct 27, 2024
6b6d192
add tensorboard, better video naming
schluis Oct 27, 2024
a40e704
improve feet, improve body
schluis Oct 31, 2024
22f0831
add wandb
schluis Oct 31, 2024
4cc4348
add site head_center and use in reward
schluis Oct 31, 2024
53f08d5
add power-unlimited nao
schluis Oct 31, 2024
7889612
use bhuman model, update env
schluis Nov 11, 2024
c86be99
add face-down keyframe
schluis Nov 11, 2024
37a8d2f
add site head_center
schluis Nov 11, 2024
bfa0883
fix reward xpos
schluis Nov 13, 2024
1b89d46
fix reward xpos
schluis Nov 17, 2024
06f781c
use named access for site head_center
schluis Nov 17, 2024
4818404
rename env
schluis Nov 22, 2024
9bd497b
remove wrong comments
schluis Nov 22, 2024
b364143
update readme
schluis Nov 22, 2024
f18d733
remove redundant model
schluis Nov 27, 2024
ee00602
Add interactive_viewer
schluis Nov 27, 2024
c4e050a
Use external model
schluis Nov 27, 2024
566fb0e
update readme, add gitignore for video and run files
schluis Nov 27, 2024
a3d5e67
remove jupyter notebook
schluis Nov 27, 2024
c4eb1b5
change neovim to dev-dependency
schluis Nov 27, 2024
1591f65
add entry_point of nao_env in test_script
schluis Nov 27, 2024
449042c
Add glfw dependency to readme
schluis Dec 2, 2024
9021b19
remove test_script
schluis Dec 2, 2024
58a73a7
remove ruff stuff and sort imports
schluis Dec 2, 2024
604 changes: 182 additions & 422 deletions Cargo.lock

Large diffs are not rendered by default.

2 changes: 2 additions & 0 deletions tools/machine-learning/mujoco/.gitignore
@@ -0,0 +1,2 @@
videos/
runs/
20 changes: 20 additions & 0 deletions tools/machine-learning/mujoco/README.md
@@ -0,0 +1,20 @@
# Setup

Make sure `glfw` is installed on your machine.

For Python, use [uv](https://docs.astral.sh/uv/).
After installing uv, run `uv sync` to install all Python dependencies, or directly execute one of the examples below.

## Example usage

To view the model:

- `uv run interactive_viewer.py`

To train the standup task:

- `uv run standup.py`

## To build a custom NAO environment

Add a new `MujocoEnv` subclass in the `nao_env` folder and export it from the `__init__.py` file, following the pattern of `NaoStandup`. A sketch of such an environment is shown below.
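The following sketch is illustrative only: the `NaoWalk` class, its forward-velocity reward, and the file name `nao_env/nao_walk.py` are hypothetical stand-ins, not part of this PR. It mirrors the structure of `NaoStandup` from this changeset:

```python
# nao_env/nao_walk.py: hypothetical example environment (not part of this PR)
from pathlib import Path

import numpy as np
from gymnasium import utils
from gymnasium.envs.mujoco.mujoco_env import MujocoEnv
from gymnasium.spaces import Box


class NaoWalk(MujocoEnv, utils.EzPickle):
    metadata = {"render_modes": ["human", "rgb_array", "depth_array"]}

    def __init__(self, **kwargs) -> None:
        # Same model and observation layout as NaoStandup.
        observation_space = Box(
            low=-np.inf, high=np.inf, shape=(661,), dtype=np.float64
        )
        MujocoEnv.__init__(
            self,
            str(Path.cwd().joinpath("model", "scene.xml")),
            5,  # frame_skip
            observation_space=observation_space,
            **kwargs,
        )
        utils.EzPickle.__init__(self, **kwargs)

    def _get_obs(self) -> np.ndarray:
        data = self.data
        return np.concatenate(
            [
                data.qpos.flat[2:],
                data.qvel.flat,
                data.cinert.flat,
                data.cvel.flat,
                data.qfrc_actuator.flat,
                data.cfrc_ext.flat,
            ],
        )

    def step(self, action):
        self.do_simulation(action, self.frame_skip)
        # Illustrative reward: root forward velocity minus a control penalty.
        forward_reward = self.data.qvel.flat[0]
        ctrl_cost = 0.1 * np.square(self.data.ctrl).sum()
        reward = forward_reward - ctrl_cost
        return self._get_obs(), reward, False, False, {}

    def reset_model(self):
        self.set_state(self.init_qpos, self.init_qvel)
        return self._get_obs()
```

Then export it from `nao_env/__init__.py` with `from nao_env.nao_walk import NaoWalk`, so that an entry point such as `nao_env:NaoWalk` resolves when registering the environment.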
13 changes: 13 additions & 0 deletions tools/machine-learning/mujoco/interactive_viewer.py
@@ -0,0 +1,13 @@
import os

import mujoco
from mujoco import viewer

# Select the EGL rendering backend before MuJoCo creates a rendering context.
os.environ["MUJOCO_GL"] = "egl"

model = mujoco.MjModel.from_xml_path("model/scene.xml")
data = mujoco.MjData(model)

# Optionally reset to one of the model's keyframes before launching:
# mujoco.mj_resetDataKeyframe(model, data, 2)

viewer.launch(model, data)
6 changes: 6 additions & 0 deletions tools/machine-learning/mujoco/nao_env/__init__.py
@@ -0,0 +1,6 @@
from gymnasium.envs.mujoco.mujoco_env import MujocoEnv, MuJocoPyEnv  # noqa: F401

# ^^^^^ imported first so that the user gets the correct error
# message if mujoco is not installed correctly

from nao_env.nao_standup import NaoStandup
132 changes: 132 additions & 0 deletions tools/machine-learning/mujoco/nao_env/nao_standup.py
@@ -0,0 +1,132 @@
from pathlib import Path

import numpy as np
from gymnasium import utils
from gymnasium.envs.mujoco.mujoco_env import MujocoEnv
from gymnasium.spaces import Box

DEFAULT_CAMERA_CONFIG = {
    "trackbodyid": 1,
    "distance": 4.0,
    "lookat": np.array((0.0, 0.0, 0.8925)),
    "elevation": -20.0,
}


class NaoStandup(MujocoEnv, utils.EzPickle):
    metadata = {
        "render_modes": [
            "human",
            "rgb_array",
            "depth_array",
        ],
    }

    def __init__(self, **kwargs) -> None:
        observation_space = Box(
            low=-np.inf,
            high=np.inf,
            shape=(661,),
            dtype=np.float64,
        )

        MujocoEnv.__init__(
            self,
            str(Path.cwd().joinpath("model", "scene.xml")),
            5,
            observation_space=observation_space,
            default_camera_config=DEFAULT_CAMERA_CONFIG,
            **kwargs,
        )
        utils.EzPickle.__init__(self, **kwargs)

    def _get_obs(self) -> np.ndarray:
        data = self.data
        return np.concatenate(
            [
                data.qpos.flat[2:],
                data.qvel.flat,
                data.cinert.flat,
                data.cvel.flat,
                data.qfrc_actuator.flat,
                data.cfrc_ext.flat,
            ],
        )

    def step(self, a):
        self.do_simulation(a, self.frame_skip)
        data = self.data

        # Reward the height of the head_center site, scaled by the timestep.
        head_center_id = self.model.site("head_center").id
        head_center_z = data.site_xpos[head_center_id][2]
        uph_cost = (head_center_z - 0) / self.model.opt.timestep

        # Penalize large control signals and (capped) external contact forces.
        quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum()
        quad_impact_cost = 0.5e-6 * np.square(data.cfrc_ext).sum()
        quad_impact_cost = min(quad_impact_cost, 10)
        reward = uph_cost - quad_ctrl_cost - quad_impact_cost + 1

        if self.render_mode == "human":
            self.render()
        return (
            self._get_obs(),
            reward,
            False,
            False,
            {
                "reward_linup": uph_cost,
                "reward_quadctrl": -quad_ctrl_cost,
                "reward_impact": -quad_impact_cost,
            },
        )

    def reset_model(self):
        # Start from the face-down keyframe, with small uniform noise on
        # positions and velocities.
        half_random_offset = 0.03
        face_down_keyframe_qpos = [
            0.452845,
            0.219837,
            0.0556939,
            0.710551,
            -0.0810676,
            0.693965,
            0.0834173,
            -0.000571484,
            0.0239414,
            0.000401842,
            -3.89047e-05,
            -0.00175077,
            0.357233,
            0.0114063,
            0.000212495,
            0.000422366,
            3.92127e-05,
            -0.00133669,
            0.356939,
            0.0112884,
            -0.000206283,
            1.46985,
            0.110264,
            0.000766453,
            -0.034298,
            3.65047e-05,
            1.47067,
            -0.110094,
            -0.00201064,
            0.0342998,
            -0.00126886,
        ]
        self.set_state(
            face_down_keyframe_qpos
            + self.np_random.uniform(
                low=-half_random_offset,
                high=half_random_offset,
                size=self.model.nq,
            ),
            self.init_qvel
            + self.np_random.uniform(
                low=-half_random_offset,
                high=half_random_offset,
                size=self.model.nv,
            ),
        )
        return self._get_obs()
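For orientation, here is a minimal smoke test of this environment. It is a sketch, not part of the PR: it assumes it runs from `tools/machine-learning/mujoco` so the relative `model/scene.xml` path resolves, and it reuses the registration parameters from `standup.py` below.

```python
import gymnasium as gym

gym.register(
    id="NaoStandup-v1",
    entry_point="nao_env:NaoStandup",
    max_episode_steps=2500,
)

env = gym.make("NaoStandup-v1")
obs, info = env.reset(seed=0)
for _ in range(100):
    action = env.action_space.sample()  # random actions, just to exercise the env
    obs, reward, terminated, truncated, info = env.step(action)
    if terminated or truncated:
        obs, info = env.reset()
env.close()
```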
16 changes: 10 additions & 6 deletions tools/machine-learning/mujoco/pyproject.toml
@@ -5,14 +5,16 @@ description = "Add your description here"
 readme = "README.md"
 requires-python = ">=3.10"
 dependencies = [
-    "gymnasium[classic-control,mujoco]==0.28.1",
-    "mujoco>=3.2.4",
+    "gymnasium[classic-control]",
     "ipykernel>=6.29.5",
     "mediapy>=1.2.2",
+    "mujoco>=3.2.4",
     "numpy>=2.1.2",
     "scipy>=1.14.1",
     "moviepy>=1.0.3",
     "stable-baselines3>=2.3.2",
+    "wandb>=0.18.5",
+    "tensorboard>=2.18.0",
 ]

 [tool.ruff]
@@ -64,8 +66,10 @@ ignore = [
 [tool.ruff.lint.per-file-ignores]
 "tests/*" = ["S101", "S603"]

+[tool.ruff.lint.isort]
+required-imports = ["from __future__ import annotations"]

 [tool.uv]
-dev-dependencies = ["pytest>=8.3.3", "ruff>=0.7.3"]
+dev-dependencies = [
+    "neovim>=0.3.1",
+    "pytest>=8.3.3",
+    "ruff>=0.7.3",
+]
102 changes: 86 additions & 16 deletions tools/machine-learning/mujoco/standup.py
@@ -1,23 +1,93 @@
-import gymnasium as gym
+import os

+import gymnasium as gym
+import torch
+import wandb
 from stable_baselines3 import PPO
+from stable_baselines3.common.monitor import Monitor
+from stable_baselines3.common.utils import get_device
+from stable_baselines3.common.vec_env import DummyVecEnv, VecVideoRecorder
+from wandb.integration.sb3 import WandbCallback

+if get_device() != torch.device("cpu"):
+    NVIDIA_ICD_CONFIG_PATH = "/usr/share/glvnd/egl_vendor.d/10_nvidia.json"
+    if not os.path.exists(NVIDIA_ICD_CONFIG_PATH):
+        with open(NVIDIA_ICD_CONFIG_PATH, "w") as f:
+            _ = f.write("""{
+    "file_format_version" : "1.0.0",
+    "ICD" : {
+        "library_path" : "libEGL_nvidia.so.0"
+    }
+}""")
+
+    # Configure MuJoCo to use the EGL rendering backend (requires GPU)
+    os.environ["MUJOCO_GL"] = "egl"
+
+
+# taken from https://gymnasium.farama.org/main/_modules/gymnasium/wrappers/record_video/
+def capped_cubic_video_schedule(episode_id: int) -> bool:
+    """The default episode trigger.
+
+    This function triggers recordings at the episode indices 0, 1, 8, 27, ..., :math:`k^3`, ..., 9261, and then every 10000 episodes (10000, 20000, 30000, ...).
+
+    Args:
+        episode_id: The episode number
+
+    Returns:
+        Whether to record a video of this episode
+    """
+    if episode_id < 10000:
+        return int(round(episode_id ** (1.0 / 3))) ** 3 == episode_id
+    else:
+        return episode_id % 10000 == 0
+
+
+gym.register(
+    id="NaoStandup-v1",
+    entry_point="nao_env:NaoStandup",
+    max_episode_steps=2500,
+)
+
+config = {
+    "policy_type": "MlpPolicy",
+    "total_timesteps": 1000000,
+    "env_name": "NaoStandup-v1",
+    "render_mode": "rgb_array",
+}

-env = gym.make("CartPole-v1", render_mode="human")
+run = wandb.init(
+    project="nao_standup",
+    config=config,
+    sync_tensorboard=True,
+    monitor_gym=True,
+    save_code=False,
+    mode="disabled",
+)

-model = PPO("MlpPolicy", env, verbose=1)
-model.learn(total_timesteps=10_000)

-vec_env = model.get_env()
-if vec_env is None:
-    raise ValueError("Model does not have a VecEnv")
+def make_env():
+    env = gym.make(config["env_name"], render_mode=config["render_mode"])
+    env = Monitor(env)  # record stats such as returns
+    return env

-obs = vec_env.reset()
-for i in range(1000):
-    action, _states = model.predict(obs, deterministic=True)
-    obs, reward, done, info = vec_env.step(action)
-    vec_env.render()
-    # VecEnv resets automatically
-    # if done:
-    #     obs = env.reset()

-env.close()
+env = DummyVecEnv([make_env])
+env = VecVideoRecorder(
+    env,
+    f"videos/{run.id}",
+    record_video_trigger=capped_cubic_video_schedule,
+    video_length=200,
+)
+model = PPO(
+    config["policy_type"], env, verbose=1, tensorboard_log=f"runs/{run.id}"
+)
+model.learn(
+    total_timesteps=config["total_timesteps"],
+    callback=WandbCallback(
+        gradient_save_freq=100,
+        model_save_path=f"models/{run.id}",
+        verbose=2,
+    ),
+)
+run.finish()
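Once training finishes, the policy saved by `WandbCallback` can be reloaded for evaluation. The following is a sketch, not part of the PR: the path `models/<run-id>/model.zip` is a placeholder for whatever `model_save_path` produced, and `evaluate_policy` is the standard stable-baselines3 helper.

```python
import gymnasium as gym
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy

gym.register(
    id="NaoStandup-v1",
    entry_point="nao_env:NaoStandup",
    max_episode_steps=2500,
)

env = gym.make("NaoStandup-v1")
model = PPO.load("models/<run-id>/model.zip", env=env)  # placeholder path

mean_reward, std_reward = evaluate_policy(model, env, n_eval_episodes=5)
print(f"mean reward: {mean_reward:.1f} +/- {std_reward:.1f}")
```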