From 0b94a69bd38ce9e6b5cc35fa124351224bcbecf5 Mon Sep 17 00:00:00 2001
From: Kevin Lin <kevin.lin20000@gmail.com>
Date: Mon, 7 Oct 2024 01:24:47 -0500
Subject: [PATCH 01/13] Unify mink action dict keys

---
 .../third_party_controller/mink_controller.py |  14 +-
 .../third_party_controller/teleop_mink.py     | 191 ++++++++++++++++++
 2 files changed, 202 insertions(+), 3 deletions(-)
 create mode 100644 robosuite/examples/third_party_controller/teleop_mink.py

diff --git a/robosuite/examples/third_party_controller/mink_controller.py b/robosuite/examples/third_party_controller/mink_controller.py
index c0bf299331..a9cde3c922 100644
--- a/robosuite/examples/third_party_controller/mink_controller.py
+++ b/robosuite/examples/third_party_controller/mink_controller.py
@@ -248,12 +248,14 @@ def set_posture_target(self, posture_target: np.ndarray):
     def action_split_indexes(self) -> Dict[str, Tuple[int, int]]:
         action_split_indexes: Dict[str, Tuple[int, int]] = {}
         previous_idx = 0
+
         for site_name in self.site_names:
             total_dim = self.pos_dim + self.rot_dim
             last_idx = previous_idx + total_dim
-            action_split_indexes[site_name + "_pos"] = (previous_idx, previous_idx + self.pos_dim)
-            action_split_indexes[site_name + f"_{self.input_rotation_repr}"] = (previous_idx + self.pos_dim, last_idx)
+            simplified_site_name = "left" if "left" in site_name else "right"  # hack to simplify site names
+            action_split_indexes[simplified_site_name] = (previous_idx, last_idx)
             previous_idx = last_idx
+
         return action_split_indexes
 
     def solve(self, target_action: Optional[np.ndarray] = None) -> np.ndarray:
@@ -381,10 +383,16 @@ def _init_joint_action_policy(self):
         for part_name in self.composite_controller_specific_config["ik_controlled_part_names"]:
             joint_names += self.part_controllers[part_name].joint_names
 
+        default_site_names: List[str] = []
+        for arm in ['right', 'left']:
+            if arm  in self.part_controller_config:
+                default_site_names.append(self.part_controller_config[arm]["ref_name"])
+
         self.joint_action_policy = IKSolverMink(
             model=self.sim.model._model,
             data=self.sim.data._data,
-            site_names=self.composite_controller_specific_config["ref_name"],
+            site_names=self.composite_controller_specific_config["ref_name"] \
+                if "ref_name" in self.composite_controller_specific_config else default_site_names,
             robot_model=self.robot_model.mujoco_model,
             robot_joint_names=joint_names,
             input_action_repr=self.composite_controller_specific_config.get("ik_input_action_repr", "absolute"),
diff --git a/robosuite/examples/third_party_controller/teleop_mink.py b/robosuite/examples/third_party_controller/teleop_mink.py
new file mode 100644
index 0000000000..00e865826f
--- /dev/null
+++ b/robosuite/examples/third_party_controller/teleop_mink.py
@@ -0,0 +1,191 @@
+"""
+A script to teleop a robot using mink:
+
+python robosuite/examples/third_party_controller/teleop_mink.py --controller robosuite/examples/third_party_controller/default_mink_ik_gr1.json --robots GR1FixedLowerBody --device mocap
+"""
+
+import argparse
+import os
+import time
+from typing import List, Tuple
+
+import mujoco
+import numpy as np
+
+import robosuite as suite
+from robosuite.controllers import load_composite_controller_config
+
+# mink-related imports
+from robosuite.examples.third_party_controller.mink_controller import WholeBodyMinkIK
+from robosuite.utils import transform_utils
+from robosuite.utils.input_utils import input2action
+from robosuite.wrappers import DataCollectionWrapper
+from robosuite.devices.keyboard import Keyboard
+
+
+def set_target_pose(sim, target_pos=None, target_mat=None, mocap_name: str = "target"):
+    mocap_id = sim.model.body(mocap_name).mocapid[0]
+    if target_pos is not None:
+        sim.data.mocap_pos[mocap_id] = target_pos
+    if target_mat is not None:
+        # convert mat to quat
+        target_quat = np.empty(4)
+        mujoco.mju_mat2Quat(target_quat, target_mat.reshape(9, 1))
+        sim.data.mocap_quat[mocap_id] = target_quat
+
+
+def get_target_pose(sim, mocap_name: str = "target") -> Tuple[np.ndarray]:
+    mocap_id = sim.model.body(mocap_name).mocapid[0]
+    target_pos = np.copy(sim.data.mocap_pos[mocap_id])
+    target_quat = np.copy(sim.data.mocap_quat[mocap_id])
+    target_mat = np.empty(9)
+    mujoco.mju_quat2Mat(target_mat, target_quat)
+    target_mat = target_mat.reshape(3, 3)
+    return target_pos, target_mat
+
+
+def collect_human_trajectory(env, device, arm, env_configuration, end_effector: str = "right"):
+    """
+    Use the device (keyboard or SpaceNav 3D mouse) to collect a demonstration.
+    The rollout trajectory is saved to files in npz format.
+    Modify the DataCollectionWrapper wrapper to add new fields or change data formats.
+
+    Args:
+        env (MujocoEnv): environment to control
+        device (Device): to receive controls from the device
+        arms (str): which arm to control (eg bimanual) 'right' or 'left'
+        env_configuration (str): specified environment configuration
+    """
+
+    env.reset()
+    env.render()
+
+    site_names: List[str] = env.robots[0].composite_controller.joint_action_policy.site_names
+    right_pos = env.sim.data.site_xpos[env.sim.model.site_name2id(site_names[0])]
+    right_mat = env.sim.data.site_xmat[env.sim.model.site_name2id(site_names[0])]
+    left_pos = env.sim.data.site_xpos[env.sim.model.site_name2id(site_names[1])]
+    left_mat = env.sim.data.site_xmat[env.sim.model.site_name2id(site_names[1])]
+
+    set_target_pose(env.sim, right_pos, right_mat, "right_eef_target")
+    set_target_pose(env.sim, left_pos, left_mat, "left_eef_target")
+
+    device.start_control()
+
+    for robot in env.robots:
+        robot.print_action_info_dict()
+
+    # Keep track of prev gripper actions when using since they are position-based and must be maintained when arms switched
+    all_prev_gripper_actions = [
+        {
+            f"{robot_arm}_gripper": np.repeat([0], robot.gripper[robot_arm].dof)
+            for robot_arm in robot.arms
+            if robot.gripper[robot_arm].dof > 0
+        }
+        for robot in env.robots
+    ]
+
+    # Loop until we get a reset from the input or the task completes
+    while True:
+        # Set active robot
+        active_robot = env.robots[device.active_robot]
+        prev_gripper_actions = all_prev_gripper_actions[device.active_robot]
+
+        arm = device.active_arm
+        # Check if we have gripper actions for the active arm
+        arm_using_gripper = f"{arm}_gripper" in all_prev_gripper_actions[device.active_robot]
+        # Get the newest action
+        input_action, grasp = input2action(
+            device=device,
+            robot=active_robot,
+            active_arm=arm,
+            active_end_effector=end_effector,
+            env_configuration=env_configuration,
+        )
+
+        # If action is none, then this a reset so we should break
+        if input_action is None:
+            break
+
+        action_dict = {}
+        for arm in active_robot.arms:
+            pos_target, mat_target = get_target_pose(env.sim, f"{arm}_eef_target")
+            axis_angle_target = transform_utils.quat2axisangle(transform_utils.mat2quat(mat_target))
+            action_dict[arm] = np.concatenate([pos_target, axis_angle_target])
+
+        for gripper_name, gripper in active_robot.gripper.items():
+            action_dict[f"{gripper_name}_gripper"] = np.zeros(gripper.dof)  # what's the 'do nothing' action for all grippers?
+
+        if arm_using_gripper:
+            action_dict[f"{arm}_gripper"] = np.repeat(input_action[6:7], active_robot.gripper[arm].dof)
+            prev_gripper_actions[f"{arm}_gripper"] = np.repeat(input_action[6:7], active_robot.gripper[arm].dof)
+
+        # Maintain gripper state for each robot but only update the active robot with action
+        env_action = [robot.create_action_vector(all_prev_gripper_actions[i]) for i, robot in enumerate(env.robots)]
+        env_action[device.active_robot] = active_robot.create_action_vector(action_dict)
+        env_action = np.concatenate(env_action)
+
+        env.step(env_action)
+        env.render()
+
+    # cleanup for end of data collection episodes
+    env.close()
+
+
+if __name__ == "__main__":
+    # Arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--environment", type=str, default="Lift")
+    parser.add_argument("--robots", nargs="+", type=str, default="Panda", help="Which robot(s) to use in the env")
+    parser.add_argument(
+        "--config", type=str, default="default", help="Specified environment configuration if necessary"
+    )
+    parser.add_argument("--arm", type=str, default="right", help="Which arm to control (eg bimanual) 'right' or 'left'")
+    parser.add_argument("--camera", type=str, default="agentview", help="Which camera to use for collecting demos")
+    parser.add_argument(
+        "--controller",
+        type=str,
+        default=None,
+        help="Choice of controller json file (see robosuite/controllers/config for examples)",
+    )
+    parser.add_argument("--device", type=str, default="keyboard")
+    parser.add_argument("--pos-sensitivity", type=float, default=1.0, help="How much to scale position user inputs")
+    parser.add_argument("--rot-sensitivity", type=float, default=1.0, help="How much to scale rotation user inputs")
+    args = parser.parse_args()
+
+    # Get controller config
+    controller_config = load_composite_controller_config(
+        controller=args.controller,
+        robot=args.robots[0],
+    )
+
+    # Create argument configuration
+    config = {
+        "env_name": args.environment,
+        "robots": args.robots,
+        "controller_configs": controller_config,
+    }
+
+    # Check if we're using a multi-armed environment and use env_configuration argument if so
+    if "TwoArm" in args.environment:
+        config["env_configuration"] = args.config
+
+    # Create environment
+    env = suite.make(
+        **config,
+        has_renderer=True,
+        renderer="mjviewer",
+        has_offscreen_renderer=False,
+        render_camera=args.camera,
+        ignore_done=True,
+        use_camera_obs=False,
+        reward_shaping=True,
+        control_freq=20,
+    )
+
+    # wrap the environment with data collection wrapper
+    tmp_directory = "teleop-mink-data/{}".format(str(time.time()).replace(".", "_"))
+    env = DataCollectionWrapper(env, tmp_directory)  # need this wrapper's reset for UI mocap dragging to work
+    device = Keyboard(env=env, pos_sensitivity=args.pos_sensitivity, rot_sensitivity=args.rot_sensitivity)
+
+    while True:
+        collect_human_trajectory(env, device, args.arm, args.config)
\ No newline at end of file

From fbc4fc51345f3defffc9cae341f6bcbb899dc5b0 Mon Sep 17 00:00:00 2001
From: Kevin Lin <kevin.lin20000@gmail.com>
Date: Mon, 7 Oct 2024 01:30:52 -0500
Subject: [PATCH 02/13] Remove redundant script

---
 .../collect_human_demonstrations_mocap.py     | 391 ------------------
 1 file changed, 391 deletions(-)
 delete mode 100644 robosuite/examples/third_party_controller/collect_human_demonstrations_mocap.py

diff --git a/robosuite/examples/third_party_controller/collect_human_demonstrations_mocap.py b/robosuite/examples/third_party_controller/collect_human_demonstrations_mocap.py
deleted file mode 100644
index 1282266eb0..0000000000
--- a/robosuite/examples/third_party_controller/collect_human_demonstrations_mocap.py
+++ /dev/null
@@ -1,391 +0,0 @@
-"""
-A script to collect a batch of human demonstrations.
-
-The demonstrations can be played back using the `playback_demonstrations_from_hdf5.py` script.
-"""
-
-import argparse
-import datetime
-import json
-import os
-import time
-from glob import glob
-from typing import List
-
-import h5py
-import mujoco
-import numpy as np
-
-import robosuite as suite
-import robosuite.examples.third_party_controller.mink_controller
-import robosuite.macros as macros
-from robosuite.controllers import load_composite_controller_config
-from robosuite.examples.third_party_controller.mink_controller import WholeBodyMinkIK
-from robosuite.utils import transform_utils
-from robosuite.utils.input_utils import input2action
-from robosuite.wrappers import DataCollectionWrapper, VisualizationWrapper
-
-"""
-Command for running GR1 + mocap (only works for GR1 b/c of site_names assumption)
-
-python robosuite/examples/third_party_controller/collect_human_demonstrations_mocap.py --environment Lift --robots GR1FixedLowerBody --device keyboard --camera frontview --custom-controller-config robosuite/examples/third_party_controller/default_mink_ik_gr1.json
-
-Need to Esc; Tab; Scroll to Group Enable, then press Group 2. Then, double click on the mocap cube and Ctrl+left or right click.
-
-"""
-
-
-def set_target(sim, target_pos=None, target_mat=None, mocap_name: str = "target"):
-    mocap_id = sim.model.body(mocap_name).mocapid[0]
-    if target_pos is not None:
-        sim.data.mocap_pos[mocap_id] = target_pos
-    if target_mat is not None:
-        # convert mat to quat
-        target_quat = np.empty(4)
-        mujoco.mju_mat2Quat(target_quat, target_mat.reshape(9, 1))
-        sim.data.mocap_quat[mocap_id] = target_quat
-
-
-def get_target(sim, mocap_name: str = "target"):
-    mocap_id = sim.model.body(mocap_name).mocapid[0]
-    target_pos = np.copy(sim.data.mocap_pos[mocap_id])
-    target_quat = np.copy(sim.data.mocap_quat[mocap_id])
-    target_mat = np.empty(9)
-    mujoco.mju_quat2Mat(target_mat, target_quat)
-    target_mat = target_mat.reshape(3, 3)
-    return target_pos, target_mat
-
-
-def collect_human_trajectory(env, device, arm, env_configuration, end_effector: str = "right", use_mocap: bool = False):
-    """
-    Use the device (keyboard or SpaceNav 3D mouse) to collect a demonstration.
-    The rollout trajectory is saved to files in npz format.
-    Modify the DataCollectionWrapper wrapper to add new fields or change data formats.
-
-    Args:
-        env (MujocoEnv): environment to control
-        device (Device): to receive controls from the device
-        arms (str): which arm to control (eg bimanual) 'right' or 'left'
-        env_configuration (str): specified environment configuration
-    """
-
-    env.reset()
-    env.render()
-    if use_mocap:
-        site_names: List[str] = env.robots[0].composite_controller.joint_action_policy.site_names
-        right_pos = env.sim.data.site_xpos[env.sim.model.site_name2id(site_names[0])]
-        right_mat = env.sim.data.site_xmat[env.sim.model.site_name2id(site_names[0])]
-        left_pos = env.sim.data.site_xpos[env.sim.model.site_name2id(site_names[1])]
-        left_mat = env.sim.data.site_xmat[env.sim.model.site_name2id(site_names[1])]
-
-        # # Add mocap bodies if they don't exist
-        # if "right_eef_target" not in env.sim.model.body_names:
-        #     add_mocap_body(env.sim.model, "right_eef_target", right_pos)
-        # if "left_eef_target" not in env.sim.model.body_names:
-        #     add_mocap_body(env.sim.model, "left_eef_target", left_pos)
-
-        set_target(env.sim, right_pos, right_mat, "right_eef_target")
-        set_target(env.sim, left_pos, left_mat, "left_eef_target")
-
-    task_completion_hold_count = -1  # counter to collect 10 timesteps after reaching goal
-    device.start_control()
-    env.robots[0].print_action_info_dict()
-    # Loop until we get a reset from the input or the task completes
-
-    while True:
-        # Set active robot
-        active_robot = env.robots[0]  # if env_configuration == "bimanual" else env.robots[arm == "left"]
-
-        # Get the newest action
-        input_action, grasp = input2action(
-            device=device,
-            robot=active_robot,
-            active_arm=arm,
-            active_end_effector=end_effector,
-            env_configuration=env_configuration,
-        )
-
-        # If action is none, then this a reset so we should break
-        if input_action is None:
-            break
-
-        # Run environment step
-        if env.robots[0].is_mobile:
-            arm_actions = input_action[:12].copy() if "bimanual" in env.robots[0].name else input_action[:6].copy()
-            if "GR1" in env.robots[0].name:
-                # "relative" actions by default for now
-                action_dict = {
-                    "robot0_l_eef_site_pos": input_action[:3] * 0.1,
-                    "robot0_l_eef_site_axis_angle": input_action[3:6],
-                    "robot0_r_eef_site_pos": np.zeros(3),
-                    "robot0_r_eef_site_axis_angle": np.zeros(3),
-                    "left_gripper": np.array([0.0] * env.robots[0].gripper["left"].dof),
-                    "right_gripper": np.array([0.0] * env.robots[0].gripper["right"].dof),
-                }
-
-                if use_mocap:
-                    right_pos, right_mat = get_target(env.sim, "right_eef_target")
-                    left_pos, left_mat = get_target(env.sim, "left_eef_target")
-                    # convert mat to quat wxyz
-                    right_quat_wxyz = np.empty(4)
-                    left_quat_wxyz = np.empty(4)
-                    mujoco.mju_mat2Quat(right_quat_wxyz, right_mat.reshape(9, 1))
-                    mujoco.mju_mat2Quat(left_quat_wxyz, left_mat.reshape(9, 1))
-
-                    # convert to quat xyzw
-                    right_quat_xyzw = np.roll(right_quat_wxyz, -1)
-                    left_quat_xyzw = np.roll(left_quat_wxyz, -1)
-                    # convert to axis angle
-                    right_axis_angle = transform_utils.quat2axisangle(right_quat_xyzw)
-                    left_axis_angle = transform_utils.quat2axisangle(left_quat_xyzw)
-
-                    action_dict["robot0_l_eef_site_pos"] = left_pos
-                    action_dict["robot0_r_eef_site_pos"] = right_pos
-                    action_dict["robot0_l_eef_site_axis_angle"] = left_axis_angle
-                    action_dict["robot0_r_eef_site_axis_angle"] = right_axis_angle
-                    action_dict["gripper0_left_grip_site_pos"] = left_pos
-                    action_dict["gripper0_right_grip_site_pos"] = right_pos
-                    action_dict["gripper0_left_grip_site_axis_angle"] = left_axis_angle
-                    action_dict["gripper0_right_grip_site_axis_angle"] = right_axis_angle
-
-            elif "Tiago" in env.robots[0].name:
-                action_dict = {
-                    "right_gripper": np.array([0.0]),
-                    "left_gripper": np.array([0.0]),
-                    "robot0_l_eef_site_pos": np.array([-0.4189254, 0.22745755, 1.0597]) + input_action[:3] * 0.05,
-                    "robot0_l_eef_site_axis_angle": np.array([-2.1356914, 2.50323857, -2.45929076]),
-                    "robot0_r_eef_site_pos": np.array([-0.41931295, -0.22706004, 1.0566]),
-                    "robot0_r_eef_site_axis_angle": np.array([-1.26839518, 1.15421975, 0.99332174]),
-                }
-            else:
-                action_dict = {}
-                base_action = input_action[-5:-2]
-                torso_action = input_action[-2:-1]
-
-                right_action = [0.0] * 5
-                right_action[0] = 0.0
-                action_dict = {
-                    arm: arm_actions,
-                    f"{end_effector}_gripper": np.repeat(input_action[6:7], env.robots[0].gripper[end_effector].dof),
-                    env.robots[0].base: base_action,
-                    # env.robots[0].head: base_action,
-                    # env.robots[0].torso: base_action
-                    # env.robots[0].torso: torso_action
-                }
-
-            action = env.robots[0].create_action_vector(action_dict)
-            mode_action = input_action[-1]
-
-            if mode_action > 0:
-                env.robots[0].enable_parts(base=True, right=True, left=True, torso=True)
-            else:
-                if "GR1FixedLowerBody" in env.robots[0].name or "Tiago" in env.robots[0].name:
-                    env.robots[0].enable_parts(base=False, right=True, left=True, torso=True)
-                else:
-                    env.robots[0].enable_parts(base=False, right=True, left=True, torso=False)
-        else:
-            arm_actions = input_action
-            action = env.robots[0].create_action_vector(
-                {arm: arm_actions[:-1], f"{end_effector}_gripper": arm_actions[-1:]}
-            )
-
-        env.step(action)
-        env.render()
-
-        # Also break if we complete the task
-        if task_completion_hold_count == 0:
-            break
-
-        # state machine to check for having a success for 10 consecutive timesteps
-        if env._check_success():
-            if task_completion_hold_count > 0:
-                task_completion_hold_count -= 1  # latched state, decrement count
-            else:
-                task_completion_hold_count = 10  # reset count on first success timestep
-        else:
-            task_completion_hold_count = -1  # null the counter if there's no success
-
-    # cleanup for end of data collection episodes
-    env.close()
-
-
-def gather_demonstrations_as_hdf5(directory, out_dir, env_info):
-    """
-    Gathers the demonstrations saved in @directory into a
-    single hdf5 file.
-
-    The strucure of the hdf5 file is as follows.
-
-    data (group)
-        date (attribute) - date of collection
-        time (attribute) - time of collection
-        repository_version (attribute) - repository version used during collection
-        env (attribute) - environment name on which demos were collected
-
-        demo1 (group) - every demonstration has a group
-            model_file (attribute) - model xml string for demonstration
-            states (dataset) - flattened mujoco states
-            actions (dataset) - actions applied during demonstration
-
-        demo2 (group)
-        ...
-
-    Args:
-        directory (str): Path to the directory containing raw demonstrations.
-        out_dir (str): Path to where to store the hdf5 file.
-        env_info (str): JSON-encoded string containing environment information,
-            including controller and robot info
-    """
-
-    hdf5_path = os.path.join(out_dir, "demo.hdf5")
-    f = h5py.File(hdf5_path, "w")
-
-    # store some metadata in the attributes of one group
-    grp = f.create_group("data")
-
-    num_eps = 0
-    env_name = None  # will get populated at some point
-
-    for ep_directory in os.listdir(directory):
-
-        state_paths = os.path.join(directory, ep_directory, "state_*.npz")
-        states = []
-        actions = []
-        success = False
-
-        for state_file in sorted(glob(state_paths)):
-            dic = np.load(state_file, allow_pickle=True)
-            env_name = str(dic["env"])
-
-            states.extend(dic["states"])
-            for ai in dic["action_infos"]:
-                actions.append(ai["actions"])
-            success = success or dic["successful"]
-
-        if len(states) == 0:
-            continue
-
-        # Add only the successful demonstration to dataset
-        if success:
-            print("Demonstration is successful and has been saved")
-            # Delete the last state. This is because when the DataCollector wrapper
-            # recorded the states and actions, the states were recorded AFTER playing that action,
-            # so we end up with an extra state at the end.
-            del states[-1]
-            assert len(states) == len(actions)
-
-            num_eps += 1
-            ep_data_grp = grp.create_group("demo_{}".format(num_eps))
-
-            # store model xml as an attribute
-            xml_path = os.path.join(directory, ep_directory, "model.xml")
-            with open(xml_path, "r") as f:
-                xml_str = f.read()
-            ep_data_grp.attrs["model_file"] = xml_str
-
-            # write datasets for states and actions
-            ep_data_grp.create_dataset("states", data=np.array(states))
-            ep_data_grp.create_dataset("actions", data=np.array(actions))
-        else:
-            print("Demonstration is unsuccessful and has NOT been saved")
-
-    # write dataset attributes (metadata)
-    now = datetime.datetime.now()
-    grp.attrs["date"] = "{}-{}-{}".format(now.month, now.day, now.year)
-    grp.attrs["time"] = "{}:{}:{}".format(now.hour, now.minute, now.second)
-    grp.attrs["repository_version"] = suite.__version__
-    grp.attrs["env"] = env_name
-    grp.attrs["env_info"] = env_info
-
-    f.close()
-
-
-if __name__ == "__main__":
-    # Arguments
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--directory",
-        type=str,
-        default=os.path.join(suite.models.assets_root, "demonstrations"),
-    )
-    parser.add_argument("--environment", type=str, default="Lift")
-    parser.add_argument("--robots", nargs="+", type=str, default="Panda", help="Which robot(s) to use in the env")
-    parser.add_argument(
-        "--config", type=str, default="default", help="Specified environment configuration if necessary"
-    )
-    parser.add_argument("--arm", type=str, default="right", help="Which arm to control (eg bimanual) 'right' or 'left'")
-    parser.add_argument("--camera", type=str, default="agentview", help="Which camera to use for collecting demos")
-    parser.add_argument(
-        "--controller", type=str, default="OSC_POSE", help="Choice of controller. Can be 'IK_POSE' or 'OSC_POSE'"
-    )
-    parser.add_argument("--device", type=str, default="keyboard")
-    parser.add_argument("--pos-sensitivity", type=float, default=1.0, help="How much to scale position user inputs")
-    parser.add_argument("--rot-sensitivity", type=float, default=1.0, help="How much to scale rotation user inputs")
-    parser.add_argument(
-        "--renderer",
-        type=str,
-        default="mjviewer",
-        help="Use the Nvisii viewer (Nvisii), OpenCV viewer (mujoco), or Mujoco's builtin interactive viewer (mjviewer)",
-    )
-    args = parser.parse_args()
-
-    # Get controller config
-    composite_controller_config = load_composite_controller_config(controller=args.controller, robot=args.robots[0])
-
-    # Create argument configuration
-    config = {
-        "env_name": args.environment,
-        "robots": args.robots,
-        "controller_configs": composite_controller_config,
-        # "composite_controller_configs": composite_controller_config,
-    }
-
-    # Check if we're using a multi-armed environment and use env_configuration argument if so
-    if "TwoArm" in args.environment:
-        config["env_configuration"] = args.config
-
-    # Create environment
-    env = suite.make(
-        **config,
-        has_renderer=True,
-        renderer=args.renderer,
-        has_offscreen_renderer=False,
-        render_camera=args.camera,
-        ignore_done=True,
-        use_camera_obs=False,
-        reward_shaping=True,
-        control_freq=20,
-    )
-
-    # Wrap this with visualization wrapper
-    env = VisualizationWrapper(env)
-
-    # Grab reference to controller config and convert it to json-encoded string
-    env_info = json.dumps(config)
-
-    # wrap the environment with data collection wrapper
-    tmp_directory = "/tmp/{}".format(str(time.time()).replace(".", "_"))
-    env = DataCollectionWrapper(env, tmp_directory)
-
-    # initialize device
-    if args.device == "keyboard":
-        from robosuite.devices import Keyboard
-
-        device = Keyboard(env=env, pos_sensitivity=args.pos_sensitivity, rot_sensitivity=args.rot_sensitivity)
-    elif args.device == "spacemouse":
-        from robosuite.devices import SpaceMouse
-
-        device = SpaceMouse(env=env, pos_sensitivity=args.pos_sensitivity, rot_sensitivity=args.rot_sensitivity)
-
-    else:
-        raise Exception("Invalid device choice: choose either 'keyboard' or 'spacemouse'.")
-
-    # make a new timestamped directory
-    t1, t2 = str(time.time()).split(".")
-    new_dir = os.path.join(args.directory, "{}_{}".format(t1, t2))
-    os.makedirs(new_dir)
-
-    # collect demonstrations
-    while True:
-        collect_human_trajectory(env, device, args.arm, args.config, use_mocap=True)
-        gather_demonstrations_as_hdf5(tmp_directory, new_dir, env_info)

From c2283fd3b7f10bb1e76a5184d55bc3ada6d4286e Mon Sep 17 00:00:00 2001
From: Kevin Lin <kevin.lin20000@gmail.com>
Date: Fri, 11 Oct 2024 00:54:58 -0500
Subject: [PATCH 03/13] Refactor mink ik to use input_type, input_ref_frame.
 Pending base input_ref_frame

---
 .../composite/composite_controller.py         |   2 +-
 .../default/composite/whole_body_ik.json      |   2 +-
 .../config/robots/default_gr1.json            |   4 +-
 .../default_mink_ik_gr1.json                  |   2 +-
 .../third_party_controller/mink_controller.py | 151 +++++++++---------
 5 files changed, 82 insertions(+), 79 deletions(-)

diff --git a/robosuite/controllers/composite/composite_controller.py b/robosuite/controllers/composite/composite_controller.py
index dcfe81458b..ca098f4c67 100644
--- a/robosuite/controllers/composite/composite_controller.py
+++ b/robosuite/controllers/composite/composite_controller.py
@@ -451,6 +451,6 @@ def _init_joint_action_policy(self):
             max_dq=self.composite_controller_specific_config.get("ik_max_dq", 4),
             max_dq_torso=self.composite_controller_specific_config.get("ik_max_dq_torso", 0.2),
             input_rotation_repr=self.composite_controller_specific_config.get("ik_input_rotation_repr", "axis_angle"),
-            input_action_repr=self.composite_controller_specific_config.get("ik_input_action_repr", "axis_angle"),
+            input_action_repr=self.composite_controller_specific_config.get("ik_input_type", "axis_angle"),
             debug=self.composite_controller_specific_config.get("verbose", False),
         )
diff --git a/robosuite/controllers/config/default/composite/whole_body_ik.json b/robosuite/controllers/config/default/composite/whole_body_ik.json
index 861acefebd..55799b94f8 100644
--- a/robosuite/controllers/config/default/composite/whole_body_ik.json
+++ b/robosuite/controllers/config/default/composite/whole_body_ik.json
@@ -20,7 +20,7 @@
         "ik_integration_dt": 1e-1,
         "ik_max_dq": 4.0,
         "ik_max_dq_torso": 0.2,
-        "ik_input_action_repr": "absolute",
+        "ik_input_type": "absolute",
         "ik_input_rotation_repr": "axis_angle",
         "verbose": false
     },
diff --git a/robosuite/controllers/config/robots/default_gr1.json b/robosuite/controllers/config/robots/default_gr1.json
index 92fd86a92f..528929871a 100644
--- a/robosuite/controllers/config/robots/default_gr1.json
+++ b/robosuite/controllers/config/robots/default_gr1.json
@@ -20,9 +20,9 @@
         "ik_integration_dt": 1e-1,
         "ik_max_dq": 4.0,
         "ik_max_dq_torso": 0.2,
-        "ik_input_action_repr": "absolute",
+        "ik_input_type": "absolute",
         "ik_input_rotation_repr": "axis_angle",
-        "verbose": true
+        "ik_verbose": true
     },
     "body_parts_controller_configs": {
         "arms": {
diff --git a/robosuite/examples/third_party_controller/default_mink_ik_gr1.json b/robosuite/examples/third_party_controller/default_mink_ik_gr1.json
index 1bb7fb7806..bccc8cc048 100644
--- a/robosuite/examples/third_party_controller/default_mink_ik_gr1.json
+++ b/robosuite/examples/third_party_controller/default_mink_ik_gr1.json
@@ -7,7 +7,7 @@
             "max_dq": 4,
             "ik_pseudo_inverse_damping": 5e-2,
             "ik_integration_dt": 1e-1,
-            "ik_input_action_repr": "absolute",
+            "ik_input_type": "absolute",
             "ik_input_rotation_repr": "axis_angle",
             "verbose": false,
             "ik_posture_weights": {
diff --git a/robosuite/examples/third_party_controller/mink_controller.py b/robosuite/examples/third_party_controller/mink_controller.py
index a9cde3c922..1593f57c45 100644
--- a/robosuite/examples/third_party_controller/mink_controller.py
+++ b/robosuite/examples/third_party_controller/mink_controller.py
@@ -149,8 +149,9 @@ def __init__(
         site_names: List[str],
         robot_model: mujoco.MjModel,
         robot_joint_names: Optional[List[str]] = None,
-        debug: bool = False,
-        input_action_repr: Literal["absolute", "relative", "relative_pose"] = "absolute",
+        verbose: bool = False,
+        input_type: Literal["absolute", "relative", "relative_pose"] = "absolute",
+        input_ref_frame: Literal["world", "base", "eef"] = "world",
         input_rotation_repr: Literal["quat_wxyz", "axis_angle"] = "axis_angle",
         posture_weights: Dict[str, float] = None,
         solve_freq: float = 20.0,
@@ -175,8 +176,7 @@ def __init__(
                 for i in range(self.robot_model.njnt)
                 if self.robot_model.joint(i).type != 0
             ]  # Exclude fixed joints
-        # get all ids for robot bodies
-        # self.robot_body_ids: List[int] = np.array([self.robot_model.body(name).id for name in robot_joint_names])
+
         self.full_model_dof_ids: List[int] = np.array([self.full_model.joint(name).id for name in robot_joint_names])
 
         self.robot_model_dof_ids: List[int] = np.array([self.robot_model.joint(name).id for name in robot_joint_names])
@@ -191,7 +191,8 @@ def __init__(
 
         self.solve_freq = solve_freq
 
-        self.input_action_repr = input_action_repr
+        self.input_type = input_type
+        self.input_ref_frame = input_ref_frame
         self.input_rotation_repr = input_rotation_repr
         ROTATION_REPRESENTATION_DIMS: Dict[str, int] = {"quat_wxyz": 4, "axis_angle": 3}
         self.rot_dim = ROTATION_REPRESENTATION_DIMS[input_rotation_repr]
@@ -201,8 +202,8 @@ def __init__(
         self.control_limits = np.array([-np.inf] * self.control_dim), np.array([np.inf] * self.control_dim)
 
         self.i = 0
-        self.debug = debug
-        if debug:
+        self.verbose = verbose
+        if verbose:
             self.task_errors: List[np.ndarray] = []
             self.trask_translation_errors: List[np.ndarray] = []
             self.task_orientation_errors: List[np.ndarray] = []
@@ -258,83 +259,85 @@ def action_split_indexes(self) -> Dict[str, Tuple[int, int]]:
 
         return action_split_indexes
 
-    def solve(self, target_action: Optional[np.ndarray] = None) -> np.ndarray:
-        # update configuration's data to match self.data for the joints we care about
+    def solve(self, target_action: np.ndarray) -> np.ndarray:
+        """
+        Solve for the joint angles that achieve the desired target action.
+        
+        We assume target_action is specified as self.input_type, self.input_ref_frame, self.input_rotation_repr.
+        We also assume target_action has the following format: [site1_pos, site1_rot, site2_pos, site2_rot, ...].
+        """
+        # update configuration's base to match actual base
         self.configuration.model.body("robot0_base").pos = self.full_model.body("robot0_base").pos
         self.configuration.model.body("robot0_base").quat = self.full_model.body("robot0_base").quat
+        # update configuration's qpos to match actual qpos
         self.configuration.update(
             self.full_model_data.qpos[self.full_model_dof_ids], update_idxs=self.robot_model_dof_ids
         )
 
-        if target_action is not None:
-            target_action = target_action.reshape(len(self.site_names), -1)
-            target_pos = target_action[:, : self.pos_dim]
-            target_ori = target_action[:, self.pos_dim :]
-            target_quat_wxyz = None
-
-            if self.input_rotation_repr == "axis_angle":
-                target_quat_wxyz = np.array(
-                    [np.roll(T.axisangle2quat(target_ori[i]), 1) for i in range(len(target_ori))]
-                )
-            elif self.input_rotation_repr == "mat":
-                target_quat_wxyz = np.array([np.roll(T.mat2quat(target_ori[i])) for i in range(len(target_ori))])
-            elif self.input_rotation_repr == "quat_wxyz":
-                target_quat_wxyz = target_ori
-
-            if "relative" in self.input_action_repr:
-                cur_pos = np.array([self.configuration.data.site(site_id).xpos for site_id in self.site_ids])
-                cur_ori = np.array([self.configuration.data.site(site_id).xmat for site_id in self.site_ids])
-            if self.input_action_repr == "relative":
-                # decoupled pos and rotation deltas
-                target_pos += cur_pos
-                target_quat_xyzw = np.array(
-                    [
-                        T.quat_multiply(T.mat2quat(cur_ori[i].reshape(3, 3)), np.roll(target_quat_wxyz[i], -1))
-                        for i in range(len(self.site_ids))
-                    ]
-                )
-                target_quat_wxyz = np.array([np.roll(target_quat_xyzw[i], shift=1) for i in range(len(self.site_ids))])
-            elif self.input_action_repr == "relative_pose":
-                cur_poses = np.zeros((len(self.site_ids), 4, 4))
-                for i in range(len(self.site_ids)):
-                    cur_poses[i, :3, :3] = cur_ori[i].reshape(3, 3)
-                    cur_poses[i, :3, 3] = cur_pos[i]
-                    cur_poses[i, 3, :] = [0, 0, 0, 1]
-
-                # Convert target action to target pose
-                target_poses = np.zeros_like(cur_poses)
-                for i in range(len(self.site_ids)):
-                    target_poses[i, :3, :3] = T.quat2mat(target_quat_wxyz[i])
-                    target_poses[i, :3, 3] = target_pos[i]
-                    target_poses[i, 3, :] = [0, 0, 0, 1]
-
-                # Apply target pose to current pose
-                new_target_poses = np.array([np.dot(cur_poses[i], target_poses[i]) for i in range(len(self.site_ids))])
-
-                # Split new target pose back into position and quaternion
-                target_pos = new_target_poses[:, :3, 3]
-                target_quat_wxyz = np.array(
-                    [np.roll(T.mat2quat(new_target_poses[i, :3, :3]), shift=1) for i in range(len(self.site_ids))]
-                )
-
-            # create targets list shape is n_sites, 4, 4
-            targets = [np.eye(4) for _ in range(len(self.site_names))]
-            for i, (pos, quat_wxyz) in enumerate(zip(target_pos, target_quat_wxyz)):
-                targets[i][:3, 3] = pos
-                targets[i][:3, :3] = T.quat2mat(np.roll(quat_wxyz, -1))
-
-            # set target poses
-            for task, target in zip(self.hand_tasks, targets):
-                with suppress_stdout():
-                    se3_target = mink.SE3.from_matrix(target)
-                task.set_target(se3_target)
+        target_action = target_action.reshape(len(self.site_names), -1)
+        target_pos = target_action[:, : self.pos_dim]
+        target_ori = target_action[:, self.pos_dim :]
+
+        target_quat_wxyz = None
+        if self.input_rotation_repr == "axis_angle":
+            target_quat_wxyz = np.array(
+                [np.roll(T.axisangle2quat(target_ori[i]), 1) for i in range(len(target_ori))]
+            )
+        elif self.input_rotation_repr == "mat":
+            target_quat_wxyz = np.array([np.roll(T.mat2quat(target_ori[i])) for i in range(len(target_ori))])
+        elif self.input_rotation_repr == "quat_wxyz":
+            target_quat_wxyz = target_ori
+
+        if "delta" in self.input_type:
+            cur_pos = np.array([self.configuration.data.site(site_id).xpos for site_id in self.site_ids])
+            cur_ori = np.array([self.configuration.data.site(site_id).xmat for site_id in self.site_ids])
+        if self.input_type == "delta":
+            # decoupled pos and rotation deltas
+            target_pos += cur_pos
+            target_quat_xyzw = np.array(
+                [
+                    T.quat_multiply(T.mat2quat(cur_ori[i].reshape(3, 3)), np.roll(target_quat_wxyz[i], -1))
+                    for i in range(len(self.site_ids))
+                ]
+            )
+            target_quat_wxyz = np.array([np.roll(target_quat_xyzw[i], shift=1) for i in range(len(self.site_ids))])
+        elif self.input_type == "delta_pose":
+            cur_poses = np.zeros((len(self.site_ids), 4, 4))
+            for i in range(len(self.site_ids)):
+                cur_poses[i] = T.make_pose(cur_pos[i], cur_ori[i])
+
+            # Convert target action to target pose
+            delta_poses = np.zeros_like(cur_poses)
+            for i in range(len(self.site_ids)):
+                delta_poses[i] = T.make_pose(target_pos[i], T.quat2mat(np.roll(target_quat_wxyz[i], -1)))
+
+            # Apply target pose to current pose
+            new_target_poses = np.array([np.dot(cur_poses[i], delta_poses[i]) for i in range(len(self.site_ids))])
+
+            # Split new target pose back into position and quaternion
+            target_pos = new_target_poses[:, :3, 3]
+            target_quat_wxyz = np.array(
+                [np.roll(T.mat2quat(new_target_poses[i, :3, :3]), shift=1) for i in range(len(self.site_ids))]
+            )
+
+        # create targets list shape is n_sites, 4, 4
+        targets = [np.eye(4) for _ in range(len(self.site_names))]
+        for i, (pos, quat_wxyz) in enumerate(zip(target_pos, target_quat_wxyz)):
+            targets[i][:3, 3] = pos
+            targets[i][:3, :3] = T.quat2mat(np.roll(quat_wxyz, -1))
+
+        # set target poses
+        for task, target in zip(self.hand_tasks, targets):
+            with suppress_stdout():
+                se3_target = mink.SE3.from_matrix(target)
+            task.set_target(se3_target)
 
         with suppress_stdout():
             vel = mink.solve_ik(self.configuration, self.tasks, 1 / self.solve_freq, self.solver, 1e-5)
         self.configuration.integrate_inplace(vel, 1 / self.solve_freq)
 
         self.i += 1
-        if self.debug and self.i % 1 == 0:
+        if self.verbose and self.i % 1 == 0:
             task_errors = self._get_task_errors()
             task_translation_errors = self._get_task_translation_errors()
             task_orientation_errors = self._get_task_orientation_errors()
@@ -395,11 +398,11 @@ def _init_joint_action_policy(self):
                 if "ref_name" in self.composite_controller_specific_config else default_site_names,
             robot_model=self.robot_model.mujoco_model,
             robot_joint_names=joint_names,
-            input_action_repr=self.composite_controller_specific_config.get("ik_input_action_repr", "absolute"),
+            input_type=self.composite_controller_specific_config.get("ik_input_type", "absolute"),
             input_rotation_repr=self.composite_controller_specific_config.get("ik_input_rotation_repr", "axis_angle"),
             solve_freq=self.composite_controller_specific_config.get("ik_solve_freq", 20),
             posture_weights=self.composite_controller_specific_config.get("ik_posture_weights", {}),
             hand_pos_cost=self.composite_controller_specific_config.get("ik_hand_pos_cost", 1.0),
             hand_ori_cost=self.composite_controller_specific_config.get("ik_hand_ori_cost", 0.5),
-            debug=self.composite_controller_specific_config.get("verbose", False),
+            verbose=self.composite_controller_specific_config.get("ik_verbose", False),
         )

From ad460e851d22ef28e47e83e9408e9c26862a538b Mon Sep 17 00:00:00 2001
From: Kevin Lin <kevin.lin20000@gmail.com>
Date: Fri, 11 Oct 2024 01:05:34 -0500
Subject: [PATCH 04/13] Refactor mink controller solve() + add base input type

---
 .../composite/composite_controller.py         |  2 +-
 .../third_party_controller/mink_controller.py | 50 ++++++++++++-------
 2 files changed, 34 insertions(+), 18 deletions(-)

diff --git a/robosuite/controllers/composite/composite_controller.py b/robosuite/controllers/composite/composite_controller.py
index ca098f4c67..0a6fa27e0f 100644
--- a/robosuite/controllers/composite/composite_controller.py
+++ b/robosuite/controllers/composite/composite_controller.py
@@ -451,6 +451,6 @@ def _init_joint_action_policy(self):
             max_dq=self.composite_controller_specific_config.get("ik_max_dq", 4),
             max_dq_torso=self.composite_controller_specific_config.get("ik_max_dq_torso", 0.2),
             input_rotation_repr=self.composite_controller_specific_config.get("ik_input_rotation_repr", "axis_angle"),
-            input_action_repr=self.composite_controller_specific_config.get("ik_input_type", "axis_angle"),
+            input_type=self.composite_controller_specific_config.get("ik_input_type", "axis_angle"),
             debug=self.composite_controller_specific_config.get("verbose", False),
         )
diff --git a/robosuite/examples/third_party_controller/mink_controller.py b/robosuite/examples/third_party_controller/mink_controller.py
index 1593f57c45..255c7dcd1f 100644
--- a/robosuite/examples/third_party_controller/mink_controller.py
+++ b/robosuite/examples/third_party_controller/mink_controller.py
@@ -259,12 +259,12 @@ def action_split_indexes(self) -> Dict[str, Tuple[int, int]]:
 
         return action_split_indexes
 
-    def solve(self, target_action: np.ndarray) -> np.ndarray:
+    def solve(self, input_action: np.ndarray) -> np.ndarray:
         """
         Solve for the joint angles that achieve the desired target action.
         
-        We assume target_action is specified as self.input_type, self.input_ref_frame, self.input_rotation_repr.
-        We also assume target_action has the following format: [site1_pos, site1_rot, site2_pos, site2_rot, ...].
+        We assume input_action is specified as self.input_type, self.input_ref_frame, self.input_rotation_repr.
+        We also assume input_action has the following format: [site1_pos, site1_rot, site2_pos, site2_rot, ...].
         """
         # update configuration's base to match actual base
         self.configuration.model.body("robot0_base").pos = self.full_model.body("robot0_base").pos
@@ -274,29 +274,40 @@ def solve(self, target_action: np.ndarray) -> np.ndarray:
             self.full_model_data.qpos[self.full_model_dof_ids], update_idxs=self.robot_model_dof_ids
         )
 
-        target_action = target_action.reshape(len(self.site_names), -1)
-        target_pos = target_action[:, : self.pos_dim]
-        target_ori = target_action[:, self.pos_dim :]
+        input_action = input_action.reshape(len(self.site_names), -1)
+        input_pos = input_action[:, : self.pos_dim]
+        input_ori = input_action[:, self.pos_dim :]
 
-        target_quat_wxyz = None
+        input_quat_wxyz = None
         if self.input_rotation_repr == "axis_angle":
-            target_quat_wxyz = np.array(
-                [np.roll(T.axisangle2quat(target_ori[i]), 1) for i in range(len(target_ori))]
+            input_quat_wxyz = np.array(
+                [np.roll(T.axisangle2quat(input_ori[i]), 1) for i in range(len(input_ori))]
             )
         elif self.input_rotation_repr == "mat":
-            target_quat_wxyz = np.array([np.roll(T.mat2quat(target_ori[i])) for i in range(len(target_ori))])
+            input_quat_wxyz = np.array([np.roll(T.mat2quat(input_ori[i])) for i in range(len(input_ori))])
         elif self.input_rotation_repr == "quat_wxyz":
-            target_quat_wxyz = target_ori
+            input_quat_wxyz = input_ori
+
+        if self.input_ref_frame == "base":
+            input_poses = np.zeros((len(self.site_ids), 4, 4))
+            for i in range(len(self.site_ids)):
+                base_pos = self.configuration.data.body("robot0_base").xpos
+                base_ori = self.configuration.data.body("robot0_base").xmat
+                base_pose = T.make_pose(base_pos, base_ori)
+                input_pose = T.make_pose(input_pos[i], T.quat2mat(np.roll(input_quat_wxyz[i], -1)))
+                input_poses[i] = np.dot(base_pose, input_pose)
+            input_pos = input_poses[:, :3, 3]
+            input_quat_wxyz = np.array([np.roll(T.mat2quat(input_poses[i, :3, :3]), shift=1) for i in range(len(self.site_ids))])
 
         if "delta" in self.input_type:
             cur_pos = np.array([self.configuration.data.site(site_id).xpos for site_id in self.site_ids])
             cur_ori = np.array([self.configuration.data.site(site_id).xmat for site_id in self.site_ids])
         if self.input_type == "delta":
             # decoupled pos and rotation deltas
-            target_pos += cur_pos
+            target_pos = input_pos + cur_pos
             target_quat_xyzw = np.array(
                 [
-                    T.quat_multiply(T.mat2quat(cur_ori[i].reshape(3, 3)), np.roll(target_quat_wxyz[i], -1))
+                    T.quat_multiply(T.mat2quat(cur_ori[i].reshape(3, 3)), np.roll(input_quat_wxyz[i], -1))
                     for i in range(len(self.site_ids))
                 ]
             )
@@ -309,16 +320,21 @@ def solve(self, target_action: np.ndarray) -> np.ndarray:
             # Convert target action to target pose
             delta_poses = np.zeros_like(cur_poses)
             for i in range(len(self.site_ids)):
-                delta_poses[i] = T.make_pose(target_pos[i], T.quat2mat(np.roll(target_quat_wxyz[i], -1)))
+                delta_poses[i] = T.make_pose(input_pos[i], T.quat2mat(np.roll(input_quat_wxyz[i], -1)))
 
             # Apply target pose to current pose
-            new_target_poses = np.array([np.dot(cur_poses[i], delta_poses[i]) for i in range(len(self.site_ids))])
+            target_poses = np.array([np.dot(cur_poses[i], delta_poses[i]) for i in range(len(self.site_ids))])
 
             # Split new target pose back into position and quaternion
-            target_pos = new_target_poses[:, :3, 3]
+            target_pos = target_poses[:, :3, 3]
             target_quat_wxyz = np.array(
-                [np.roll(T.mat2quat(new_target_poses[i, :3, :3]), shift=1) for i in range(len(self.site_ids))]
+                [np.roll(T.mat2quat(target_poses[i, :3, :3]), shift=1) for i in range(len(self.site_ids))]
             )
+        elif self.input_type == "absolute":
+            target_pos = input_pos
+            target_quat_wxyz = input_quat_wxyz
+        else:
+            raise ValueError(f"Invalid input type: {self.input_type}")
 
         # create targets list shape is n_sites, 4, 4
         targets = [np.eye(4) for _ in range(len(self.site_names))]

From 5cadf222186b02b13e6c4a538bcd7e62980a21f3 Mon Sep 17 00:00:00 2001
From: Kevin Lin <kevin.lin20000@gmail.com>
Date: Fri, 11 Oct 2024 01:15:07 -0500
Subject: [PATCH 05/13] Document mink more

---
 robosuite/examples/third_party_controller/mink_controller.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/robosuite/examples/third_party_controller/mink_controller.py b/robosuite/examples/third_party_controller/mink_controller.py
index 255c7dcd1f..d9d8467135 100644
--- a/robosuite/examples/third_party_controller/mink_controller.py
+++ b/robosuite/examples/third_party_controller/mink_controller.py
@@ -265,6 +265,9 @@ def solve(self, input_action: np.ndarray) -> np.ndarray:
         
         We assume input_action is specified as self.input_type, self.input_ref_frame, self.input_rotation_repr.
         We also assume input_action has the following format: [site1_pos, site1_rot, site2_pos, site2_rot, ...].
+
+        By updating configuration's bose to match the actual base pose (in 'world' frame),
+        we're requiring our tasks' targets to be in the 'world' frame for mink.solve_ik().
         """
         # update configuration's base to match actual base
         self.configuration.model.body("robot0_base").pos = self.full_model.body("robot0_base").pos
@@ -298,6 +301,8 @@ def solve(self, input_action: np.ndarray) -> np.ndarray:
                 input_poses[i] = np.dot(base_pose, input_pose)
             input_pos = input_poses[:, :3, 3]
             input_quat_wxyz = np.array([np.roll(T.mat2quat(input_poses[i, :3, :3]), shift=1) for i in range(len(self.site_ids))])
+        elif self.input_ref_frame == "eef":
+            raise NotImplementedError("Input reference frame 'eef' not yet implemented")
 
         if "delta" in self.input_type:
             cur_pos = np.array([self.configuration.data.site(site_id).xpos for site_id in self.site_ids])

From 2530347e816f24f52345cd7f3727e35c107aeacd Mon Sep 17 00:00:00 2001
From: Kevin Lin <kevin.lin20000@gmail.com>
Date: Fri, 11 Oct 2024 10:52:59 -0500
Subject: [PATCH 06/13] Add transform fn skeleton

---
 .../third_party_controller/mink_controller.py         | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/robosuite/examples/third_party_controller/mink_controller.py b/robosuite/examples/third_party_controller/mink_controller.py
index d9d8467135..3660718fcf 100644
--- a/robosuite/examples/third_party_controller/mink_controller.py
+++ b/robosuite/examples/third_party_controller/mink_controller.py
@@ -259,6 +259,17 @@ def action_split_indexes(self) -> Dict[str, Tuple[int, int]]:
 
         return action_split_indexes
 
+    # TODO: implement this method
+    def transform_frame(self, pose: np.ndarray, src_ref_frame: Literal['world', 'base'], dst_ref_frame: Literal["world", "base"]) -> np.ndarray:
+        raise NotImplementedError("transform_frame not yet implemented. Below is buggy.")
+        if src_ref_frame == 'base':
+            src_transform = self.configuration.get_transform_frame_to_world("robot0_base", src_ref_frame)
+            pose = src_transform.dot(pose)
+        if dst_ref_frame == 'base':
+            dst_transform = self.configuration.get_transform_frame_to_world("robot0_base", dst_ref_frame).inv()
+            pose = dst_transform.dot(pose)
+        return pose
+
     def solve(self, input_action: np.ndarray) -> np.ndarray:
         """
         Solve for the joint angles that achieve the desired target action.

From 1d369281ba8ca25ef64da9d53ab5ac6b9937d3b9 Mon Sep 17 00:00:00 2001
From: Kevin Lin <kevin.lin20000@gmail.com>
Date: Fri, 11 Oct 2024 18:40:46 -0500
Subject: [PATCH 07/13] Simply site names for IKSolver too

---
 robosuite/utils/ik_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/robosuite/utils/ik_utils.py b/robosuite/utils/ik_utils.py
index 150c41e02d..03d9ae17c0 100644
--- a/robosuite/utils/ik_utils.py
+++ b/robosuite/utils/ik_utils.py
@@ -102,8 +102,8 @@ def action_split_indexes(self) -> Dict[str, Tuple[int, int]]:
         for site_name in self.site_names:
             total_dim = self.pos_dim + self.rot_dim
             last_idx = previous_idx + total_dim
-            action_split_indexes[site_name + "_pos"] = (previous_idx, previous_idx + self.pos_dim)
-            action_split_indexes[site_name + f"_{self.input_rotation_repr}"] = (previous_idx + self.pos_dim, last_idx)
+            simplified_site_name = "left" if "left" in site_name else "right"  # hack to simplify site names
+            action_split_indexes[simplified_site_name] = (previous_idx, last_idx)
             previous_idx = last_idx
 
         return action_split_indexes

From c09571c2e67c2310d5cc556639dc1512d2e29820 Mon Sep 17 00:00:00 2001
From: Kevin Lin <kevin.lin20000@gmail.com>
Date: Tue, 22 Oct 2024 19:48:45 -0500
Subject: [PATCH 08/13] Add mjgui 'device' - may need better name

---
 robosuite/devices/mjgui.py | 109 +++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 robosuite/devices/mjgui.py

diff --git a/robosuite/devices/mjgui.py b/robosuite/devices/mjgui.py
new file mode 100644
index 0000000000..14eef4c200
--- /dev/null
+++ b/robosuite/devices/mjgui.py
@@ -0,0 +1,109 @@
+from typing import Dict, List, Optional, Tuple
+import numpy as np
+from robosuite.devices import Device
+from robosuite.utils.transform_utils import rotation_matrix
+import mujoco
+
+
+def set_mocap_pose(
+    sim, 
+    target_pos: Optional[np.ndarray] = None, 
+    target_mat: Optional[np.ndarray] = None, 
+    mocap_name: str = "target"
+):
+    mocap_id = sim.model.body(mocap_name).mocapid[0]
+    if target_pos is not None:
+        sim.data.mocap_pos[mocap_id] = target_pos
+    if target_mat is not None:
+        # convert mat to quat
+        target_quat = np.empty(4)
+        mujoco.mju_mat2Quat(target_quat, target_mat.reshape(9, 1))
+        sim.data.mocap_quat[mocap_id] = target_quat
+
+def get_mocap_pose(sim, mocap_name: str = "target") -> Tuple[np.ndarray, np.ndarray]:
+    mocap_id = sim.model.body(mocap_name).mocapid[0]
+    target_pos = np.copy(sim.data.mocap_pos[mocap_id])
+    target_quat = np.copy(sim.data.mocap_quat[mocap_id])
+    target_mat = np.empty(9)
+    mujoco.mju_quat2Mat(target_mat, target_quat)
+    target_mat = target_mat.reshape(3, 3)
+    return target_pos, target_mat
+
+
+class MJGUI(Device):
+    """
+    Class for 'device' involving mujoco viewer and mocap bodies being dragged by users mouse.
+
+    Args:
+        env (RobotEnv): The environment which contains the robot(s) to control
+                        using this device.
+    """
+
+    def __init__(self, env):
+        super().__init__(env)
+
+        self._display_controls()
+        self._reset_internal_state()
+
+        self._reset_state = 0
+        self._enabled = False
+
+    @staticmethod
+    def _display_controls():
+        """
+        Method to pretty print controls.
+        """
+        print("")
+        print("Mujoco viewer UI mouse 'device'. Use the mouse to drag mocap bodies. We use the mocap's coordinates " \
+              "to output actions.")
+        print("")
+
+    def start_control(self):
+        """
+        Method that should be called externally before controller can
+        start receiving commands.
+        """
+        self._reset_internal_state()
+        self._reset_state = 0
+        self._enabled = True
+
+    def get_controller_state(self):
+        """
+        Grabs the current state of the keyboard.
+        Returns:
+            dict: A dictionary containing dpos, orn, unmodified orn, grasp, and reset
+        """
+        return dict()
+
+    def _reset_internal_state(self):
+        """
+        Resets internal state related to robot control
+        """
+        super()._reset_internal_state()
+        self.grasp_states = [[False] * len(self.all_robot_arms[i]) for i in range(self.num_robots)]
+        self.active_arm_indices = [0] * len(self.all_robot_arms)
+        self.active_robot = 0
+        self.base_modes = [False] * len(self.all_robot_arms)
+
+        site_names: List[str] = self.env.robots[0].composite_controller.joint_action_policy.site_names
+        for site_name in site_names:
+            target_name_prefix = "right" if "right" in site_name else "left"  # hardcoded for now
+            target_pos = self.env.sim.data.site_xpos[self.env.sim.model.site_name2id(site_name)]
+            target_mat = self.env.sim.data.site_xmat[self.env.sim.model.site_name2id(site_name)]
+            set_mocap_pose(self.env.sim, target_pos, target_mat, f"{target_name_prefix}_eef_target")
+
+    def input2action(self) -> Dict[str, np.ndarray]:
+        """
+        Uses mocap body poses to determine action for robot. Obtain input_type 
+        (i.e. absolute actions or delta actions) and input_ref_frame (i.e. world frame, base frame or eef frame)
+        from the controller itself.
+        """
+        action: Dict[str, np.ndarray] = {}
+        site_names: List[str] = self.env.robots[0].composite_controller.joint_action_policy.site_names
+        for site_name in site_names:
+            target_name_prefix = "right" if "right" in site_name else "left"  # hardcoded for now
+            target_pos_world, target_ori_mat_world = get_mocap_pose(self.env.sim, f"{target_name_prefix}_eef_target")
+
+        # TODO: update action frames.
+        # now convert actions to desired frames (take from controller)
+        return action

From 3ead4a327da412c6d15b8d0efe3b77a40e3dd043 Mon Sep 17 00:00:00 2001
From: Kevin Lin <kevin.lin20000@gmail.com>
Date: Tue, 22 Oct 2024 19:49:10 -0500
Subject: [PATCH 09/13] Add if statement for device type in teleop mink

---
 robosuite/examples/third_party_controller/teleop_mink.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/robosuite/examples/third_party_controller/teleop_mink.py b/robosuite/examples/third_party_controller/teleop_mink.py
index 00e865826f..ab9d3e6295 100644
--- a/robosuite/examples/third_party_controller/teleop_mink.py
+++ b/robosuite/examples/third_party_controller/teleop_mink.py
@@ -16,6 +16,7 @@
 from robosuite.controllers import load_composite_controller_config
 
 # mink-related imports
+from robosuite.devices.mjgui import MJGUI
 from robosuite.examples.third_party_controller.mink_controller import WholeBodyMinkIK
 from robosuite.utils import transform_utils
 from robosuite.utils.input_utils import input2action
@@ -185,7 +186,9 @@ def collect_human_trajectory(env, device, arm, env_configuration, end_effector:
     # wrap the environment with data collection wrapper
     tmp_directory = "teleop-mink-data/{}".format(str(time.time()).replace(".", "_"))
     env = DataCollectionWrapper(env, tmp_directory)  # need this wrapper's reset for UI mocap dragging to work
-    device = Keyboard(env=env, pos_sensitivity=args.pos_sensitivity, rot_sensitivity=args.rot_sensitivity)
-
+    if args.device == "keyboard":
+        device = Keyboard(env=env, pos_sensitivity=args.pos_sensitivity, rot_sensitivity=args.rot_sensitivity)
+    elif args.device == "mjgui":
+        device = MJGUI(env=env)
     while True:
         collect_human_trajectory(env, device, args.arm, args.config)
\ No newline at end of file

From bbde4818c4d5d26b799b42fef88a0b8760ae42f0 Mon Sep 17 00:00:00 2001
From: Kevin Lin <kevin.lin20000@gmail.com>
Date: Tue, 22 Oct 2024 20:26:23 -0500
Subject: [PATCH 10/13] Update mjgui and teleop mink for world abs actions ---
 working

---
 robosuite/devices/mjgui.py                    | 12 +++++++++-
 .../third_party_controller/teleop_mink.py     | 23 ++-----------------
 2 files changed, 13 insertions(+), 22 deletions(-)

diff --git a/robosuite/devices/mjgui.py b/robosuite/devices/mjgui.py
index 14eef4c200..d13a60b5d9 100644
--- a/robosuite/devices/mjgui.py
+++ b/robosuite/devices/mjgui.py
@@ -1,6 +1,7 @@
 from typing import Dict, List, Optional, Tuple
 import numpy as np
 from robosuite.devices import Device
+from robosuite.utils import transform_utils
 from robosuite.utils.transform_utils import rotation_matrix
 import mujoco
 
@@ -39,7 +40,7 @@ class MJGUI(Device):
                         using this device.
     """
 
-    def __init__(self, env):
+    def __init__(self, env, active_end_effector: Optional[str] = "right"):
         super().__init__(env)
 
         self._display_controls()
@@ -48,6 +49,8 @@ def __init__(self, env):
         self._reset_state = 0
         self._enabled = False
 
+        self.active_end_effector = active_end_effector
+
     @staticmethod
     def _display_controls():
         """
@@ -99,10 +102,17 @@ def input2action(self) -> Dict[str, np.ndarray]:
         from the controller itself.
         """
         action: Dict[str, np.ndarray] = {}
+        gripper_dof = self.env.robots[0].gripper[self.active_end_effector].dof
         site_names: List[str] = self.env.robots[0].composite_controller.joint_action_policy.site_names
         for site_name in site_names:
             target_name_prefix = "right" if "right" in site_name else "left"  # hardcoded for now
             target_pos_world, target_ori_mat_world = get_mocap_pose(self.env.sim, f"{target_name_prefix}_eef_target")
+            # convert ori mat to axis angle
+            axis_angle_target = transform_utils.quat2axisangle(transform_utils.mat2quat(target_ori_mat_world))
+            action[target_name_prefix] = np.concatenate([target_pos_world, axis_angle_target])
+
+            grasp = 1  # hardcode grasp action for now
+            action[f"{target_name_prefix}_gripper"] = np.array([grasp] * gripper_dof)
 
         # TODO: update action frames.
         # now convert actions to desired frames (take from controller)
diff --git a/robosuite/examples/third_party_controller/teleop_mink.py b/robosuite/examples/third_party_controller/teleop_mink.py
index ab9d3e6295..a5381b2df1 100644
--- a/robosuite/examples/third_party_controller/teleop_mink.py
+++ b/robosuite/examples/third_party_controller/teleop_mink.py
@@ -18,8 +18,6 @@
 # mink-related imports
 from robosuite.devices.mjgui import MJGUI
 from robosuite.examples.third_party_controller.mink_controller import WholeBodyMinkIK
-from robosuite.utils import transform_utils
-from robosuite.utils.input_utils import input2action
 from robosuite.wrappers import DataCollectionWrapper
 from robosuite.devices.keyboard import Keyboard
 
@@ -95,30 +93,13 @@ def collect_human_trajectory(env, device, arm, env_configuration, end_effector:
         # Check if we have gripper actions for the active arm
         arm_using_gripper = f"{arm}_gripper" in all_prev_gripper_actions[device.active_robot]
         # Get the newest action
-        input_action, grasp = input2action(
-            device=device,
-            robot=active_robot,
-            active_arm=arm,
-            active_end_effector=end_effector,
-            env_configuration=env_configuration,
-        )
-
-        # If action is none, then this a reset so we should break
-        if input_action is None:
-            break
-
-        action_dict = {}
-        for arm in active_robot.arms:
-            pos_target, mat_target = get_target_pose(env.sim, f"{arm}_eef_target")
-            axis_angle_target = transform_utils.quat2axisangle(transform_utils.mat2quat(mat_target))
-            action_dict[arm] = np.concatenate([pos_target, axis_angle_target])
+        action_dict = device.input2action()
 
         for gripper_name, gripper in active_robot.gripper.items():
             action_dict[f"{gripper_name}_gripper"] = np.zeros(gripper.dof)  # what's the 'do nothing' action for all grippers?
 
         if arm_using_gripper:
-            action_dict[f"{arm}_gripper"] = np.repeat(input_action[6:7], active_robot.gripper[arm].dof)
-            prev_gripper_actions[f"{arm}_gripper"] = np.repeat(input_action[6:7], active_robot.gripper[arm].dof)
+            prev_gripper_actions[f"{arm}_gripper"] = action_dict[f"{arm}_gripper"]
 
         # Maintain gripper state for each robot but only update the active robot with action
         env_action = [robot.create_action_vector(all_prev_gripper_actions[i]) for i, robot in enumerate(env.robots)]

From 3bca50c7fad7e022b687081af66ace8662891543 Mon Sep 17 00:00:00 2001
From: Kevin Lin <kevin.lin20000@gmail.com>
Date: Tue, 22 Oct 2024 20:27:19 -0500
Subject: [PATCH 11/13] Add teleop_mink new command

---
 robosuite/examples/third_party_controller/teleop_mink.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/robosuite/examples/third_party_controller/teleop_mink.py b/robosuite/examples/third_party_controller/teleop_mink.py
index a5381b2df1..ee0e2ef6b4 100644
--- a/robosuite/examples/third_party_controller/teleop_mink.py
+++ b/robosuite/examples/third_party_controller/teleop_mink.py
@@ -1,7 +1,7 @@
 """
-A script to teleop a robot using mink:
+A script to teleop a robot using mink and mj-viewer GUI + mocap + mouse:
 
-python robosuite/examples/third_party_controller/teleop_mink.py --controller robosuite/examples/third_party_controller/default_mink_ik_gr1.json --robots GR1FixedLowerBody --device mocap
+python robosuite/examples/third_party_controller/teleop_mink.py --controller robosuite/examples/third_party_controller/default_mink_ik_gr1.json --robots GR1FixedLowerBody --device mjgui
 """
 
 import argparse

From 986cc5030e3c03a0161351e321dc35466db8e9f2 Mon Sep 17 00:00:00 2001
From: Kevin Lin <kevin.lin20000@gmail.com>
Date: Tue, 22 Oct 2024 21:15:33 -0500
Subject: [PATCH 12/13] Enable base frame control for mink gr1 and mjgui device

---
 robosuite/devices/mjgui.py                    | 18 +++++--
 .../default_mink_ik_gr1.json                  | 45 +++++++++---------
 .../third_party_controller/mink_controller.py | 47 ++++++++++++++-----
 3 files changed, 74 insertions(+), 36 deletions(-)

diff --git a/robosuite/devices/mjgui.py b/robosuite/devices/mjgui.py
index d13a60b5d9..a69f34b0a2 100644
--- a/robosuite/devices/mjgui.py
+++ b/robosuite/devices/mjgui.py
@@ -107,10 +107,22 @@ def input2action(self) -> Dict[str, np.ndarray]:
         for site_name in site_names:
             target_name_prefix = "right" if "right" in site_name else "left"  # hardcoded for now
             target_pos_world, target_ori_mat_world = get_mocap_pose(self.env.sim, f"{target_name_prefix}_eef_target")
-            # convert ori mat to axis angle
-            axis_angle_target = transform_utils.quat2axisangle(transform_utils.mat2quat(target_ori_mat_world))
-            action[target_name_prefix] = np.concatenate([target_pos_world, axis_angle_target])
+            # check if need to update frames
+            # TODO: should be more general
+            if self.env.robots[0].composite_controller.composite_controller_specific_config.get("ik_input_ref_frame", "world") != "world":
+                target_pose = np.eye(4)
+                target_pose[:3, 3] = target_pos_world
+                target_pose[:3, :3] = target_ori_mat_world
+                target_pose = self.env.robots[0].composite_controller.joint_action_policy.transform_pose(
+                    src_frame_pose=target_pose, 
+                    src_frame="world", # mocap pose is world coordinates
+                    dst_frame=self.env.robots[0].composite_controller.composite_controller_specific_config.get("ik_input_ref_frame", "world")
+                )
+                target_pos, target_ori_mat = target_pose[:3, 3], target_pose[:3, :3]
 
+            # convert ori mat to axis angle
+            axis_angle_target = transform_utils.quat2axisangle(transform_utils.mat2quat(target_ori_mat))
+            action[target_name_prefix] = np.concatenate([target_pos, axis_angle_target])
             grasp = 1  # hardcode grasp action for now
             action[f"{target_name_prefix}_gripper"] = np.array([grasp] * gripper_dof)
 
diff --git a/robosuite/examples/third_party_controller/default_mink_ik_gr1.json b/robosuite/examples/third_party_controller/default_mink_ik_gr1.json
index bccc8cc048..e705aa27f2 100644
--- a/robosuite/examples/third_party_controller/default_mink_ik_gr1.json
+++ b/robosuite/examples/third_party_controller/default_mink_ik_gr1.json
@@ -2,28 +2,29 @@
     "type": "WHOLE_BODY_MINK_IK",
     "composite_controller_specific_configs": {
         "ref_name": ["gripper0_right_grip_site", "gripper0_left_grip_site"],
-            "interpolation": null,
-            "ik_controlled_part_names": ["torso", "head", "right", "left"],
-            "max_dq": 4,
-            "ik_pseudo_inverse_damping": 5e-2,
-            "ik_integration_dt": 1e-1,
-            "ik_input_type": "absolute",
-            "ik_input_rotation_repr": "axis_angle",
-            "verbose": false,
-            "ik_posture_weights": {
-                "robot0_torso_waist_yaw": 10.0,
-                "robot0_torso_waist_pitch": 10.0,
-                "robot0_torso_waist_roll": 200.0,
-                "robot0_l_shoulder_pitch": 4.0,
-                "robot0_r_shoulder_pitch": 4.0,
-                "robot0_l_shoulder_roll": 3.0,
-                "robot0_r_shoulder_roll": 3.0,
-                "robot0_l_shoulder_yaw": 2.0,
-                "robot0_r_shoulder_yaw": 2.0
-            },
-            "ik_hand_pos_cost": 1.0,
-            "ik_hand_ori_cost": 0.5,
-            "use_joint_angle_action_input": false
+        "interpolation": null,
+        "ik_controlled_part_names": ["torso", "head", "right", "left"],
+        "max_dq": 4,
+        "ik_pseudo_inverse_damping": 5e-2,
+        "ik_integration_dt": 1e-1,
+        "ik_input_type": "absolute",
+        "ik_input_ref_frame": "base", 
+        "ik_input_rotation_repr": "axis_angle",
+        "verbose": false,
+        "ik_posture_weights": {
+            "robot0_torso_waist_yaw": 10.0,
+            "robot0_torso_waist_pitch": 10.0,
+            "robot0_torso_waist_roll": 200.0,
+            "robot0_l_shoulder_pitch": 4.0,
+            "robot0_r_shoulder_pitch": 4.0,
+            "robot0_l_shoulder_roll": 3.0,
+            "robot0_r_shoulder_roll": 3.0,
+            "robot0_l_shoulder_yaw": 2.0,
+            "robot0_r_shoulder_yaw": 2.0
+        },
+        "ik_hand_pos_cost": 1.0,
+        "ik_hand_ori_cost": 0.5,
+        "use_joint_angle_action_input": false
     },
     "body_parts_controller_configs": {
         "arms": {
diff --git a/robosuite/examples/third_party_controller/mink_controller.py b/robosuite/examples/third_party_controller/mink_controller.py
index 3660718fcf..18d724ac57 100644
--- a/robosuite/examples/third_party_controller/mink_controller.py
+++ b/robosuite/examples/third_party_controller/mink_controller.py
@@ -259,16 +259,40 @@ def action_split_indexes(self) -> Dict[str, Tuple[int, int]]:
 
         return action_split_indexes
 
-    # TODO: implement this method
-    def transform_frame(self, pose: np.ndarray, src_ref_frame: Literal['world', 'base'], dst_ref_frame: Literal["world", "base"]) -> np.ndarray:
-        raise NotImplementedError("transform_frame not yet implemented. Below is buggy.")
-        if src_ref_frame == 'base':
-            src_transform = self.configuration.get_transform_frame_to_world("robot0_base", src_ref_frame)
-            pose = src_transform.dot(pose)
-        if dst_ref_frame == 'base':
-            dst_transform = self.configuration.get_transform_frame_to_world("robot0_base", dst_ref_frame).inv()
-            pose = dst_transform.dot(pose)
-        return pose
+    def transform_pose(
+        self, 
+        src_frame_pose: np.ndarray, 
+        src_frame: Literal['world', 'base'], 
+        dst_frame: Literal["world", "base"]
+    ) -> np.ndarray:
+        """
+        Transforms src_frame_pose from src_frame to dst_frame.
+        """
+        if src_frame == dst_frame:
+            return src_frame_pose
+
+        self.configuration.model.body("robot0_base").pos = self.full_model.body("robot0_base").pos
+        self.configuration.model.body("robot0_base").quat = self.full_model.body("robot0_base").quat
+        self.configuration.update()
+
+        X_src_frame_pose = src_frame_pose
+        # convert src frame pose to world frame pose
+        if src_frame != "world":
+            X_W_src_frame = self.configuration.get_transform_frame_to_world(src_frame, "body").as_matrix()
+            X_W_pose = X_W_src_frame @ X_src_frame_pose
+        else:
+            X_W_pose = src_frame_pose
+
+        # now convert to destination frame
+        if dst_frame == "world":
+            X_dst_frame_pose = X_W_pose
+        elif dst_frame == "base":
+            X_dst_frame_W = np.linalg.inv(
+                self.configuration.get_transform_frame_to_world("robot0_base", "body").as_matrix()
+            ) # hardcode name of base
+            X_dst_frame_pose = X_dst_frame_W.dot(X_W_pose)
+
+        return X_dst_frame_pose
 
     def solve(self, input_action: np.ndarray) -> np.ndarray:
         """
@@ -306,7 +330,7 @@ def solve(self, input_action: np.ndarray) -> np.ndarray:
             input_poses = np.zeros((len(self.site_ids), 4, 4))
             for i in range(len(self.site_ids)):
                 base_pos = self.configuration.data.body("robot0_base").xpos
-                base_ori = self.configuration.data.body("robot0_base").xmat
+                base_ori = self.configuration.data.body("robot0_base").xmat.reshape(3, 3)
                 base_pose = T.make_pose(base_pos, base_ori)
                 input_pose = T.make_pose(input_pos[i], T.quat2mat(np.roll(input_quat_wxyz[i], -1)))
                 input_poses[i] = np.dot(base_pose, input_pose)
@@ -431,6 +455,7 @@ def _init_joint_action_policy(self):
             robot_model=self.robot_model.mujoco_model,
             robot_joint_names=joint_names,
             input_type=self.composite_controller_specific_config.get("ik_input_type", "absolute"),
+            input_ref_frame=self.composite_controller_specific_config.get("ik_input_ref_frame", "world"),
             input_rotation_repr=self.composite_controller_specific_config.get("ik_input_rotation_repr", "axis_angle"),
             solve_freq=self.composite_controller_specific_config.get("ik_solve_freq", 20),
             posture_weights=self.composite_controller_specific_config.get("ik_posture_weights", {}),

From a48258bcdb5f80dd32bc2cdd8093cece0e258585 Mon Sep 17 00:00:00 2001
From: Kevin Lin <kevin.lin20000@gmail.com>
Date: Wed, 23 Oct 2024 12:10:25 -0500
Subject: [PATCH 13/13] Add docs, fix logic

---
 robosuite/devices/mjgui.py                                | 4 +++-
 .../examples/third_party_controller/mink_controller.py    | 8 +++++---
 robosuite/utils/ik_utils.py                               | 2 ++
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/robosuite/devices/mjgui.py b/robosuite/devices/mjgui.py
index a69f34b0a2..17ff408364 100644
--- a/robosuite/devices/mjgui.py
+++ b/robosuite/devices/mjgui.py
@@ -33,7 +33,7 @@ def get_mocap_pose(sim, mocap_name: str = "target") -> Tuple[np.ndarray, np.ndar
 
 class MJGUI(Device):
     """
-    Class for 'device' involving mujoco viewer and mocap bodies being dragged by users mouse.
+    Class for 'device' involving mujoco viewer and mocap bodies being dragged by user's mouse.
 
     Args:
         env (RobotEnv): The environment which contains the robot(s) to control
@@ -119,6 +119,8 @@ def input2action(self) -> Dict[str, np.ndarray]:
                     dst_frame=self.env.robots[0].composite_controller.composite_controller_specific_config.get("ik_input_ref_frame", "world")
                 )
                 target_pos, target_ori_mat = target_pose[:3, 3], target_pose[:3, :3]
+            else:
+                target_pos, target_ori_mat = target_pos_world, target_ori_mat_world
 
             # convert ori mat to axis angle
             axis_angle_target = transform_utils.quat2axisangle(transform_utils.mat2quat(target_ori_mat))
diff --git a/robosuite/examples/third_party_controller/mink_controller.py b/robosuite/examples/third_party_controller/mink_controller.py
index 18d724ac57..414c74238c 100644
--- a/robosuite/examples/third_party_controller/mink_controller.py
+++ b/robosuite/examples/third_party_controller/mink_controller.py
@@ -254,6 +254,8 @@ def action_split_indexes(self) -> Dict[str, Tuple[int, int]]:
             total_dim = self.pos_dim + self.rot_dim
             last_idx = previous_idx + total_dim
             simplified_site_name = "left" if "left" in site_name else "right"  # hack to simplify site names
+            # goal is to specify the end effector actions as "left" or "right" instead of the actual site name
+            # we assume that the site names for the ik solver are unique and contain "left" or "right" in them
             action_split_indexes[simplified_site_name] = (previous_idx, last_idx)
             previous_idx = last_idx
 
@@ -393,7 +395,7 @@ def solve(self, input_action: np.ndarray) -> np.ndarray:
         self.configuration.integrate_inplace(vel, 1 / self.solve_freq)
 
         self.i += 1
-        if self.verbose and self.i % 1 == 0:
+        if self.verbose:
             task_errors = self._get_task_errors()
             task_translation_errors = self._get_task_translation_errors()
             task_orientation_errors = self._get_task_orientation_errors()
@@ -401,8 +403,8 @@ def solve(self, input_action: np.ndarray) -> np.ndarray:
             self.trask_translation_errors.append(task_translation_errors)
             self.task_orientation_errors.append(task_orientation_errors)
 
-            # if self.i % 50:
-            #     print(f"Task errors: {task_translation_errors}")
+            if self.i % 50:
+                print(f"Task errors: {task_translation_errors}")
 
         return self.configuration.data.qpos[self.robot_model_dof_ids]
 
diff --git a/robosuite/utils/ik_utils.py b/robosuite/utils/ik_utils.py
index 03d9ae17c0..c726e67117 100644
--- a/robosuite/utils/ik_utils.py
+++ b/robosuite/utils/ik_utils.py
@@ -103,6 +103,8 @@ def action_split_indexes(self) -> Dict[str, Tuple[int, int]]:
             total_dim = self.pos_dim + self.rot_dim
             last_idx = previous_idx + total_dim
             simplified_site_name = "left" if "left" in site_name else "right"  # hack to simplify site names
+            # goal is to specify the end effector actions as "left" or "right" instead of the actual site name
+            # we assume that the site names for the ik solver are unique and contain "left" or "right" in them
             action_split_indexes[simplified_site_name] = (previous_idx, last_idx)
             previous_idx = last_idx