TensorAuto · shuheng-liu · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -43,13 +43,15 @@ Dependency management is **`uv` (>= 0.8.4) only** — `pyproject.toml`/`uv.lock`
 
 ```bash
 uv sync --extra dev --extra libero          # standard dev setup (matches CI)
-uv sync --all-extras                         # everything (libero + urdf now co-install on numpy 2.x)
+uv sync --all-extras                         # everything (libero + robocasa + urdf co-install on the shared robosuite-1.5 master / numpy 2.x stack); Linux-only (trt)
 source .venv/bin/activate
 ```
 
 Re-run `uv sync` whenever `pyproject.toml`/`uv.lock` change. Add deps with `uv add <pkg>`; lock with `uv lock`.
 
-Installable extras: `dev` (pre-commit, sphinx, pytest), `libero` (sim env — pulls a forked LIBERO from `shuheng-liu/LIBERO`, runs on numpy 2.x + gymnasium), `urdf` (rerun ≥0.28, numpy 2.x), `trt` (TensorRT, Linux/Win x86_64 only).
+Installable extras: `dev` (pre-commit, sphinx, pytest), `libero` (sim env — pulls a forked LIBERO from `shuheng-liu/LIBERO`, on robosuite 1.5 master + numpy 2.x + gymnasium), `robocasa` (RoboCasa365 kitchen sim — co-installs with `libero` on the shared robosuite stack; see the RoboCasa365 note below), `urdf` (rerun ≥0.28, numpy 2.x), `trt` (TensorRT, Linux/Win x86_64 only).
+
+**RoboCasa365** (`envs/robocasa.py`) is a first-class extra that co-installs with `libero` on a shared robosuite stack: `uv sync --extra robocasa`. Two non-obvious things make it resolve: (1) robocasa needs `MujocoEnv(load_model_on_init=...)`, added on robosuite **master** *after* the 1.5.2 PyPI release, so `[tool.uv.sources]` repins `robosuite` to a master commit — which still self-reports "1.5.2" (matching the extras' pins) and is validated to also run LIBERO; (2) robocasa is pulled from the `shuheng-liu/robocasa` packaging fork (mirroring the `shuheng-liu/LIBERO` / `egl_probe` forks) that drops upstream's `lerobot==0.3.3` / `tianshou` / `opencv-python` / `hidapi` deps and loosens its `numpy`/`numba`/`scipy`/`mujoco` pins + import-time version asserts, since uv can't `--no-deps` a single package in a lock. Kitchen assets (~5-10GB) are a separate runtime step — `python -m robocasa.scripts.download_kitchen_assets` — then run headless with `MUJOCO_GL=egl`. NOTE: a full `uv lock` / `uv sync --all-extras` must run on Linux (the `trt` extra's TensorRT sdist can't build on macOS arm64); `uv sync --extra robocasa` from the committed lock works anywhere.
 
 ## Common commands
 
@@ -117,7 +119,7 @@ Key invariant on `TrainPipelineConfig`: `batch_size == dataloader_batch_size * g
 
 - `configs/` — dataclass configs (train, eval, policies, envs, optim, deployment, libero, ros2lerobot)
 - `datasets/` — LeRobot-compatible datasets, `WeightedDatasetMixture` (heterogeneous co-training), VQA datasets, v1→v2 / v2→v2.1 converters under `v2/`, `v21/`
-- `envs/` — gym/gymnasium envs (currently LIBERO); `factory.make_envs()`
+- `envs/` — gym/gymnasium envs (LIBERO in `libero.py`, RoboCasa365 in `robocasa.py`); `factory.make_envs()` dispatches per `env.type`. Both return `dict[group][task_id] -> VectorEnv` so the env-agnostic eval pipeline (`scripts/eval.py`) gives per-task success rates + `grid_summary` wandb videos for free.
 - `optim/` — optimizer + LR-scheduler dataclass-configured factories
 - `planner/` — high-level planner using `prompts.yaml`
 - `policies/` — `pi0`, `pi05`, `pi05_mem`, `pi06`, `pi07/{high_level_planner,low_level}` (current π0.7 impl: Gemma 3 backbone + SpaceTime SigLIP video encoder; note `low_level/` — not `low_level_planner/`, since the low-level policy is a controller, not a planner), `pi07_paligemma/{high_level_planner,low_level}` (legacy PaliGemma variant of π0.7 — kept for older checkpoints; a fix targeting π0.7 usually needs to land in `pi07/`, not here), `value`. Each subdir has a `configuration_*.py` and `modeling_*.py`. Vision backbone wrappers: `paligemma_with_expert.py` (pi0/pi05/pi05_mem/pi07_paligemma) and `gemma3_with_expert.py` (pi06/pi07).

diff --git a/configs/examples/pi05_robocasa_eval_config.json b/configs/examples/pi05_robocasa_eval_config.json
@@ -0,0 +1,174 @@
+{
+    "dataset_mixture": {
+        "datasets": [
+            {
+                "repo_id": "lerobot/droid_100"
+            }
+        ],
+        "weights": [
+            1.0
+        ],
+        "action_freq": 20.0,
+        "image_resample_strategy": "nearest",
+        "vector_resample_strategy": "nearest"
+    },
+    "policy": {
+        "type": "pi05",
+        "pretrained_path": "TensorAuto/tPi0.5-libero",
+        "n_obs_steps": 1,
+        "input_features": {
+            "camera0": {
+                "shape": [
+                    3,
+                    224,
+                    224
+                ],
+                "type": "VISUAL"
+            },
+            "camera1": {
+                "shape": [
+                    3,
+                    224,
+                    224
+                ],
+                "type": "VISUAL"
+            },
+            "camera2": {
+                "shape": [
+                    3,
+                    224,
+                    224
+                ],
+                "type": "VISUAL"
+            },
+            "state": {
+                "shape": [
+                    32
+                ],
+                "type": "STATE"
+            }
+        },
+        "output_features": {
+            "actions": {
+                "shape": [
+                    32
+                ],
+                "type": "ACTION"
+            }
+        },
+        "normalization_mapping": {
+            "VISUAL": "IDENTITY",
+            "STATE": "MIN_MAX",
+            "ACTION": "MEAN_STD"
+        },
+        "chunk_size": 10,
+        "n_action_steps": 10,
+        "max_state_dim": 32,
+        "max_action_dim": 32,
+        "proj_width": 1024,
+        "num_steps": 10,
+        "attention_implementation": "eager",
+        "freeze_vision_encoder": false,
+        "train_expert_only": false,
+        "prompt_max_length": 256,
+        "discrete_action_max_length": 60,
+        "optimizer_lr": 2.5e-05,
+        "optimizer_betas": [
+            0.9,
+            0.95
+        ],
+        "optimizer_eps": 1e-08,
+        "optimizer_weight_decay": 1e-10,
+        "scheduler_warmup_steps": 1000,
+        "scheduler_decay_steps": 30000,
+        "scheduler_decay_lr": 2.5e-06
+    },
+    "output_dir": "outputs/pi05_robocasa_eval",
+    "resume": false,
+    "seed": 1000,
+    "resolution": [
+        224,
+        224
+    ],
+    "num_cams": 3,
+    "max_state_dim": 32,
+    "max_action_dim": 32,
+    "action_chunk": 10,
+    "loss_weighting": {
+        "MSE": 1.0,
+        "CE": 1.0
+    },
+    "num_workers": 4,
+    "batch_size": 2,
+    "gradient_accumulation_steps": 1,
+    "dataloader_batch_size": 2,
+    "prefetch_factor": 8,
+    "steps": 100,
+    "log_freq": 1,
+    "save_checkpoint": true,
+    "save_freq": 100,
+    "use_policy_training_preset": true,
+    "trace_nans": false,
+    "optimizer": {
+        "type": "adamw",
+        "lr": 2.5e-05,
+        "weight_decay": 1e-10,
+        "grad_clip_norm": 10.0,
+        "betas": [
+            0.9,
+            0.95
+        ],
+        "eps": 1e-08
+    },
+    "env": {
+        "type": "robocasa",
+        "task": "CloseFridge",
+        "fps": 20,
+        "max_parallel_tasks": 1,
+        "episode_length": 1000,
+        "obs_type": "pixels_agent_pos",
+        "render_mode": "rgb_array",
+        "camera_name": "robot0_agentview_left,robot0_eye_in_hand,robot0_agentview_right",
+        "observation_height": 256,
+        "observation_width": 256,
+        "visualization_height": 512,
+        "visualization_width": 512,
+        "split": null,
+        "obj_registries": [
+            "lightwheel"
+        ],
+        "metadata": {
+            "robot_type": "PandaOmron",
+            "control_mode": "ee"
+        }
+    },
+    "eval": {
+        "n_episodes": 2,
+        "batch_size": 2,
+        "use_async_envs": true,
+        "max_episodes_rendered": 2,
+        "grid_size": null,
+        "control_mode": "ee"
+    },
+    "scheduler": {
+        "type": "cosine_decay_with_warmup",
+        "num_warmup_steps": 1000,
+        "num_decay_steps": 30000,
+        "peak_lr": 2.5e-05,
+        "decay_lr": 2.5e-06
+    },
+    "wandb": {
+        "enable": true,
+        "entity": "wyautox-autox",
+        "project": "pi05",
+        "run_id": null,
+        "name": null,
+        "notes": "RoboCasa eval plumbing smoke (CloseFridge). Swap policy.pretrained_path for a RoboCasa-trained checkpoint for meaningful success rates.",
+        "tags": [],
+        "group": null,
+        "job_type": null,
+        "mode": null,
+        "allow_resume": true,
+        "disable_artifact": false
+    }
+}
diff --git a/pyproject.toml b/pyproject.toml
@@ -127,9 +127,11 @@ libero = [
     "egl-probe",
     "robomimic==0.2.0",
     # robosuite 1.5.2 (composite-controller framework) so LIBERO shares a venv with
-    # RoboCasa, which needs >=1.5. The LIBERO fork is ported to the 1.5 controller API
-    # (see load_arm_controller_config); robomimic 0.2.0 does not pin robosuite, so it
-    # co-installs unchanged.
+    # RoboCasa. Sourced from a pinned robosuite *master* commit (see [tool.uv.sources])
+    # because robocasa needs MujocoEnv(load_model_on_init=...), added on master after
+    # the 1.5.2 PyPI release; master still self-reports "1.5.2" so this pin matches.
+    # The LIBERO fork (1.5 controller API, see load_arm_controller_config) is validated
+    # on this commit; robomimic 0.2.0 does not pin robosuite, so it co-installs unchanged.
     "robosuite==1.5.2",
     "thop==0.1.1.post2209072238",
     "mujoco>=3.3.5",
@@ -143,6 +145,20 @@ libero = [
     "mujoco>=3.1.6 ; sys_platform == 'linux'",
     "pyopengl==3.1.10 ; sys_platform == 'linux'",
 ]
+robocasa = [
+    # RoboCasa365 kitchen sim for simulated eval, co-installed on the shared
+    # robosuite-1.5 (master) stack from the `libero` extra. Uses the
+    # shuheng-liu/robocasa packaging fork (see [tool.uv.sources]) that drops
+    # upstream's lerobot==0.3.3 / tianshou / opencv-python / hidapi deps and loosens
+    # its numpy/numba/scipy/mujoco pins so it resolves in the shared lock (upstream's
+    # setup.py can't be `--no-deps`'d inside a uv lock). robosuite is not in robocasa's
+    # deps (installed separately), so pin it here matching `libero`. Kitchen assets
+    # (~5-10GB) are a separate runtime download:
+    # `python -m robocasa.scripts.download_kitchen_assets`.
+    "robocasa",
+    "robosuite==1.5.2",
+    "mujoco>=3.3.5",
+]
 urdf = [
     "rerun-sdk>=0.28.2",
 ]
@@ -156,6 +172,15 @@ libero = { git = "https://github.com/shuheng-liu/LIBERO" , branch = "master" }
 # which CMake >= 4 rejects. This fork raises the floor to 3.5 so it builds on CMake 4
 # without pinning the build's cmake; the tag keeps the source reproducible.
 egl-probe = { git = "https://github.com/shuheng-liu/egl_probe", tag = "v1.0.1-cmake4" }
+# robocasa main needs robosuite *master* (MujocoEnv(load_model_on_init=...), added after
+# the 1.5.2 PyPI release); pin the exact commit for reproducibility. Master self-reports
+# version "1.5.2", which satisfies the `robosuite==1.5.2` pins in the extras above, and
+# LIBERO is validated to construct+step on this commit (so the shared stack still works).
+robosuite = { git = "https://github.com/ARISE-Initiative/robosuite", rev = "85abee228d1c43ab1939bce33028099945d453b4" }
+# OpenTau packaging fork of robocasa: drops upstream's lerobot==0.3.3 / tianshou /
+# opencv-python / hidapi deps and loosens its numpy/numba/scipy/mujoco pins + import-time
+# version asserts so it co-installs on the shared robosuite-1.5 / numpy-2 stack.
+robocasa = { git = "https://github.com/shuheng-liu/robocasa", rev = "f7db21c11f25408d3a59a5e878bb5c7ca9030c4d" }
 
 [tool.uv]
 # `extra-build-dependencies` injects a cmake wheel into egl-probe's PEP 517 build

diff --git a/src/opentau/envs/configs.py b/src/opentau/envs/configs.py
@@ -307,3 +307,109 @@ def gym_kwargs(self) -> dict:
             "task_ids": task_ids,
             "control_freq": self.fps,
         }
+
+
+@EnvConfig.register_subclass("robocasa")
+@dataclass
+class RoboCasaEnv(EnvConfig):
+    r"""Configuration for the RoboCasa365 kitchen environment.
+
+    RoboCasa runs on robosuite 1.5 (shared with LIBERO since the libero extra was
+    bumped to robosuite 1.5.2), so it co-installs in the same venv. The default
+    robot is the PandaOmron mobile manipulator — hence the 12-D action and 16-D
+    state, distinct from LIBERO's 7-D/8-D. Set ``metadata.robot_type`` /
+    ``eval.control_mode`` to select the matching per-(robot_type, control_mode)
+    projection head when evaluating a co-trained policy.
+
+    Args:
+        task: A RoboCasa task name (e.g. ``"CloseFridge"``), a comma-separated
+            list of task names, or a benchmark-group shortcut
+            (``atomic_seen``/``composite_seen``/``composite_unseen``/
+            ``pretrain50``/``pretrain100``/``pretrain200``/``pretrain300``), which
+            auto-expands to the upstream task list and auto-sets ``split``.
+        fps: RoboCasa control frequency (Hz); also the ``render_fps`` for videos.
+        episode_length: Maximum steps per episode (``_max_episode_steps``).
+        obs_type: ``"pixels"`` or ``"pixels_agent_pos"``.
+        render_mode: Rendering mode for the environment.
+        camera_name: Comma-separated raw RoboCasa camera names to render. The
+            wrapper remaps them to ``camera0``/``camera1``/... so the policy input
+            structure matches LIBERO regardless of the raw names; when the policy
+            was trained with a larger ``cfg.num_cams``, ``preprocess_observation``
+            zero-fills the remaining slots.
+        observation_height: Height of observation images.
+        observation_width: Width of observation images.
+        visualization_height: Height of visualization frames.
+        visualization_width: Width of visualization frames.
+        split: RoboCasa dataset split (``None``/``"all"``/``"pretrain"``/
+            ``"target"``). Left ``None`` unless a task-group shortcut sets it.
+        obj_registries: Object-mesh registries to sample assets from. Defaults to
+            ``["lightwheel"]`` (the pack the asset downloader ships by default);
+            add ``"objaverse"`` only after downloading that ~30GB pack.
+        features: Mapping from logical feature names to ``PolicyFeature`` definitions.
+        features_map: Mapping from environment keys to standardized OpenTau keys.
+    """
+
+    task: str = "CloseFridge"
+    fps: int = 20
+    episode_length: int = 1000
+    obs_type: str = "pixels_agent_pos"
+    render_mode: str = "rgb_array"
+    camera_name: str = "robot0_agentview_left,robot0_eye_in_hand,robot0_agentview_right"
+    observation_height: int = 256
+    observation_width: int = 256
+    visualization_height: int = 512
+    visualization_width: int = 512
+    split: str | None = None
+    obj_registries: list[str] = field(default_factory=lambda: ["lightwheel"])
+    features: dict[str, PolicyFeature] = field(
+        default_factory=lambda: {
+            "action": PolicyFeature(type=FeatureType.ACTION, shape=(12,)),
+        }
+    )
+    features_map: dict[str, str] = field(
+        default_factory=lambda: {
+            "action": ACTION,
+            "agent_pos": OBS_STATE,
+        }
+    )
+
+    def __post_init__(self):
+        if self.fps <= 0:
+            raise ValueError(f"RoboCasa env.fps (control frequency in Hz) must be positive, got {self.fps}")
+        if self.obs_type not in ("pixels", "pixels_agent_pos"):
+            raise ValueError(f"Unsupported obs_type: {self.obs_type}")
+
+        # The wrapper remaps the i-th raw camera to ``camera{i}``; mirror that in
+        # the feature map using OpenTau's ``image`` / ``image2`` / ... convention
+        # (camera0 -> image, camera1 -> image2, ...), matching LIBERO.
+        cams = [c.strip() for c in self.camera_name.split(",") if c.strip()]
+        for i, cam in enumerate(cams):
+            self.features[f"pixels/{cam}"] = PolicyFeature(
+                type=FeatureType.VISUAL,
+                shape=(self.observation_height, self.observation_width, 3),
+            )
+            mapped = "image" if i == 0 else f"image{i + 1}"
+            self.features_map[f"pixels/{cam}"] = f"{OBS_IMAGES}.{mapped}"
+
+        if self.obs_type == "pixels_agent_pos":
+            self.features["agent_pos"] = PolicyFeature(type=FeatureType.STATE, shape=(16,))
+
+    @property
+    def gym_kwargs(self) -> dict:
+        r"""Return the keyword arguments used to construct the RoboCasa environment.
+
+        Task resolution and per-rank sharding live in ``create_robocasa_envs`` (they
+        need the ``robocasa`` package for group expansion), so this stays sim-free
+        and only carries the obs/render parameters plus an optional ``split``.
+        """
+        kwargs: dict = {
+            "obs_type": self.obs_type,
+            "render_mode": self.render_mode,
+            "observation_height": self.observation_height,
+            "observation_width": self.observation_width,
+            "visualization_height": self.visualization_height,
+            "visualization_width": self.visualization_width,
+        }
+        if self.split is not None:
+            kwargs["split"] = self.split
+        return kwargs