From 08c1950c6df7e08a0c158ca9a9cad6a1d617de8c Mon Sep 17 00:00:00 2001 From: Logic Date: Sat, 14 Mar 2026 12:25:44 +0800 Subject: [PATCH] chore(pusht): add 5090 repro docs and uv setup --- AGENTS.md | 68 +++++++++++ PUSHT_REPRO_5090.md | 108 ++++++++++++++++++ .../env_runner/pusht_image_runner.py | 7 +- diffusion_policy/gym_util/sync_vector_env.py | 37 +++++- requirements-pusht-5090.txt | 38 ++++++ setup_uv_pusht_5090.sh | 20 ++++ 6 files changed, 270 insertions(+), 8 deletions(-) create mode 100644 AGENTS.md create mode 100644 PUSHT_REPRO_5090.md create mode 100644 requirements-pusht-5090.txt create mode 100755 setup_uv_pusht_5090.sh diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 0000000..0f022be --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,68 @@ +# Agent Notes + +## Purpose +`~/diffusion_policy` is the Diffusion Policy training repo. The main workflow here is Hydra-driven training via `train.py`, with the canonical PushT image experiment configured by `image_pusht_diffusion_policy_cnn.yaml`. + +## Top Level +- `diffusion_policy/`: core code, configs, datasets, env runners, workspaces. +- `data/`: local datasets, outputs, checkpoints, run logs. +- `train.py`: main training entrypoint. +- `eval.py`: checkpoint evaluation entrypoint. +- `image_pusht_diffusion_policy_cnn.yaml`: canonical single-seed PushT image config from the README path. +- `.venv/`: local `uv`-managed virtualenv. +- `.uv-cache/`, `.uv-python/`: local `uv` cache and Python install state. +- `README.md`: upstream instructions and canonical commands. + +## Canonical PushT Image Path +- Entrypoint: `python train.py --config-dir=. 
--config-name=image_pusht_diffusion_policy_cnn.yaml` +- Dataset path in config: `data/pusht/pusht_cchi_v7_replay.zarr` +- README canonical device override: `training.device=cuda:0` + +## Data +- PushT archive currently present at `data/pusht.zip` +- Unpacked dataset used by training: `data/pusht/pusht_cchi_v7_replay.zarr` + +## Local Compatibility Adjustments +- `diffusion_policy/env_runner/pusht_image_runner.py` now uses `SyncVectorEnv` instead of `AsyncVectorEnv`. + Reason: avoid shared-memory and semaphore failures on this host/session. +- `diffusion_policy/gym_util/sync_vector_env.py` has local compatibility changes: + - added `reset_async` + - seeded `reset_wait` + - updated `concatenate(...)` call order for the current `gym` API + +## Environment Expectations +- Use the local `uv` env at `.venv` +- Verified local Python: `3.9.25` +- Verified local Torch stack: `torch 2.8.0+cu128`, `torchvision 0.23.0+cu128` +- Other key installed versions verified in `.venv`: + - `gym 0.23.1` + - `hydra-core 1.2.0` + - `diffusers 0.11.1` + - `huggingface_hub 0.10.1` + - `wandb 0.13.3` + - `zarr 2.12.0` + - `numcodecs 0.10.2` + - `av 14.0.1` +- Important note: this shell currently reports `torch.cuda.is_available() == False`, so always verify CUDA access in the current session before assuming GPU is usable. + +## Logging And Outputs +- Hydra run outputs: `data/outputs/...` +- Per-run files to check first: + - `.hydra/overrides.yaml` + - `logs.json.txt` + - `train.log` + - `checkpoints/latest.ckpt` +- Extra launcher logs may live under `data/run_logs/` + +## Practical Guidance +- Inspect with `rg`, `sed`, and existing Hydra output folders before changing code. +- Prefer config overrides before code edits. 
+- On this host, start from these safety overrides unless revalidated: + - `logging.mode=offline` + - `dataloader.num_workers=0` + - `val_dataloader.num_workers=0` + - `task.env_runner.n_envs=1` + - `task.env_runner.n_test_vis=0` + - `task.env_runner.n_train_vis=0` +- If a run fails, inspect `.hydra/overrides.yaml`, then `logs.json.txt`, then `train.log`. +- Avoid driver or system changes unless the repo-local path is clearly blocked. diff --git a/PUSHT_REPRO_5090.md b/PUSHT_REPRO_5090.md new file mode 100644 index 0000000..99c3eb2 --- /dev/null +++ b/PUSHT_REPRO_5090.md @@ -0,0 +1,108 @@ +# PushT Repro On 5090 + +## Goal +Reproduce the canonical single-seed image PushT experiment from this repo in `~/diffusion_policy` using `image_pusht_diffusion_policy_cnn.yaml`. + +## Current Verified Local Setup +- Virtualenv: `./.venv` managed with `uv` +- Python: `3.9.25` +- Torch stack: `torch 2.8.0+cu128`, `torchvision 0.23.0+cu128` +- Version strategy used here: + - newer Torch/CUDA stack for current 5090-class hardware support + - keep older repo-era packages where they are still required by the code +- Verified key pins in `.venv`: + - `numpy 1.26.4` + - `gym 0.23.1` + - `hydra-core 1.2.0` + - `diffusers 0.11.1` + - `huggingface_hub 0.10.1` + - `wandb 0.13.3` + - `zarr 2.12.0` + - `numcodecs 0.10.2` + - `av 14.0.1` + - `robomimic 0.2.0` + +## Dataset +- README source: `https://diffusion-policy.cs.columbia.edu/data/training/pusht.zip` +- Local archive currently present: `data/pusht.zip` +- Unpacked dataset used by the config: `data/pusht/pusht_cchi_v7_replay.zarr` + +## Repo-Local Code Adjustments +- `diffusion_policy/env_runner/pusht_image_runner.py` + - switched PushT image evaluation from `AsyncVectorEnv` to `SyncVectorEnv` +- `diffusion_policy/gym_util/sync_vector_env.py` + - added `reset_async` + - added seeded `reset_wait` + - updated `concatenate(...)` call order for current `gym` + +These changes were needed to keep PushT evaluation working without the async 
shared-memory path. + +## Validated GPU Smoke Command +This route is verified by `data/outputs/gpu_smoke2___pusht_gpu_smoke`, which contains `logs.json.txt` plus checkpoints: + +```bash +.venv/bin/python train.py \ + --config-dir=. \ + --config-name=image_pusht_diffusion_policy_cnn.yaml \ + training.seed=42 \ + training.device=cuda:0 \ + logging.mode=offline \ + dataloader.num_workers=0 \ + val_dataloader.num_workers=0 \ + task.env_runner.n_envs=1 \ + training.debug=true \ + task.env_runner.n_test=2 \ + task.env_runner.n_test_vis=0 \ + task.env_runner.n_train=1 \ + task.env_runner.n_train_vis=0 \ + task.env_runner.max_steps=20 +``` + +## Practical Full Training Command Used Here +This matches the longer GPU run under `data/outputs/2026.03.13/15.37.00_train_diffusion_unet_hybrid_pusht_image_gpu_seed42`: + +```bash +.venv/bin/python train.py \ + --config-dir=. \ + --config-name=image_pusht_diffusion_policy_cnn.yaml \ + training.seed=42 \ + training.device=cuda:0 \ + logging.mode=offline \ + dataloader.num_workers=0 \ + val_dataloader.num_workers=0 \ + task.env_runner.n_envs=1 \ + task.env_runner.n_test_vis=0 \ + task.env_runner.n_train_vis=0 \ + hydra.run.dir=data/outputs/2026.03.13/15.37.00_train_diffusion_unet_hybrid_pusht_image_gpu_seed42 +``` + +## Why These Overrides Were Used +- `logging.mode=offline` + - avoids needing a W&B login and still leaves local run metadata in the output dir +- `dataloader.num_workers=0` and `val_dataloader.num_workers=0` + - avoids extra multiprocessing on this host +- `task.env_runner.n_envs=1` + - keeps evaluation to a single env instance; the runner now always uses the serial `SyncVectorEnv`, whatever `n_envs` is +- `task.env_runner.n_test_vis=0` and `task.env_runner.n_train_vis=0` + - avoids video-writing issues on this stack + - one earlier GPU run with default vis settings logged libav/libx264 `profile=high` errors in `data/outputs/_train_diffusion_unet_hybrid_pusht_image_gpu_seed42/train.log` + +## Output Locations +- Smoke run: + - `data/outputs/gpu_smoke2___pusht_gpu_smoke` +- Longer GPU 
run: + - `data/outputs/2026.03.13/15.37.00_train_diffusion_unet_hybrid_pusht_image_gpu_seed42` +- Files to inspect inside a run: + - `.hydra/overrides.yaml` + - `logs.json.txt` + - `train.log` + - `checkpoints/latest.ckpt` + +## Known Caveats +- The default config is still tuned for older assumptions: + - `logging.mode=online` + - `dataloader.num_workers=8` + - `task.env_runner.n_envs=null` + - `task.env_runner.n_test_vis=4` + - `task.env_runner.n_train_vis=2` +- In this shell, `torch.cuda.is_available()` currently reports `False` even though the repo contains validated GPU smoke/full run artifacts. Re-check device visibility in the current session before restarting a GPU run. diff --git a/diffusion_policy/env_runner/pusht_image_runner.py b/diffusion_policy/env_runner/pusht_image_runner.py index f65c06a..82187b6 100644 --- a/diffusion_policy/env_runner/pusht_image_runner.py +++ b/diffusion_policy/env_runner/pusht_image_runner.py @@ -8,8 +8,7 @@ import dill import math import wandb.sdk.data_types.video as wv from diffusion_policy.env.pusht.pusht_image_env import PushTImageEnv -from diffusion_policy.gym_util.async_vector_env import AsyncVectorEnv -# from diffusion_policy.gym_util.sync_vector_env import SyncVectorEnv +from diffusion_policy.gym_util.sync_vector_env import SyncVectorEnv from diffusion_policy.gym_util.multistep_wrapper import MultiStepWrapper from diffusion_policy.gym_util.video_recording_wrapper import VideoRecordingWrapper, VideoRecorder @@ -121,7 +120,9 @@ class PushTImageRunner(BaseImageRunner): env_prefixs.append('test/') env_init_fn_dills.append(dill.dumps(init_fn)) - env = AsyncVectorEnv(env_fns) + # This environment can run without multiprocessing, which avoids + # shared-memory and semaphore restrictions on some machines. 
+ env = SyncVectorEnv(env_fns) # test env # env.reset(seed=env_seeds) diff --git a/diffusion_policy/gym_util/sync_vector_env.py b/diffusion_policy/gym_util/sync_vector_env.py index c85a682..260e1c1 100644 --- a/diffusion_policy/gym_util/sync_vector_env.py +++ b/diffusion_policy/gym_util/sync_vector_env.py @@ -60,17 +60,44 @@ class SyncVectorEnv(VectorEnv): for env, seed in zip(self.envs, seeds): env.seed(seed) - def reset_wait(self): + def reset_async(self, seed=None, return_info=False, options=None): + if seed is None: + seeds = [None for _ in range(self.num_envs)] + elif isinstance(seed, int): + seeds = [seed + i for i in range(self.num_envs)] + else: + seeds = list(seed) + assert len(seeds) == self.num_envs + self._reset_seeds = seeds + self._reset_return_info = return_info + self._reset_options = options + + def reset_wait(self, seed=None, return_info=False, options=None): + seeds = getattr(self, '_reset_seeds', None) + if seeds is None: + if seed is None: + seeds = [None for _ in range(self.num_envs)] + elif isinstance(seed, int): + seeds = [seed + i for i in range(self.num_envs)] + else: + seeds = list(seed) self._dones[:] = False observations = [] - for env in self.envs: + infos = [] + for env, seed_i in zip(self.envs, seeds): + if seed_i is not None: + env.seed(seed_i) observation = env.reset() observations.append(observation) + infos.append({}) self.observations = concatenate( - observations, self.observations, self.single_observation_space + self.single_observation_space, observations, self.observations ) - return deepcopy(self.observations) if self.copy else self.observations + obs = deepcopy(self.observations) if self.copy else self.observations + if return_info: + return obs, infos + return obs def step_async(self, actions): self._actions = actions @@ -84,7 +111,7 @@ class SyncVectorEnv(VectorEnv): observations.append(observation) infos.append(info) self.observations = concatenate( - observations, self.observations, self.single_observation_space + 
self.single_observation_space, observations, self.observations ) return ( diff --git a/requirements-pusht-5090.txt b/requirements-pusht-5090.txt new file mode 100644 index 0000000..9bf8f9d --- /dev/null +++ b/requirements-pusht-5090.txt @@ -0,0 +1,38 @@ +# Direct package pins for the canonical PushT image workflow on host 5090. +# Torch/TorchVision/Torchaudio are installed separately from the cu128 index in setup_uv_pusht_5090.sh. + +numpy==1.26.4 +scipy==1.11.4 +numba==0.59.1 +llvmlite==0.42.0 +cffi==1.15.1 +cython==0.29.32 +h5py==3.8.0 +pandas==2.2.3 +zarr==2.12.0 +numcodecs==0.10.2 +hydra-core==1.2.0 +einops==0.4.1 +tqdm==4.64.1 +dill==0.3.5.1 +scikit-video==1.1.11 +scikit-image==0.19.3 +gym==0.23.1 +pymunk==6.2.1 +wandb==0.13.3 +threadpoolctl==3.1.0 +shapely==1.8.5.post1 +imageio==2.22.0 +imageio-ffmpeg==0.4.7 +termcolor==2.0.1 +tensorboard==2.10.1 +tensorboardx==2.5.1 +psutil==7.2.2 +click==8.1.8 +boto3==1.24.96 +diffusers==0.11.1 +huggingface-hub==0.10.1 +av==14.0.1 +pygame==2.5.2 +robomimic==0.2.0 +opencv-python-headless==4.10.0.84 diff --git a/setup_uv_pusht_5090.sh b/setup_uv_pusht_5090.sh new file mode 100755 index 0000000..d32aefe --- /dev/null +++ b/setup_uv_pusht_5090.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash +set -euo pipefail + +ROOT_DIR="$(cd "$(dirname "$0")" && pwd)" +cd "$ROOT_DIR" + +export UV_CACHE_DIR="${UV_CACHE_DIR:-$ROOT_DIR/.uv-cache}" +export UV_PYTHON_INSTALL_DIR="${UV_PYTHON_INSTALL_DIR:-$ROOT_DIR/.uv-python}" + +uv venv --python 3.9 .venv +source .venv/bin/activate + +uv pip install --upgrade pip wheel setuptools==80.9.0 +uv pip install --python .venv/bin/python \ + --index-url https://download.pytorch.org/whl/cu128 \ + torch==2.8.0+cu128 torchvision==0.23.0+cu128 torchaudio==2.8.0+cu128 +uv pip install --python .venv/bin/python -r requirements-pusht-5090.txt +uv pip install --python .venv/bin/python -e . + +echo "uv environment ready at $ROOT_DIR/.venv"