From a78006808a8bbc8c69ac6a869ae6b66b3b303698 Mon Sep 17 00:00:00 2001 From: Logic Date: Sat, 4 Apr 2026 23:47:15 +0800 Subject: [PATCH] fix: stabilize headless rollout and summarize phase1 grid --- ...4-imf-horizon-grid-and-attnres-ablation.md | 68 ++++++++ .../leaderboard.csv | 7 + .../2026-04-04-imf-horizon-grid/manifest.json | 115 ++++++++++++ .../2026-04-04-imf-horizon-grid/notes.md | 20 +++ .../phase1_summary.md | 38 ++++ .../2026-04-04-imf-horizon-grid/status.json | 165 ++++++++++++++++++ roboimi/demos/vla_scripts/eval_vla.py | 8 +- tests/test_eval_vla_headless_import.py | 26 +++ 8 files changed, 446 insertions(+), 1 deletion(-) create mode 100644 docs/superpowers/plans/2026-04-04-imf-horizon-grid-and-attnres-ablation.md create mode 100644 experiment_suites/2026-04-04-imf-horizon-grid/leaderboard.csv create mode 100644 experiment_suites/2026-04-04-imf-horizon-grid/manifest.json create mode 100644 experiment_suites/2026-04-04-imf-horizon-grid/notes.md create mode 100644 experiment_suites/2026-04-04-imf-horizon-grid/phase1_summary.md create mode 100644 experiment_suites/2026-04-04-imf-horizon-grid/status.json create mode 100644 tests/test_eval_vla_headless_import.py diff --git a/docs/superpowers/plans/2026-04-04-imf-horizon-grid-and-attnres-ablation.md b/docs/superpowers/plans/2026-04-04-imf-horizon-grid-and-attnres-ablation.md new file mode 100644 index 0000000..69e088d --- /dev/null +++ b/docs/superpowers/plans/2026-04-04-imf-horizon-grid-and-attnres-ablation.md @@ -0,0 +1,68 @@ +# IMF Horizon Grid and AttnRes Ablation Implementation Plan + +> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Run a 6-run Phase-1 IMF horizon/action-step experiment grid across available GPUs, monitor progress and collect best rollout metrics, then use the best horizon setting for a Phase-2 visual-attnres ablation. + +**Architecture:** Use the current IMF training code as-is for Phase-1 by sweeping explicit `(pred_horizon, num_action_steps)` overrides while keeping emb=384, layer=12, and max_steps=50k fixed. Maintain a local experiment suite directory with a manifest and machine-readable status snapshots so progress can be resumed and summarized across turns. After Phase-1 completes, compare the current head-only attnres setup against a variant that also adds attnres into the visual ResNet path. + +**Tech Stack:** Python, Hydra/OmegaConf, PyTorch, SSH/Tailscale, JSON/CSV status files, SwanLab. + +--- + +### Task 1: Prepare the experiment suite manifest and state tracking + +**Files:** +- Create: `experiment_suites/2026-04-04-imf-horizon-grid/manifest.json` +- Create: `experiment_suites/2026-04-04-imf-horizon-grid/status.json` +- Create: `experiment_suites/2026-04-04-imf-horizon-grid/notes.md` + +- [ ] Define the 6 legal Phase-1 combinations: `(8,8)`, `(16,8)`, `(16,16)`, `(32,8)`, `(32,16)`, `(32,32)`. +- [ ] Record for each run: name, host, GPU slot, command, log path, SwanLab run name, and completion criteria. +- [ ] Define the comparison metric as the maximum rollout average reward seen during training (`max avg_reward`), preferably read from the best-checkpoint metadata and cross-checked against logs. +- [ ] Keep `status.json` updated with per-run state: queued / running / finished / failed plus latest parsed progress. + +### Task 2: Prepare the remote 8-GPU execution target + +**Files:** +- Remote working directory under `/home/droid/` +- Reuse or create a synced code directory for this suite + +- [ ] Verify the remote dataset path and environment path. +- [ ] Verify GPU availability and reserve 6 GPUs for Phase-1 launches. 
+- [ ] Sync the required code to a dedicated remote suite directory. +- [ ] Record exact remote paths back into the local suite manifest. + +### Task 3: Launch the 6 Phase-1 experiments in parallel + +**Files:** +- Reuse: `roboimi/demos/vla_scripts/train_vla.py` +- Modify only local suite tracking files unless a launch bug is discovered + +- [ ] Launch 6 runs concurrently with fixed settings: IMF, emb=384, layer=12, max_steps=50k. +- [ ] Keep all other relevant training hyperparameters aligned to the current strong baseline unless a concrete blocker appears. +- [ ] Assign one GPU per run on the 8xL20 host. +- [ ] Capture PID, log path, and SwanLab URL for each run in `status.json`. + +### Task 4: Monitor and summarize Phase-1 until all 6 finish + +**Files:** +- Update: `experiment_suites/2026-04-04-imf-horizon-grid/status.json` +- Update: `experiment_suites/2026-04-04-imf-horizon-grid/notes.md` + +- [ ] Periodically parse each run’s log/checkpoints to extract latest step, latest rollout reward, and best rollout reward so far. +- [ ] Keep a resumable local summary so progress can be continued in later turns without rediscovery. +- [ ] After all 6 runs finish, rank them by `max avg_reward` and write a compact Phase-1 summary. + +### Task 5: Prepare the Phase-2 visual-attnres ablation + +**Files:** +- Likely modify: vision backbone implementation and config files (to be confirmed after code inspection) +- Add/update targeted tests for the visual backbone path if code changes are needed + +- [ ] Use the best Phase-1 `(pred_horizon, num_action_steps)` combination as the fixed rollout setting for Phase-2. +- [ ] Compare: + 1. current setup: attnres only in the IMF head + 2. ablation setup: attnres in both IMF head and visual encoder path +- [ ] Keep the rest of the training settings fixed. +- [ ] Launch and monitor the Phase-2 pair after Phase-1 summary is complete. 
diff --git a/experiment_suites/2026-04-04-imf-horizon-grid/leaderboard.csv b/experiment_suites/2026-04-04-imf-horizon-grid/leaderboard.csv new file mode 100644 index 0000000..908eff0 --- /dev/null +++ b/experiment_suites/2026-04-04-imf-horizon-grid/leaderboard.csv @@ -0,0 +1,7 @@ +rank,run_id,status,pred_horizon,num_action_steps,best_rollout_avg_reward,best_step,final_step,final_loss,host,run_dir +1,ph16_ex8,finished,16,8,610.8,21874,50000,0.0034315965604037046,100.73.14.65,/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223 +2,ph16_ex16,finished,16,16,561.2,48124,50000,0.004544622730463743,100.119.99.14,/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223 +3,ph32_ex32,finished,32,32,513.2,43749,50000,0.003953303210437298,local,/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy/runs/imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223 +4,ph8_ex8,finished,8,8,415.6,48124,50000,0.007008877582848072,100.73.14.65,/home/droid/roboimi_suite_20260404/runs/imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223 +5,ph32_ex8,finished,32,8,361.6,43749,50000,0.004788532387465239,100.119.99.14,/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223 +6,ph32_ex16,finished,32,16,239.6,48124,50000,0.0038348555099219084,100.119.99.14,/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223 diff --git a/experiment_suites/2026-04-04-imf-horizon-grid/manifest.json b/experiment_suites/2026-04-04-imf-horizon-grid/manifest.json new file mode 100644 index 0000000..862f384 --- /dev/null +++ b/experiment_suites/2026-04-04-imf-horizon-grid/manifest.json @@ -0,0 +1,115 @@ +{ + "suite_name": "2026-04-04-imf-horizon-grid", + "created_at": "2026-04-04 13:19:52", + "updated_at": "2026-04-04 13:19:52", + "phase": "phase1_launching", + "metric": "max_avg_reward", + "baseline": { + "agent": "resnet_imf_attnres", + 
"batch_size": 80, + "lr": 0.00025, + "num_workers": 12, + "max_steps": 50000, + "rollout_val_freq_epochs": 5, + "rollout_num_episodes": 5, + "val_split": 0.0, + "seed": 42, + "scheduler_type": "cosine", + "warmup_steps": 2000, + "min_lr": 1e-06, + "weight_decay": 1e-05, + "grad_clip": 1.0, + "inference_steps": 1, + "embed_dim": 384, + "n_layer": 12, + "n_head": 1, + "n_kv_head": 1, + "freeze_backbone": false, + "pretrained_backbone_weights": null, + "camera_names": [ + "r_vis", + "top", + "front" + ] + }, + "runs": [ + { + "id": "ph8_ex8", + "pred_horizon": 8, + "num_action_steps": 8, + "host": "100.73.14.65", + "host_label": "tailnet-5880", + "gpu": 0, + "workdir": "/home/droid/roboimi_suite_20260404", + "python": "/home/droid/miniforge3/envs/roboimi/bin/python", + "dataset_dir": "/home/droid/sim_dataset/sim_transfer", + "run_name": "imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223", + "launch_state": "ready" + }, + { + "id": "ph16_ex8", + "pred_horizon": 16, + "num_action_steps": 8, + "host": "100.73.14.65", + "host_label": "tailnet-5880", + "gpu": 1, + "workdir": "/home/droid/roboimi_suite_20260404", + "python": "/home/droid/miniforge3/envs/roboimi/bin/python", + "dataset_dir": "/home/droid/sim_dataset/sim_transfer", + "run_name": "imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223", + "launch_state": "ready" + }, + { + "id": "ph16_ex16", + "pred_horizon": 16, + "num_action_steps": 16, + "host": "100.119.99.14", + "host_label": "tailnet-l20", + "gpu": 0, + "workdir": "/home/droid/roboimi_suite_20260404", + "python": "/home/droid/miniforge3/envs/roboimi/bin/python", + "dataset_dir": "/home/droid/sim_dataset/sim_transfer", + "run_name": "imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223", + "launch_state": "provisioning_required" + }, + { + "id": "ph32_ex8", + "pred_horizon": 32, + "num_action_steps": 8, + "host": "100.119.99.14", + "host_label": "tailnet-l20", + "gpu": 1, + "workdir": "/home/droid/roboimi_suite_20260404", + "python": 
"/home/droid/miniforge3/envs/roboimi/bin/python", + "dataset_dir": "/home/droid/sim_dataset/sim_transfer", + "run_name": "imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223", + "launch_state": "provisioning_required" + }, + { + "id": "ph32_ex16", + "pred_horizon": 32, + "num_action_steps": 16, + "host": "100.119.99.14", + "host_label": "tailnet-l20", + "gpu": 2, + "workdir": "/home/droid/roboimi_suite_20260404", + "python": "/home/droid/miniforge3/envs/roboimi/bin/python", + "dataset_dir": "/home/droid/sim_dataset/sim_transfer", + "run_name": "imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223", + "launch_state": "provisioning_required" + }, + { + "id": "ph32_ex32", + "pred_horizon": 32, + "num_action_steps": 32, + "host": "local", + "host_label": "local-5090", + "gpu": 0, + "workdir": "/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy", + "python": "/home/droid/.conda/envs/roboimi/bin/python", + "dataset_dir": "/home/droid/project/diana_sim/sim_transfer", + "run_name": "imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223", + "launch_state": "ready" + } + ] +} diff --git a/experiment_suites/2026-04-04-imf-horizon-grid/notes.md b/experiment_suites/2026-04-04-imf-horizon-grid/notes.md new file mode 100644 index 0000000..e30da26 --- /dev/null +++ b/experiment_suites/2026-04-04-imf-horizon-grid/notes.md @@ -0,0 +1,20 @@ +# IMF Horizon Grid Suite Notes + +- Created: 2026-04-04 13:19:52 +- Phase-1 matrix: (8,8), (16,8), (16,16), (32,8), (32,16), (32,32) +- Fixed baseline: IMF AttnRes, n_emb=384, n_layer=12, batch_size=80, lr=2.5e-4, max_steps=50k, rollout every 5 epochs with 5 episodes. +- Host allocation: + - local RTX 5090: ph32_ex32 + - 100.73.14.65 RTX 5880 GPU0: ph8_ex8 + - 100.73.14.65 RTX 5880 GPU1: ph16_ex8 + - 100.119.99.14 L20 GPU0: ph16_ex16 + - 100.119.99.14 L20 GPU1: ph32_ex8 + - 100.119.99.14 L20 GPU2: ph32_ex16 +- 100.119.99.14 still needs env + dataset + swanlab credential copy before launch. 
+ +- 2026-04-04 13:23:43: launched local ph32_ex32 (pid 1437836), remote 100.73 ph8_ex8 (pid 931824), ph16_ex8 (pid 931826); started 100.119 bootstrap (local pid 1437837). +- 2026-04-04 13:25:43: first status sync — local ph32_ex32 step≈500; remote ph8_ex8 step≈400; remote ph16_ex8 step≈400. +- 2026-04-04 13:27:41: second status sync — 100.119 bootstrap finished env copy and entered dataset copy; local ph32_ex32 step≈900; remote ph8_ex8 step≈800; remote ph16_ex8 step≈800. +- 2026-04-04 13:35:31: 100.119 bootstrap data/env copy finished. Original validation command hit a quoting bug, then I manually revalidated torch+mujoco+swanlab and launched ph16_ex16/ph32_ex8/ph32_ex16 with pids 81129/81130/81131. +- 2026-04-04 13:37:36: all 6 Phase-1 runs are now up. SwanLab links recorded in status.json; latest observed steps ~ local 900 / 5880 runs 800 / L20 runs 100. +- 2026-04-04 14:41:08: diagnosed the remote first-rollout crash: eval_vla.py imported mujoco at module import time (via raw_action_trajectory_viewer), before MUJOCO_GL=egl was set. Added regression test tests/test_eval_vla_headless_import.py, made that import lazy, verified 20-step headless eval on 5880 and L20, then resumed the 5 failed runs from step 4374. Current resumed pids: ph8_ex8=938714, ph16_ex8=938717, ph16_ex16=90169, ph32_ex8=90173, ph32_ex16=90175. diff --git a/experiment_suites/2026-04-04-imf-horizon-grid/phase1_summary.md b/experiment_suites/2026-04-04-imf-horizon-grid/phase1_summary.md new file mode 100644 index 0000000..d8caafd --- /dev/null +++ b/experiment_suites/2026-04-04-imf-horizon-grid/phase1_summary.md @@ -0,0 +1,38 @@ +# Phase-1 IMF Horizon Grid Summary + +- Generated: 2026-04-04 23:43:38 +- Fixed baseline: IMF AttnRes head, n_emb=384, n_layer=12, batch_size=80, lr=2.5e-4, max_steps=50k, rollout every 5 epochs with 5 episodes, 3 cameras `[r_vis, top, front]`. +- Primary metric: `checkpoints/vla_model_best.pt -> rollout_avg_reward` (max training-time rollout average reward). 
+ +## Ranked results + +| Rank | Run ID | pred_horizon | num_action_steps | Best avg_reward | Best step | Final loss | Host | +|---:|---|---:|---:|---:|---:|---:|---| +| 1 | `ph16_ex8` | 16 | 8 | 610.8 | 21874 | 0.0034 | 100.73.14.65 | +| 2 | `ph16_ex16` | 16 | 16 | 561.2 | 48124 | 0.0045 | 100.119.99.14 | +| 3 | `ph32_ex32` | 32 | 32 | 513.2 | 43749 | 0.0040 | local | +| 4 | `ph8_ex8` | 8 | 8 | 415.6 | 48124 | 0.0070 | 100.73.14.65 | +| 5 | `ph32_ex8` | 32 | 8 | 361.6 | 43749 | 0.0048 | 100.119.99.14 | +| 6 | `ph32_ex16` | 32 | 16 | 239.6 | 48124 | 0.0038 | 100.119.99.14 | + +## Main observations + +- Best overall setting was **`pred_horizon=16`, `num_action_steps=8`** with **max avg_reward = 610.8** at step **21874**. +- Comparing horizon 16: executing 8 steps outperformed executing 16 steps (`ph16_ex8` > `ph16_ex16`). +- Comparing horizon 32: executing the full 32-step chunk was much better than executing 16 or 8 steps (`ph32_ex32` > `ph32_ex8` > `ph32_ex16`). +- Short horizon 8 with 8-step execution ranked 4th (415.6): ahead of both partial-execution horizon-32 runs, but clearly below the 16/8, 16/16, and 32/32 settings. +- In this sweep, increasing prediction horizon helped only when the executed chunk length matched a good control cadence; mismatch could hurt a lot (especially `ph32_ex16`). 
+ +## Raw results + +- `ph16_ex8`: best avg_reward=610.8 @ step 21874, final_loss=0.0034, run_dir=`/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223` +- `ph16_ex16`: best avg_reward=561.2 @ step 48124, final_loss=0.0045, run_dir=`/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223` +- `ph32_ex32`: best avg_reward=513.2 @ step 43749, final_loss=0.0040, run_dir=`/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy/runs/imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223` +- `ph8_ex8`: best avg_reward=415.6 @ step 48124, final_loss=0.0070, run_dir=`/home/droid/roboimi_suite_20260404/runs/imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223` +- `ph32_ex8`: best avg_reward=361.6 @ step 43749, final_loss=0.0048, run_dir=`/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223` +- `ph32_ex16`: best avg_reward=239.6 @ step 48124, final_loss=0.0038, run_dir=`/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223` + +## Recommendation for Phase-2 anchor + +- Use **`pred_horizon=16`, `num_action_steps=8`** as the strongest Phase-1 baseline if the goal is purely maximizing rollout reward. +- `ph16_ex8` executes only 8 of its 16 predicted steps and is the strongest setting that does not execute a full 32-step chunk, so it remains a good comparison anchor even if Phase-2 needs a more conservative action-execution budget. 
diff --git a/experiment_suites/2026-04-04-imf-horizon-grid/status.json b/experiment_suites/2026-04-04-imf-horizon-grid/status.json new file mode 100644 index 0000000..0cae42c --- /dev/null +++ b/experiment_suites/2026-04-04-imf-horizon-grid/status.json @@ -0,0 +1,165 @@ +{ + "suite_name": "2026-04-04-imf-horizon-grid", + "updated_at": "2026-04-04 23:46:01", + "phase": "phase1_completed", + "provisioning": { + "100.119.99.14": { + "state": "completed_manual_launch", + "bootstrap_pid_local": 1437837, + "log_path": "experiment_suites/2026-04-04-imf-horizon-grid/provision_logs/100.119.99.14-bootstrap-20260404-131223.log", + "env_copy": "completed", + "dataset_copy": "completed", + "launch_watcher_pid_local": null, + "launch_watcher_log": "experiment_suites/2026-04-04-imf-horizon-grid/launch_logs/100.119.99.14-launch-watcher-20260404-131223.log", + "swanlab_copy": "completed", + "bootstrap_validation_note": "initial validation command had a quoting bug; manual validation passed and launches were started successfully" + } + }, + "runs": { + "ph8_ex8": { + "status": "finished", + "host": "100.73.14.65", + "gpu": 0, + "run_name": "imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223", + "workdir": "/home/droid/roboimi_suite_20260404", + "dataset_dir": "/home/droid/sim_dataset/sim_transfer", + "log_path": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223/train_vla.log", + "run_dir": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223", + "pred_horizon": 8, + "num_action_steps": 8, + "pid": 938714, + "launch_log": "experiment_suite_launch_logs/imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223.restartfix-20260404-143827.log", + "latest_step": 50000, + "latest_log_sync": "2026-04-04 23:42:34", + "swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/i5syc57b6zq7rbkrtqy7b", + "process_running": false, + "best_step": 48124, + "best_rollout_avg_reward": 415.6, + 
"final_loss": 0.007008877582848072 + }, + "ph16_ex8": { + "status": "finished", + "host": "100.73.14.65", + "gpu": 1, + "run_name": "imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223", + "workdir": "/home/droid/roboimi_suite_20260404", + "dataset_dir": "/home/droid/sim_dataset/sim_transfer", + "log_path": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223/train_vla.log", + "run_dir": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223", + "pred_horizon": 16, + "num_action_steps": 8, + "pid": 938717, + "launch_log": "experiment_suite_launch_logs/imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223.restartfix-20260404-143827.log", + "latest_step": 50000, + "latest_log_sync": "2026-04-04 23:42:34", + "swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/4rusbrpfxmw4ffii1ul5w", + "process_running": false, + "best_step": 21874, + "best_rollout_avg_reward": 610.8, + "final_loss": 0.0034315965604037046 + }, + "ph16_ex16": { + "status": "finished", + "host": "100.119.99.14", + "gpu": 0, + "run_name": "imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223", + "workdir": "/home/droid/roboimi_suite_20260404", + "dataset_dir": "/home/droid/sim_dataset/sim_transfer", + "log_path": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223/train_vla.log", + "run_dir": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223", + "pred_horizon": 16, + "num_action_steps": 16, + "pid": 90169, + "launch_log": "experiment_suite_launch_logs/imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223.restartfix-20260404-143827.log", + "latest_log_sync": "2026-04-04 23:42:34", + "latest_step": 50000, + "swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/wwm232k6190gexnze8mg6", + "process_running": false, + "best_step": 48124, + "best_rollout_avg_reward": 561.2, + "final_loss": 
0.004544622730463743 + }, + "ph32_ex8": { + "status": "finished", + "host": "100.119.99.14", + "gpu": 1, + "run_name": "imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223", + "workdir": "/home/droid/roboimi_suite_20260404", + "dataset_dir": "/home/droid/sim_dataset/sim_transfer", + "log_path": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223/train_vla.log", + "run_dir": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223", + "pred_horizon": 32, + "num_action_steps": 8, + "pid": 90173, + "launch_log": "experiment_suite_launch_logs/imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223.restartfix-20260404-143827.log", + "latest_log_sync": "2026-04-04 23:42:34", + "latest_step": 50000, + "swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/o5y2xjb2rsb3lmfcuhy4p", + "process_running": false, + "best_step": 43749, + "best_rollout_avg_reward": 361.6, + "final_loss": 0.004788532387465239 + }, + "ph32_ex16": { + "status": "finished", + "host": "100.119.99.14", + "gpu": 2, + "run_name": "imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223", + "workdir": "/home/droid/roboimi_suite_20260404", + "dataset_dir": "/home/droid/sim_dataset/sim_transfer", + "log_path": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223/train_vla.log", + "run_dir": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223", + "pred_horizon": 32, + "num_action_steps": 16, + "pid": 90175, + "launch_log": "experiment_suite_launch_logs/imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223.restartfix-20260404-143827.log", + "latest_log_sync": "2026-04-04 23:42:34", + "latest_step": 50000, + "swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/54cjpgba9eqsopdm0l8d3", + "process_running": false, + "best_step": 48124, + "best_rollout_avg_reward": 239.6, + "final_loss": 0.0038348555099219084 + }, + 
"ph32_ex32": { + "status": "finished", + "host": "local", + "gpu": 0, + "run_name": "imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223", + "workdir": "/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy", + "dataset_dir": "/home/droid/project/diana_sim/sim_transfer", + "log_path": "/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy/runs/imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223/train_vla.log", + "run_dir": "/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy/runs/imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223", + "pred_horizon": 32, + "num_action_steps": 32, + "pid": 1437836, + "launch_log": "experiment_suites/2026-04-04-imf-horizon-grid/launch_logs/imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223.launch.log", + "latest_step": 50000, + "latest_log_sync": "2026-04-04 23:42:34", + "swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/ajs2m218jd260hawhy5ns", + "process_running": false, + "latest_rollout_avg_reward": 513.2, + "best_rollout_avg_reward": 513.2, + "best_step": 43749, + "final_loss": 0.003953303210437298 + } + }, + "monitor": { + "state": "running", + "pid_local": 1443268, + "log_path": "experiment_suites/2026-04-04-imf-horizon-grid/monitor_logs/status-sync-20260404-131223.log", + "interval_seconds": 300 + }, + "debug": { + "remote_rollout_failure_20260404": { + "root_cause": "eval_vla.py imported raw_action_trajectory_viewer at module import time, which imported mujoco before MUJOCO_GL=egl was set; remote headless rollout then fell back to GLFW/X11 and crashed with mujoco.FatalError: gladLoadGL error during env.reset()->mj.Renderer(...)", + "fixed_file": "roboimi/demos/vla_scripts/eval_vla.py", + "verification": { + "pytest": "tests/test_eval_vla_headless_import.py passed", + "remote_eval_5880": "1 episode x 20 steps headless eval passed", + "remote_eval_l20": "1 episode x 20 steps headless eval passed" + } + } + }, + "phase1_summary_md": 
"/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy/experiment_suites/2026-04-04-imf-horizon-grid/phase1_summary.md" +} diff --git a/roboimi/demos/vla_scripts/eval_vla.py b/roboimi/demos/vla_scripts/eval_vla.py index a8003dd..5b5af41 100644 --- a/roboimi/demos/vla_scripts/eval_vla.py +++ b/roboimi/demos/vla_scripts/eval_vla.py @@ -26,7 +26,6 @@ from hydra.utils import instantiate from einops import rearrange from roboimi.utils.act_ex_utils import sample_transfer_pose -from roboimi.utils.raw_action_trajectory_viewer import build_trajectory_capsule_markers from roboimi.vla.eval_utils import execute_policy_action sys.path.append(os.getcwd()) @@ -362,6 +361,13 @@ def _save_rollout_trajectory_image( if output_path is None or camera_name is None: return None + # IMPORTANT: + # Keep this import lazy so headless rollout can set MUJOCO_GL=egl before + # anything imports mujoco. Importing this helper at module import time would + # pull in mujoco too early on remote headless hosts and make rollout fail + # with gladLoadGL / missing DISPLAY errors. 
+ from roboimi.utils.raw_action_trajectory_viewer import build_trajectory_capsule_markers + output_path = str(output_path) Path(output_path).parent.mkdir(parents=True, exist_ok=True) diff --git a/tests/test_eval_vla_headless_import.py b/tests/test_eval_vla_headless_import.py new file mode 100644 index 0000000..e9d4763 --- /dev/null +++ b/tests/test_eval_vla_headless_import.py @@ -0,0 +1,26 @@ +import json +import os +import subprocess +import sys + + +def test_eval_vla_import_does_not_import_mujoco_early_when_headless_backend_not_set(): + env = os.environ.copy() + env.pop('MUJOCO_GL', None) + proc = subprocess.run( + [ + sys.executable, + '-c', + ( + 'import json, sys; ' + 'from roboimi.demos.vla_scripts import eval_vla; ' + 'print(json.dumps({"mujoco_in_sys_modules": "mujoco" in sys.modules}))' + ), + ], + capture_output=True, + text=True, + env=env, + check=True, + ) + payload = json.loads(proc.stdout.strip()) + assert payload['mujoco_in_sys_modules'] is False