fix: stabilize headless rollout and summarize phase1 grid

This commit is contained in:
Logic
2026-04-04 23:47:15 +08:00
parent 0586a6e6c7
commit a78006808a
8 changed files with 446 additions and 1 deletions

View File

@@ -0,0 +1,7 @@
rank,run_id,status,pred_horizon,num_action_steps,best_rollout_avg_reward,best_step,final_step,final_loss,host,run_dir
1,ph16_ex8,finished,16,8,610.8,21874,50000,0.0034315965604037046,100.73.14.65,/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223
2,ph16_ex16,finished,16,16,561.2,48124,50000,0.004544622730463743,100.119.99.14,/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223
3,ph32_ex32,finished,32,32,513.2,43749,50000,0.003953303210437298,local,/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy/runs/imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223
4,ph8_ex8,finished,8,8,415.6,48124,50000,0.007008877582848072,100.73.14.65,/home/droid/roboimi_suite_20260404/runs/imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223
5,ph32_ex8,finished,32,8,361.6,43749,50000,0.004788532387465239,100.119.99.14,/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223
6,ph32_ex16,finished,32,16,239.6,48124,50000,0.0038348555099219084,100.119.99.14,/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223
1 rank run_id status pred_horizon num_action_steps best_rollout_avg_reward best_step final_step final_loss host run_dir
2 1 ph16_ex8 finished 16 8 610.8 21874 50000 0.0034315965604037046 100.73.14.65 /home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223
3 2 ph16_ex16 finished 16 16 561.2 48124 50000 0.004544622730463743 100.119.99.14 /home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223
4 3 ph32_ex32 finished 32 32 513.2 43749 50000 0.003953303210437298 local /home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy/runs/imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223
5 4 ph8_ex8 finished 8 8 415.6 48124 50000 0.007008877582848072 100.73.14.65 /home/droid/roboimi_suite_20260404/runs/imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223
6 5 ph32_ex8 finished 32 8 361.6 43749 50000 0.004788532387465239 100.119.99.14 /home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223
7 6 ph32_ex16 finished 32 16 239.6 48124 50000 0.0038348555099219084 100.119.99.14 /home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223

View File

@@ -0,0 +1,115 @@
{
"suite_name": "2026-04-04-imf-horizon-grid",
"created_at": "2026-04-04 13:19:52",
"updated_at": "2026-04-04 13:19:52",
"phase": "phase1_launching",
"metric": "max_avg_reward",
"baseline": {
"agent": "resnet_imf_attnres",
"batch_size": 80,
"lr": 0.00025,
"num_workers": 12,
"max_steps": 50000,
"rollout_val_freq_epochs": 5,
"rollout_num_episodes": 5,
"val_split": 0.0,
"seed": 42,
"scheduler_type": "cosine",
"warmup_steps": 2000,
"min_lr": 1e-06,
"weight_decay": 1e-05,
"grad_clip": 1.0,
"inference_steps": 1,
"embed_dim": 384,
"n_layer": 12,
"n_head": 1,
"n_kv_head": 1,
"freeze_backbone": false,
"pretrained_backbone_weights": null,
"camera_names": [
"r_vis",
"top",
"front"
]
},
"runs": [
{
"id": "ph8_ex8",
"pred_horizon": 8,
"num_action_steps": 8,
"host": "100.73.14.65",
"host_label": "tailnet-5880",
"gpu": 0,
"workdir": "/home/droid/roboimi_suite_20260404",
"python": "/home/droid/miniforge3/envs/roboimi/bin/python",
"dataset_dir": "/home/droid/sim_dataset/sim_transfer",
"run_name": "imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223",
"launch_state": "ready"
},
{
"id": "ph16_ex8",
"pred_horizon": 16,
"num_action_steps": 8,
"host": "100.73.14.65",
"host_label": "tailnet-5880",
"gpu": 1,
"workdir": "/home/droid/roboimi_suite_20260404",
"python": "/home/droid/miniforge3/envs/roboimi/bin/python",
"dataset_dir": "/home/droid/sim_dataset/sim_transfer",
"run_name": "imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223",
"launch_state": "ready"
},
{
"id": "ph16_ex16",
"pred_horizon": 16,
"num_action_steps": 16,
"host": "100.119.99.14",
"host_label": "tailnet-l20",
"gpu": 0,
"workdir": "/home/droid/roboimi_suite_20260404",
"python": "/home/droid/miniforge3/envs/roboimi/bin/python",
"dataset_dir": "/home/droid/sim_dataset/sim_transfer",
"run_name": "imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223",
"launch_state": "provisioning_required"
},
{
"id": "ph32_ex8",
"pred_horizon": 32,
"num_action_steps": 8,
"host": "100.119.99.14",
"host_label": "tailnet-l20",
"gpu": 1,
"workdir": "/home/droid/roboimi_suite_20260404",
"python": "/home/droid/miniforge3/envs/roboimi/bin/python",
"dataset_dir": "/home/droid/sim_dataset/sim_transfer",
"run_name": "imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223",
"launch_state": "provisioning_required"
},
{
"id": "ph32_ex16",
"pred_horizon": 32,
"num_action_steps": 16,
"host": "100.119.99.14",
"host_label": "tailnet-l20",
"gpu": 2,
"workdir": "/home/droid/roboimi_suite_20260404",
"python": "/home/droid/miniforge3/envs/roboimi/bin/python",
"dataset_dir": "/home/droid/sim_dataset/sim_transfer",
"run_name": "imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223",
"launch_state": "provisioning_required"
},
{
"id": "ph32_ex32",
"pred_horizon": 32,
"num_action_steps": 32,
"host": "local",
"host_label": "local-5090",
"gpu": 0,
"workdir": "/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy",
"python": "/home/droid/.conda/envs/roboimi/bin/python",
"dataset_dir": "/home/droid/project/diana_sim/sim_transfer",
"run_name": "imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223",
"launch_state": "ready"
}
]
}

View File

@@ -0,0 +1,20 @@
# IMF Horizon Grid Suite Notes
- Created: 2026-04-04 13:19:52
- Phase-1 matrix: (8,8), (16,8), (16,16), (32,8), (32,16), (32,32)
- Fixed baseline: IMF AttnRes, n_emb=384, n_layer=12, batch_size=80, lr=2.5e-4, max_steps=50k, rollout every 5 epochs with 5 episodes.
- Host allocation:
  - local RTX 5090: ph32_ex32
  - 100.73.14.65 RTX 5880 GPU0: ph8_ex8
  - 100.73.14.65 RTX 5880 GPU1: ph16_ex8
  - 100.119.99.14 L20 GPU0: ph16_ex16
  - 100.119.99.14 L20 GPU1: ph32_ex8
  - 100.119.99.14 L20 GPU2: ph32_ex16
- 100.119.99.14 still needs env + dataset + swanlab credential copy before launch.
- 2026-04-04 13:23:43: launched local ph32_ex32 (pid 1437836), remote 100.73 ph8_ex8 (pid 931824), ph16_ex8 (pid 931826); started 100.119 bootstrap (local pid 1437837).
- 2026-04-04 13:25:43: first status sync — local ph32_ex32 step≈500; remote ph8_ex8 step≈400; remote ph16_ex8 step≈400.
- 2026-04-04 13:27:41: second status sync — 100.119 bootstrap finished env copy and entered dataset copy; local ph32_ex32 step≈900; remote ph8_ex8 step≈800; remote ph16_ex8 step≈800.
- 2026-04-04 13:35:31: 100.119 bootstrap data/env copy finished. The original validation command hit a quoting bug, so torch+mujoco+swanlab were revalidated manually; ph16_ex16/ph32_ex8/ph32_ex16 were then launched with pids 81129/81130/81131.
- 2026-04-04 13:37:36: all 6 Phase-1 runs are now up. SwanLab links recorded in status.json; latest observed steps ~ local 900 / 5880 runs 800 / L20 runs 100.
- 2026-04-04 14:41:08: diagnosed the remote first-rollout crash: eval_vla.py imported raw_action_trajectory_viewer at module import time, which pulled in mujoco before MUJOCO_GL=egl was set. Added regression test tests/test_eval_vla_headless_import.py, changed the import to lazy-load, verified a 20-step headless eval on both 5880 and L20, then resumed the 5 failed runs from step 4374. Current resumed pids: ph8_ex8=938714, ph16_ex8=938717, ph16_ex16=90169, ph32_ex8=90173, ph32_ex16=90175.

View File

@@ -0,0 +1,38 @@
# Phase-1 IMF Horizon Grid Summary
- Generated: 2026-04-04 23:43:38
- Fixed baseline: IMF AttnRes head, n_emb=384, n_layer=12, batch_size=80, lr=2.5e-4, max_steps=50k, rollout every 5 epochs with 5 episodes, 3 cameras `[r_vis, top, front]`.
- Primary metric: `checkpoints/vla_model_best.pt -> rollout_avg_reward` (max training-time rollout average reward).
## Ranked results
| Rank | Run ID | pred_horizon | num_action_steps | Best avg_reward | Best step | Final loss | Host |
|---:|---|---:|---:|---:|---:|---:|---|
| 1 | `ph16_ex8` | 16 | 8 | 610.8 | 21874 | 0.0034 | 100.73.14.65 |
| 2 | `ph16_ex16` | 16 | 16 | 561.2 | 48124 | 0.0045 | 100.119.99.14 |
| 3 | `ph32_ex32` | 32 | 32 | 513.2 | 43749 | 0.0040 | local |
| 4 | `ph8_ex8` | 8 | 8 | 415.6 | 48124 | 0.0070 | 100.73.14.65 |
| 5 | `ph32_ex8` | 32 | 8 | 361.6 | 43749 | 0.0048 | 100.119.99.14 |
| 6 | `ph32_ex16` | 32 | 16 | 239.6 | 48124 | 0.0038 | 100.119.99.14 |
## Main observations
- Best overall setting was **`pred_horizon=16`, `num_action_steps=8`** with **max avg_reward = 610.8** at step **21874**.
- Comparing horizon 16: executing 8 steps outperformed executing 16 steps (`ph16_ex8` > `ph16_ex16`).
- Comparing horizon 32: executing the full 32-step chunk was much better than executing 16 or 8 steps (`ph32_ex32` > `ph32_ex8` > `ph32_ex16`).
- Short horizon 8 with 8-step execution (`ph8_ex8`, 415.6) ranked fourth: ahead of the two partial-execution horizon-32 runs, but clearly below the 16/8, 16/16 and 32/32 settings.
- In this sweep, increasing prediction horizon helped only when the executed chunk length matched a good control cadence; mismatch could hurt a lot (especially `ph32_ex16`).
## Raw results
- `ph16_ex8`: best avg_reward=610.8 @ step 21874, final_loss=0.0034, run_dir=`/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223`
- `ph16_ex16`: best avg_reward=561.2 @ step 48124, final_loss=0.0045, run_dir=`/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223`
- `ph32_ex32`: best avg_reward=513.2 @ step 43749, final_loss=0.0040, run_dir=`/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy/runs/imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223`
- `ph8_ex8`: best avg_reward=415.6 @ step 48124, final_loss=0.0070, run_dir=`/home/droid/roboimi_suite_20260404/runs/imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223`
- `ph32_ex8`: best avg_reward=361.6 @ step 43749, final_loss=0.0048, run_dir=`/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223`
- `ph32_ex16`: best avg_reward=239.6 @ step 48124, final_loss=0.0038, run_dir=`/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223`
## Recommendation for Phase-2 anchor
- Use **`pred_horizon=16`, `num_action_steps=8`** as the strongest Phase-1 baseline if the goal is purely maximizing rollout reward.
- If Phase-2 needs a more conservative action execution budget, the same `ph16_ex8` setting still applies: it is also the strongest configuration that does not execute a full 32-step chunk, so it can serve as the comparison anchor as well.

View File

@@ -0,0 +1,165 @@
{
"suite_name": "2026-04-04-imf-horizon-grid",
"updated_at": "2026-04-04 23:46:01",
"phase": "phase1_completed",
"provisioning": {
"100.119.99.14": {
"state": "completed_manual_launch",
"bootstrap_pid_local": 1437837,
"log_path": "experiment_suites/2026-04-04-imf-horizon-grid/provision_logs/100.119.99.14-bootstrap-20260404-131223.log",
"env_copy": "completed",
"dataset_copy": "completed",
"launch_watcher_pid_local": null,
"launch_watcher_log": "experiment_suites/2026-04-04-imf-horizon-grid/launch_logs/100.119.99.14-launch-watcher-20260404-131223.log",
"swanlab_copy": "completed",
"bootstrap_validation_note": "initial validation command had a quoting bug; manual validation passed and launches were started successfully"
}
},
"runs": {
"ph8_ex8": {
"status": "finished",
"host": "100.73.14.65",
"gpu": 0,
"run_name": "imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223",
"workdir": "/home/droid/roboimi_suite_20260404",
"dataset_dir": "/home/droid/sim_dataset/sim_transfer",
"log_path": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223/train_vla.log",
"run_dir": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223",
"pred_horizon": 8,
"num_action_steps": 8,
"pid": 938714,
"launch_log": "experiment_suite_launch_logs/imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223.restartfix-20260404-143827.log",
"latest_step": 50000,
"latest_log_sync": "2026-04-04 23:42:34",
"swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/i5syc57b6zq7rbkrtqy7b",
"process_running": false,
"best_step": 48124,
"best_rollout_avg_reward": 415.6,
"final_loss": 0.007008877582848072
},
"ph16_ex8": {
"status": "finished",
"host": "100.73.14.65",
"gpu": 1,
"run_name": "imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223",
"workdir": "/home/droid/roboimi_suite_20260404",
"dataset_dir": "/home/droid/sim_dataset/sim_transfer",
"log_path": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223/train_vla.log",
"run_dir": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223",
"pred_horizon": 16,
"num_action_steps": 8,
"pid": 938717,
"launch_log": "experiment_suite_launch_logs/imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223.restartfix-20260404-143827.log",
"latest_step": 50000,
"latest_log_sync": "2026-04-04 23:42:34",
"swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/4rusbrpfxmw4ffii1ul5w",
"process_running": false,
"best_step": 21874,
"best_rollout_avg_reward": 610.8,
"final_loss": 0.0034315965604037046
},
"ph16_ex16": {
"status": "finished",
"host": "100.119.99.14",
"gpu": 0,
"run_name": "imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223",
"workdir": "/home/droid/roboimi_suite_20260404",
"dataset_dir": "/home/droid/sim_dataset/sim_transfer",
"log_path": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223/train_vla.log",
"run_dir": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223",
"pred_horizon": 16,
"num_action_steps": 16,
"pid": 90169,
"launch_log": "experiment_suite_launch_logs/imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223.restartfix-20260404-143827.log",
"latest_log_sync": "2026-04-04 23:42:34",
"latest_step": 50000,
"swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/wwm232k6190gexnze8mg6",
"process_running": false,
"best_step": 48124,
"best_rollout_avg_reward": 561.2,
"final_loss": 0.004544622730463743
},
"ph32_ex8": {
"status": "finished",
"host": "100.119.99.14",
"gpu": 1,
"run_name": "imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223",
"workdir": "/home/droid/roboimi_suite_20260404",
"dataset_dir": "/home/droid/sim_dataset/sim_transfer",
"log_path": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223/train_vla.log",
"run_dir": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223",
"pred_horizon": 32,
"num_action_steps": 8,
"pid": 90173,
"launch_log": "experiment_suite_launch_logs/imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223.restartfix-20260404-143827.log",
"latest_log_sync": "2026-04-04 23:42:34",
"latest_step": 50000,
"swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/o5y2xjb2rsb3lmfcuhy4p",
"process_running": false,
"best_step": 43749,
"best_rollout_avg_reward": 361.6,
"final_loss": 0.004788532387465239
},
"ph32_ex16": {
"status": "finished",
"host": "100.119.99.14",
"gpu": 2,
"run_name": "imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223",
"workdir": "/home/droid/roboimi_suite_20260404",
"dataset_dir": "/home/droid/sim_dataset/sim_transfer",
"log_path": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223/train_vla.log",
"run_dir": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223",
"pred_horizon": 32,
"num_action_steps": 16,
"pid": 90175,
"launch_log": "experiment_suite_launch_logs/imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223.restartfix-20260404-143827.log",
"latest_log_sync": "2026-04-04 23:42:34",
"latest_step": 50000,
"swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/54cjpgba9eqsopdm0l8d3",
"process_running": false,
"best_step": 48124,
"best_rollout_avg_reward": 239.6,
"final_loss": 0.0038348555099219084
},
"ph32_ex32": {
"status": "finished",
"host": "local",
"gpu": 0,
"run_name": "imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223",
"workdir": "/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy",
"dataset_dir": "/home/droid/project/diana_sim/sim_transfer",
"log_path": "/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy/runs/imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223/train_vla.log",
"run_dir": "/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy/runs/imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223",
"pred_horizon": 32,
"num_action_steps": 32,
"pid": 1437836,
"launch_log": "experiment_suites/2026-04-04-imf-horizon-grid/launch_logs/imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223.launch.log",
"latest_step": 50000,
"latest_log_sync": "2026-04-04 23:42:34",
"swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/ajs2m218jd260hawhy5ns",
"process_running": false,
"latest_rollout_avg_reward": 513.2,
"best_rollout_avg_reward": 513.2,
"best_step": 43749,
"final_loss": 0.003953303210437298
}
},
"monitor": {
"state": "running",
"pid_local": 1443268,
"log_path": "experiment_suites/2026-04-04-imf-horizon-grid/monitor_logs/status-sync-20260404-131223.log",
"interval_seconds": 300
},
"debug": {
"remote_rollout_failure_20260404": {
"root_cause": "eval_vla.py imported raw_action_trajectory_viewer at module import time, which imported mujoco before MUJOCO_GL=egl was set; remote headless rollout then fell back to GLFW/X11 and crashed with mujoco.FatalError: gladLoadGL error during env.reset()->mj.Renderer(...)",
"fixed_file": "roboimi/demos/vla_scripts/eval_vla.py",
"verification": {
"pytest": "tests/test_eval_vla_headless_import.py passed",
"remote_eval_5880": "1 episode x 20 steps headless eval passed",
"remote_eval_l20": "1 episode x 20 steps headless eval passed"
}
}
},
"phase1_summary_md": "/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy/experiment_suites/2026-04-04-imf-horizon-grid/phase1_summary.md"
}