Files
roboimi/experiment_suites/2026-04-04-imf-horizon-grid/status.json
2026-04-05 00:07:59 +08:00

166 lines
8.2 KiB
JSON

{
"suite_name": "2026-04-04-imf-horizon-grid",
"updated_at": "2026-04-05 00:07:39",
"phase": "phase1_completed",
"provisioning": {
"100.119.99.14": {
"state": "completed_manual_launch",
"bootstrap_pid_local": 1437837,
"log_path": "experiment_suites/2026-04-04-imf-horizon-grid/provision_logs/100.119.99.14-bootstrap-20260404-131223.log",
"env_copy": "completed",
"dataset_copy": "completed",
"launch_watcher_pid_local": null,
"launch_watcher_log": "experiment_suites/2026-04-04-imf-horizon-grid/launch_logs/100.119.99.14-launch-watcher-20260404-131223.log",
"swanlab_copy": "completed",
"bootstrap_validation_note": "initial validation command had a quoting bug; manual validation passed and launches were started successfully"
}
},
"runs": {
"ph8_ex8": {
"status": "running",
"host": "100.73.14.65",
"gpu": 0,
"run_name": "imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223",
"workdir": "/home/droid/roboimi_suite_20260404",
"dataset_dir": "/home/droid/sim_dataset/sim_transfer",
"log_path": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223/train_vla.log",
"run_dir": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223",
"pred_horizon": 8,
"num_action_steps": 8,
"pid": 938714,
"launch_log": "experiment_suite_launch_logs/imf-p1-ph08-ex08-emb384-l12-ms50k-5880g0-20260404-131223.restartfix-20260404-143827.log",
"latest_step": 50000,
"latest_log_sync": "2026-04-05 00:07:39",
"swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/i5syc57b6zq7rbkrtqy7b",
"process_running": true,
"best_step": 48124,
"best_rollout_avg_reward": 415.6,
"final_loss": 0.007008877582848072
},
"ph16_ex8": {
"status": "running",
"host": "100.73.14.65",
"gpu": 1,
"run_name": "imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223",
"workdir": "/home/droid/roboimi_suite_20260404",
"dataset_dir": "/home/droid/sim_dataset/sim_transfer",
"log_path": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223/train_vla.log",
"run_dir": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223",
"pred_horizon": 16,
"num_action_steps": 8,
"pid": 938717,
"launch_log": "experiment_suite_launch_logs/imf-p1-ph16-ex08-emb384-l12-ms50k-5880g1-20260404-131223.restartfix-20260404-143827.log",
"latest_step": 50000,
"latest_log_sync": "2026-04-05 00:07:39",
"swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/4rusbrpfxmw4ffii1ul5w",
"process_running": true,
"best_step": 21874,
"best_rollout_avg_reward": 610.8,
"final_loss": 0.0034315965604037046
},
"ph16_ex16": {
"status": "running",
"host": "100.119.99.14",
"gpu": 0,
"run_name": "imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223",
"workdir": "/home/droid/roboimi_suite_20260404",
"dataset_dir": "/home/droid/sim_dataset/sim_transfer",
"log_path": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223/train_vla.log",
"run_dir": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223",
"pred_horizon": 16,
"num_action_steps": 16,
"pid": 90169,
"launch_log": "experiment_suite_launch_logs/imf-p1-ph16-ex16-emb384-l12-ms50k-l20g0-20260404-131223.restartfix-20260404-143827.log",
"latest_log_sync": "2026-04-05 00:07:39",
"latest_step": 50000,
"swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/wwm232k6190gexnze8mg6",
"process_running": true,
"best_step": 48124,
"best_rollout_avg_reward": 561.2,
"final_loss": 0.004544622730463743
},
"ph32_ex8": {
"status": "running",
"host": "100.119.99.14",
"gpu": 1,
"run_name": "imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223",
"workdir": "/home/droid/roboimi_suite_20260404",
"dataset_dir": "/home/droid/sim_dataset/sim_transfer",
"log_path": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223/train_vla.log",
"run_dir": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223",
"pred_horizon": 32,
"num_action_steps": 8,
"pid": 90173,
"launch_log": "experiment_suite_launch_logs/imf-p1-ph32-ex08-emb384-l12-ms50k-l20g1-20260404-131223.restartfix-20260404-143827.log",
"latest_log_sync": "2026-04-05 00:07:39",
"latest_step": 50000,
"swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/o5y2xjb2rsb3lmfcuhy4p",
"process_running": true,
"best_step": 43749,
"best_rollout_avg_reward": 361.6,
"final_loss": 0.004788532387465239
},
"ph32_ex16": {
"status": "running",
"host": "100.119.99.14",
"gpu": 2,
"run_name": "imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223",
"workdir": "/home/droid/roboimi_suite_20260404",
"dataset_dir": "/home/droid/sim_dataset/sim_transfer",
"log_path": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223/train_vla.log",
"run_dir": "/home/droid/roboimi_suite_20260404/runs/imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223",
"pred_horizon": 32,
"num_action_steps": 16,
"pid": 90175,
"launch_log": "experiment_suite_launch_logs/imf-p1-ph32-ex16-emb384-l12-ms50k-l20g2-20260404-131223.restartfix-20260404-143827.log",
"latest_log_sync": "2026-04-05 00:07:39",
"latest_step": 50000,
"swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/54cjpgba9eqsopdm0l8d3",
"process_running": true,
"best_step": 48124,
"best_rollout_avg_reward": 239.6,
"final_loss": 0.0038348555099219084
},
"ph32_ex32": {
"status": "finished",
"host": "local",
"gpu": 0,
"run_name": "imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223",
"workdir": "/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy",
"dataset_dir": "/home/droid/project/diana_sim/sim_transfer",
"log_path": "/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy/runs/imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223/train_vla.log",
"run_dir": "/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy/runs/imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223",
"pred_horizon": 32,
"num_action_steps": 32,
"pid": 1437836,
"launch_log": "experiment_suites/2026-04-04-imf-horizon-grid/launch_logs/imf-p1-ph32-ex32-emb384-l12-ms50k-5090-20260404-131223.launch.log",
"latest_step": 49900,
"latest_log_sync": "2026-04-05 00:07:39",
"swanlab_url": "https://swanlab.cn/@game-loader/roboimi-vla/runs/ajs2m218jd260hawhy5ns",
"process_running": false,
"latest_rollout_avg_reward": 513.2,
"best_rollout_avg_reward": 513.2,
"best_step": 43749,
"final_loss": 0.003953303210437298
}
},
"monitor": {
"state": "running",
"pid_local": 1443268,
"log_path": "experiment_suites/2026-04-04-imf-horizon-grid/monitor_logs/status-sync-20260404-131223.log",
"interval_seconds": 300
},
"debug": {
"remote_rollout_failure_20260404": {
"root_cause": "eval_vla.py imported raw_action_trajectory_viewer at module import time, which imported mujoco before MUJOCO_GL=egl was set; remote headless rollout then fell back to GLFW/X11 and crashed with mujoco.FatalError: gladLoadGL error during env.reset()->mj.Renderer(...)",
"fixed_file": "roboimi/demos/vla_scripts/eval_vla.py",
"verification": {
"pytest": "tests/test_eval_vla_headless_import.py passed",
"remote_eval_5880": "1 episode x 20 steps headless eval passed",
"remote_eval_l20": "1 episode x 20 steps headless eval passed"
}
}
},
"phase1_summary_md": "/home/droid/project/roboimi/.worktrees/feat-imf-attnres-policy/experiment_suites/2026-04-04-imf-horizon-grid/phase1_summary.md"
}