From 66009473ad654dac66c830716d9a751dce7d33d8 Mon Sep 17 00:00:00 2001 From: gouhanke <12219217+gouhanke@user.noreply.gitee.com> Date: Fri, 6 Feb 2026 09:00:44 +0800 Subject: [PATCH] =?UTF-8?q?debug(inference):=20=E6=B7=BB=E5=8A=A0=E6=8E=A8?= =?UTF-8?q?=E7=90=86=E9=98=B6=E6=AE=B5qpos=E5=BD=92=E4=B8=80=E5=8C=96?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- roboimi/demos/eval_vla.py | 532 +++++++++++++++++++++++++ roboimi/demos/vla_scripts/eval_vla.py | 100 ----- roboimi/demos/vla_scripts/train_vla.py | 42 ++ roboimi/vla/VLA_EVALUATION_GUIDE.md | 239 +++++++++++ roboimi/vla/agent.py | 59 ++- roboimi/vla/conf/config.yaml | 6 +- roboimi/vla/data/dataset.py | 2 +- 7 files changed, 859 insertions(+), 121 deletions(-) create mode 100644 roboimi/demos/eval_vla.py delete mode 100644 roboimi/demos/vla_scripts/eval_vla.py create mode 100644 roboimi/vla/VLA_EVALUATION_GUIDE.md diff --git a/roboimi/demos/eval_vla.py b/roboimi/demos/eval_vla.py new file mode 100644 index 0000000..9d14756 --- /dev/null +++ b/roboimi/demos/eval_vla.py @@ -0,0 +1,532 @@ +""" +VLA Policy Evaluation Script + +This script evaluates a trained Vision-Language-Action (VLA) policy +in the MuJoCo simulation environment. 
+ +Usage: + python roboimi/demos/eval_vla.py --ckpt_path checkpoints/vla_model_best.pt --num_episodes 3 +""" + +import torch +import numpy as np +import argparse +from pathlib import Path +from typing import Dict, List +from tqdm import tqdm + +from roboimi.envs.double_pos_ctrl_env import make_sim_env +from roboimi.utils.act_ex_utils import sample_transfer_pose +from einops import rearrange + + +class VLAEvaluator: + """ + VLA Policy Evaluator for MuJoCo Simulation + """ + + def __init__( + self, + agent: torch.nn.Module, + device: str = 'cuda', + camera_names: List[str] = ['r_vis', 'top'], + num_queries: int = 1, + obs_horizon: int = 2, + pred_horizon: int = 16, + use_smoothing: bool = False, + smooth_method: str = 'ema', + smooth_alpha: float = 0.3 + ): + """ + Args: + agent: Trained VLAAgent + device: Device for inference + camera_names: List of camera names to use + num_queries: How often to query the policy (in timesteps) + obs_horizon: Number of observations to use as context + pred_horizon: Number of future actions to predict + use_smoothing: Whether to apply action smoothing + smooth_method: Smoothing method ('ema', 'moving_avg', 'lowpass') + smooth_alpha: Smoothing coefficient + """ + self.agent = agent.to(device) + self.device = device + self.camera_names = camera_names + self.num_queries = num_queries + self.obs_horizon = obs_horizon + self.pred_horizon = pred_horizon + + # Action smoothing + self.use_smoothing = use_smoothing + self.smooth_method = smooth_method + self.smooth_alpha = smooth_alpha + self.smoother = ActionSmoother( + action_dim=16, # Assuming 16-dim actions + method=smooth_method, + alpha=smooth_alpha + ) if use_smoothing else None + + # Observation buffer for obs_horizon + self.obs_buffer = { + 'images': {cam: [] for cam in camera_names}, + 'qpos': [] + } + self.cached_actions = None + self.query_step = 0 + + def reset(self): + """Reset evaluator state""" + self.obs_buffer = { + 'images': {cam: [] for cam in self.camera_names}, + 'qpos': 
[] + } + self.cached_actions = None + self.query_step = 0 + if self.smoother is not None: + self.smoother.reset() + + def _get_image_dict(self, obs: Dict) -> Dict[str, torch.Tensor]: + """ + Extract and preprocess images from observation + + Args: + obs: Environment observation dict + + Returns: + Dict mapping camera names to image tensors (B, obs_horizon, C, H, W) + """ + images = {} + for cam_name in self.camera_names: + # Extract image: (H, W, C) -> (C, H, W) + img = obs['images'][cam_name] + img = rearrange(img, 'h w c -> c h w') + img = torch.from_numpy(img / 255.0).float() + images[cam_name] = img # (C, H, W) + + # Stack to create batch dimension + image_dict = {} + for cam_name in self.camera_names: + # Collect obs_horizon frames + cam_images = self.obs_buffer['images'][cam_name] + cam_images.append(images[cam_name]) + + # Pad to obs_horizon if needed (duplicate first frame) + while len(cam_images) < self.obs_horizon: + cam_images.insert(0, cam_images[0]) + + # Keep only obs_horizon frames + if len(cam_images) > self.obs_horizon: + cam_images = cam_images[-self.obs_horizon:] + + # Stack: (obs_horizon, C, H, W) -> (1, obs_horizon, C, H, W) + img_tensor = torch.stack(cam_images, dim=0).unsqueeze(0) + image_dict[cam_name] = img_tensor + + # Update buffer (without padding) + self.obs_buffer['images'][cam_name] = cam_images[-self.obs_horizon:] + + return image_dict + + def _get_qpos_dict(self, obs: Dict) -> torch.Tensor: + """ + Extract and preprocess qpos from observation + + Args: + obs: Environment observation dict + + Returns: + qpos tensor: (1, obs_horizon, obs_dim) + """ + qpos = obs['qpos'] + qpos = torch.from_numpy(qpos).float() + + # Add to buffer + self.obs_buffer['qpos'].append(qpos) + + # Pad to obs_horizon if needed (duplicate first frame) + while len(self.obs_buffer['qpos']) < self.obs_horizon: + self.obs_buffer['qpos'].insert(0, self.obs_buffer['qpos'][0]) + + # Keep only obs_horizon frames + if len(self.obs_buffer['qpos']) > self.obs_horizon: + 
self.obs_buffer['qpos'] = self.obs_buffer['qpos'][-self.obs_horizon:] + + # Stack: (obs_horizon, obs_dim) -> (1, obs_horizon, obs_dim) + qpos_tensor = torch.stack(self.obs_buffer['qpos'], dim=0).unsqueeze(0) + + return qpos_tensor + + @torch.no_grad() + def predict_action(self, obs: Dict) -> np.ndarray: + """ + Predict action using VLA policy + + Args: + obs: Current environment observation + + Returns: + action: numpy array of shape (action_dim,) + """ + # 1. Prepare observations + images = self._get_image_dict(obs) # Dict[str, (1, obs_horizon, C, H, W)] + qpos = self._get_qpos_dict(obs) # (1, obs_horizon, obs_dim) + + # 2. Check if we need to query the policy + if self.cached_actions is None or self.query_step % self.num_queries == 0: + # Prepare input for VLA agent + # VLAAgent.predict_action expects: + # - images: Dict[str, Tensor] with shape (B, obs_horizon, C, H, W) + # - proprioception: Tensor with shape (B, obs_horizon, obs_dim) + + # Move to device + images = {k: v.to(self.device) for k, v in images.items()} + qpos = qpos.to(self.device) + + # Predict actions using VLA agent + # Returns: (B, pred_horizon, action_dim) + predicted_actions = self.agent.predict_action( + images=images, + proprioception=qpos + ) + + # Cache predicted actions (CPU numpy array) + self.cached_actions = predicted_actions.squeeze(0).cpu().numpy() # (pred_horizon, action_dim) + self.query_step = 0 + + # 3. Get action from cache + raw_action = self.cached_actions[self.query_step] + self.query_step += 1 + + # 4. 
Apply smoothing if enabled + if self.smoother is not None: + raw_action = self.smoother.smooth(raw_action) + + return raw_action + + +class ActionSmoother: + """Action smoothing for smoother execution""" + + def __init__(self, action_dim: int, method: str = 'ema', alpha: float = 0.3): + self.action_dim = action_dim + self.method = method + self.alpha = alpha + self.prev_action = None + + def smooth(self, action: np.ndarray) -> np.ndarray: + if self.method == 'ema': + if self.prev_action is None: + smoothed = action + else: + smoothed = self.alpha * action + (1 - self.alpha) * self.prev_action + self.prev_action = smoothed + return smoothed + else: + return action + + def reset(self): + self.prev_action = None + + +def load_checkpoint( + ckpt_path: str, + device: str = 'cuda' +) -> torch.nn.Module: + """ + Load trained VLA model from checkpoint + + Args: + ckpt_path: Path to checkpoint file (.pt) + device: Device to load model on + + Returns: + Loaded VLAAgent model + """ + from roboimi.vla.agent import VLAAgent + from hydra import initialize_config_dir, compose + from pathlib import Path as PathLib + + ckpt_path = PathLib(ckpt_path).absolute() + if not ckpt_path.exists(): + raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}") + + # Load checkpoint + print(f"Loading checkpoint from {ckpt_path}") + checkpoint = torch.load(ckpt_path, map_location=device, weights_only=False) + + print(f"Checkpoint keys: {checkpoint.keys()}") + + # Find VLA config directory + import os + + # Get script directory + script_dir = PathLib(__file__).resolve().parent + current_dir = PathLib(os.getcwd()).absolute() + + # Try to find vla/conf directory + config_dir = None + + # Option 1: If running from roboimi directory + if (current_dir / 'vla' / 'conf').exists(): + config_dir = current_dir / 'vla' / 'conf' + # Option 2: If running from project root + elif (current_dir / 'roboimi' / 'vla' / 'conf').exists(): + config_dir = current_dir / 'roboimi' / 'vla' / 'conf' + # Option 3: 
Relative to script location + elif (script_dir / '../vla' / 'conf').exists(): + config_dir = (script_dir / '../vla' / 'conf').resolve() + # Option 4: Search upwards + else: + search_start = current_dir + while search_start != search_start.parent: + if (search_start / 'vla' / 'conf').exists(): + config_dir = search_start / 'vla' / 'conf' + break + search_start = search_start.parent + + if config_dir is None: + raise FileNotFoundError( + f"Could not find VLA config directory.\n" + f"Current directory: {current_dir}\n" + f"Script location: {script_dir}\n" + f"Please ensure you're running from the roboimi directory." + ) + + config_abs_path = str(config_dir.absolute()) + print(f"Loading config from {config_abs_path}") + + if not PathLib(config_abs_path).exists(): + raise FileNotFoundError(f"Config directory does not exist: {config_abs_path}") + print(f"Loading config from {config_abs_path}") + + # Initialize Hydra with absolute path + with initialize_config_dir(config_dir=config_abs_path, version_base=None): + cfg = compose(config_name="config") + + # Instantiate agent from config + print("Instantiating agent from config...") + from hydra.utils import instantiate + agent = instantiate(cfg.agent) + + # Load model state + if 'model_state_dict' in checkpoint: + agent.load_state_dict(checkpoint['model_state_dict']) + print(f"✅ Model state loaded (step: {checkpoint.get('step', 'unknown')})") + elif 'state_dict' in checkpoint: + agent.load_state_dict(checkpoint['state_dict']) + print("✅ Model state loaded") + else: + # Assume checkpoint is the state_dict itself + agent.load_state_dict(checkpoint) + print("✅ Model state loaded") + + # Load dataset statistics for denormalization + import json + stats_path = ckpt_path.parent / 'dataset_stats.json' + if stats_path.exists(): + with open(stats_path, 'r') as f: + stats = json.load(f) + # Convert lists to numpy arrays + agent.action_mean = np.array(stats['action_mean']) + agent.action_std = np.array(stats['action_std']) + 
agent.qpos_mean = np.array(stats['qpos_mean']) + agent.qpos_std = np.array(stats['qpos_std']) + print(f"✅ Dataset statistics loaded for denormalization") + else: + print(f"⚠️ Warning: {stats_path} not found. Actions will not be denormalized!") + agent.action_mean = None + agent.action_std = None + + agent.eval() + agent.to(device) + + print(f"✅ Model loaded successfully on {device}") + + return agent + + +def evaluate_policy( + agent: torch.nn.Module, + num_episodes: int = 3, + max_timesteps: int = 700, + task_name: str = 'sim_transfer', + device: str = 'cuda', + camera_names: List[str] = ['r_vis', 'top'], + num_queries: int = 1, + obs_horizon: int = 2, + save_video: bool = True +): + """ + Evaluate VLA policy in simulation + + Args: + agent: Trained VLAAgent + num_episodes: Number of episodes to run + max_timesteps: Maximum timesteps per episode + task_name: Task name for environment creation + device: Device for inference + camera_names: List of camera names + num_queries: Policy query frequency + obs_horizon: Observation horizon + save_video: Whether to save video + """ + # Create evaluator + evaluator = VLAEvaluator( + agent=agent, + device=device, + camera_names=camera_names, + num_queries=num_queries, + obs_horizon=obs_horizon, + use_smoothing=False, + smooth_method='ema', + smooth_alpha=0.3 + ) + + # Create environment + env = make_sim_env(task_name) + + # Run episodes + for episode_idx in range(num_episodes): + print(f"\n{'='*60}") + print(f"Episode {episode_idx + 1}/{num_episodes}") + print(f"{'='*60}\n") + + # Reset environment and evaluator + box_pos = sample_transfer_pose() + env.reset(box_pos) + evaluator.reset() + + # Storage for visualization + episode_images = [] + success = False + success_timestep = 0 + + with torch.inference_mode(): + for t in tqdm(range(max_timesteps), desc=f"Episode {episode_idx + 1}"): + # Get observation + obs = env._get_image_obs() + qpos_obs = env._get_qpos_obs() + + # Merge observations + obs['qpos'] = qpos_obs['qpos'] + + 
# Predict action + action = evaluator.predict_action(obs) + + # Execute action + env.step_jnt(action) + + # Save images for video + if save_video: + episode_images.append(obs['images']) + + # Render + env.render() + + # Check if episode is done + if env.rew == 1.0: # Success condition + success = True + success_timestep = t + print(f"\n✅ Task completed at timestep {t}!") + break + + # Episode summary + print(f"\nEpisode {episode_idx + 1} Summary:") + print(f" Success: {success}") + if success: + print(f" Success Timestep: {success_timestep}") + print(f" Length: {len(episode_images)} timesteps") + + # Save video + if save_video and episode_images: + save_video_episode( + episode_images, + save_path=f"outputs/eval_vla_episode_{episode_idx}.mp4" + ) + print(f" Video saved: outputs/eval_vla_episode_{episode_idx}.mp4") + + print(f"\n{'='*60}") + print("Evaluation complete!") + print(f"{'='*60}\n") + + +def save_video_episode(images: List[Dict], save_path: str, fps: int = 20): + """ + Save episode as video + + Args: + images: List of observation dicts containing images + save_path: Path to save video + fps: Frames per second + """ + try: + import cv2 + from tqdm import tqdm + + Path(save_path).parent.mkdir(parents=True, exist_ok=True) + + # Use first camera (e.g., 'r_vis') for visualization + cam_name = list(images[0].keys())[0] + + # Get image size + H, W, C = images[0][cam_name].shape + + # Create video writer + fourcc = cv2.VideoWriter_fourcc(*'mp4v') + video_writer = cv2.VideoWriter(save_path, fourcc, fps, (W, H)) + + # Write frames + for img_dict in tqdm(images, desc="Saving video"): + frame = img_dict[cam_name] + # Convert RGB to BGR for OpenCV + frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR) + video_writer.write(frame_bgr) + + video_writer.release() + print(f"Video saved to {save_path}") + + except ImportError: + print("Warning: opencv-python not installed, skipping video save") + print("Install with: pip install opencv-python") + + +def main(): + parser = 
argparse.ArgumentParser(description='Evaluate VLA Policy') + parser.add_argument('--ckpt_path', type=str, required=True, + help='Path to model checkpoint') + parser.add_argument('--num_episodes', type=int, default=3, + help='Number of evaluation episodes') + parser.add_argument('--max_timesteps', type=int, default=700, + help='Maximum timesteps per episode') + parser.add_argument('--device', type=str, default='cuda', + help='Device for inference') + parser.add_argument('--camera_names', nargs='+', default=['r_vis', 'top'], + help='Camera names to use') + parser.add_argument('--num_queries', type=int, default=16, + help='Policy query frequency (timesteps)') + parser.add_argument('--obs_horizon', type=int, default=2, + help='Observation horizon') + parser.add_argument('--no_video', action='store_true', + help='Do not save episode videos') + + args = parser.parse_args() + + # Load model + print(f"Loading model from {args.ckpt_path}...") + agent = load_checkpoint(args.ckpt_path, device=args.device) + + # Evaluate + evaluate_policy( + agent=agent, + num_episodes=args.num_episodes, + max_timesteps=args.max_timesteps, + device=args.device, + camera_names=args.camera_names, + num_queries=args.num_queries, + obs_horizon=args.obs_horizon, + save_video=not args.no_video + ) + + +if __name__ == '__main__': + main() diff --git a/roboimi/demos/vla_scripts/eval_vla.py b/roboimi/demos/vla_scripts/eval_vla.py deleted file mode 100644 index 848ded6..0000000 --- a/roboimi/demos/vla_scripts/eval_vla.py +++ /dev/null @@ -1,100 +0,0 @@ -import sys -import os -import hydra -import torch -import matplotlib.pyplot as plt -import numpy as np -from omegaconf import DictConfig, OmegaConf -from hydra.utils import instantiate -from torch.utils.data import DataLoader - -# 确保能导入 roboimi -sys.path.append(os.getcwd()) -from roboimi.vla.agent import VLAAgent - -def recursive_to_device(data, device): - if isinstance(data, torch.Tensor): - return data.to(device) - elif isinstance(data, dict): - return 
{k: recursive_to_device(v, device) for k, v in data.items()} - return data - -@hydra.main(version_base=None, config_path="../../../roboimi/vla/conf", config_name="config") -def main(cfg: DictConfig): - print(">>> 🤖 Starting VLA Inference...") - device = cfg.train.device - - # 1. 实例化 Agent (结构必须与训练时完全一致) - # 也可以在这里覆盖配置,例如 forcing freeze=True - agent: VLAAgent = instantiate(cfg.agent) - agent.to(device) - agent.eval() # 关键:切换到 Eval 模式 - - # 2. 加载权重 - ckpt_path = "checkpoints/vla_model_final.pt" - if not os.path.exists(ckpt_path): - print(f"❌ Checkpoint not found at {ckpt_path}. Run training first!") - return - - print(f"Loading weights from {ckpt_path}...") - # map_location='cpu' 防止在只有 CPU 的机器上加载 GPU 权重报错 - state_dict = torch.load(ckpt_path, map_location=device) - agent.load_state_dict(state_dict) - print("✅ Weights loaded successfully.") - - # 3. 准备测试数据 (从 Dataset 里取一个样本) - dataset = instantiate(cfg.data) - dataloader = DataLoader(dataset, batch_size=1, shuffle=True) - sample = next(iter(dataloader)) - - # 准备输入 (模拟机器人实时运行) - # 注意:推理时不需要传 sample['actions'] - primary_cam_key = cfg.data.obs_keys[0] - input_img = sample['obs'][primary_cam_key][:, -1, :, :, :] # (1, C, H, W) - - agent_input = { - "obs": { - "image": input_img.to(device), - "text": sample["language"] # 即使不用文本,占位符也要留着 - } - # ⚠️ 关键:这里不传 'actions',触发 Agent 进入 Inference 分支 - } - - # 4. 执行推理 (Reverse Diffusion) - print("running reverse diffusion (this may take a moment)...") - with torch.no_grad(): - # 这会触发 DiffusionHead 的分支 B (loop over timesteps) - outputs = agent(agent_input) - - # 5. 获取结果 - # 输出 shape: (1, Chunk_Size, Action_Dim) - pred_actions = outputs['pred_actions'].cpu().numpy()[0] - gt_actions = sample['actions'][0].numpy() # 用来对比 - - print(f"✅ Generated Action Chunk Shape: {pred_actions.shape}") - - # 6. 
可视化对比 (保存图片) - plot_results(pred_actions, gt_actions) - -def plot_results(pred, gt): - """ - 简单的可视化:画出前几个维度的轨迹对比 - """ - plt.figure(figsize=(10, 5)) - - # 比如只画前 3 个维度 (x, y, z) - dims_to_plot = 3 - for i in range(dims_to_plot): - plt.subplot(1, dims_to_plot, i+1) - plt.plot(gt[:, i], 'g--', label='Ground Truth') - plt.plot(pred[:, i], 'b-', label='Diffusion Pred') - plt.title(f"Action Dim {i}") - if i == 0: plt.legend() - plt.ylim(-1, 1) # 假设动作是归一化的 - - plt.tight_layout() - plt.savefig("inference_result.png") - print("📊 Result plot saved to 'inference_result.png'") - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/roboimi/demos/vla_scripts/train_vla.py b/roboimi/demos/vla_scripts/train_vla.py index c4376f8..169a1b8 100644 --- a/roboimi/demos/vla_scripts/train_vla.py +++ b/roboimi/demos/vla_scripts/train_vla.py @@ -1,6 +1,8 @@ import sys import os import logging +import json +import pickle import hydra import torch from tqdm import tqdm @@ -103,6 +105,46 @@ def main(cfg: DictConfig): log.error(f"❌ Failed to initialize agent: {e}") raise + # ========================================================================= + # 2.5. 
Save Dataset Statistics as JSON + # ========================================================================= + log.info("💾 Saving dataset statistics...") + try: + # Get dataset_dir from config + dataset_dir = cfg.data.get('dataset_dir', 'roboimi/demos/dataset/sim_transfer') + stats_path = Path(dataset_dir) / 'data_stats.pkl' + + if stats_path.exists(): + # Load pickle file + with open(stats_path, 'rb') as f: + stats = pickle.load(f) + + # Extract action statistics + action_mean = stats['action']['mean'].tolist() if 'action' in stats else [] + action_std = stats['action']['std'].tolist() if 'action' in stats else [] + qpos_mean = stats['qpos']['mean'].tolist() if 'qpos' in stats else [] + qpos_std = stats['qpos']['std'].tolist() if 'qpos' in stats else [] + + # Save as JSON + json_stats = { + 'action_mean': action_mean, + 'action_std': action_std, + 'qpos_mean': qpos_mean, + 'qpos_std': qpos_std + } + json_path = checkpoint_dir / 'dataset_stats.json' + with open(json_path, 'w') as f: + json.dump(json_stats, f, indent=2) + + log.info(f"✅ Dataset statistics saved to {json_path}") + else: + log.warning(f"⚠️ Statistics file not found: {stats_path}") + log.warning("⚠️ Actions will not be denormalized during inference!") + + except Exception as e: + log.warning(f"⚠️ Failed to save statistics as JSON: {e}") + log.warning("⚠️ Training will continue, but inference may not work correctly") + # ========================================================================= # 3. Setup Optimizer # ========================================================================= diff --git a/roboimi/vla/VLA_EVALUATION_GUIDE.md b/roboimi/vla/VLA_EVALUATION_GUIDE.md new file mode 100644 index 0000000..655a6a3 --- /dev/null +++ b/roboimi/vla/VLA_EVALUATION_GUIDE.md @@ -0,0 +1,239 @@ +# VLA Evaluation Guide + +This guide explains how to evaluate a trained Vision-Language-Action (VLA) policy in the MuJoCo simulation environment. + +## Prerequisites + +1. 
**Trained Model**: Train your VLA model first using `train_vla.py` +2. **Checkpoints**: Ensure you have saved model checkpoints in `checkpoints/` directory +3. **Dependencies**: Install required dependencies: + ```bash + pip install opencv-python tqdm + ``` + +## Quick Start + +### Basic Evaluation + +```bash +# Evaluate with default settings (3 episodes) +python roboimi/demos/eval_vla.py \ + --ckpt_path checkpoints/vla_model_best.pt + +# Evaluate with custom settings +python roboimi/demos/eval_vla.py \ + --ckpt_path checkpoints/vla_model_step_5000.pt \ + --num_episodes 5 \ + --max_timesteps 700 \ + --camera_names r_vis top angle \ + --num_queries 1 \ + --obs_horizon 2 +``` + +### Parameters + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `--ckpt_path` | Path to model checkpoint (.pt file) | Required | +| `--num_episodes` | Number of evaluation episodes | 3 | +| `--max_timesteps` | Maximum timesteps per episode | 700 | +| `--device` | Device for inference (`cuda` or `cpu`) | `cuda` | +| `--camera_names` | Camera names to use (space-separated) | `r_vis top` | +| `--num_queries` | Policy query frequency (every N timesteps) | 16 | +| `--obs_horizon` | Observation history length | 2 | +| `--no_video` | Disable video saving | False | + +## Usage Details + +### Policy Query Frequency + +The `--num_queries` parameter controls how often the policy is queried (the command-line default is 16): + +- `--num_queries 1`: Query every timestep (most accurate, slowest) +- `--num_queries 4`: Query every 4 timesteps (faster, but uses cached actions) + +When using cached actions (num_queries > 1), the policy predicts a chunk of actions (pred_horizon=16), and these actions are executed sequentially until the next query, so `num_queries` must not exceed `pred_horizon`. 
+ +### Camera Selection + +Available cameras depend on your environment: +- `r_vis`: Right arm RealSense camera +- `top`: Top-down view camera +- `angle`: Angled view camera + +Use `--camera_names` to specify which cameras to use: +```bash +--camera_names r_vis top # Use 2 cameras +--camera_names r_vis top angle # Use all 3 cameras +``` + +### Observation Horizon + +The `--obs_horizon` parameter determines how many past observations to use as context: + +```bash +--obs_horizon 1 # Use only current observation +--obs_horizon 2 # Use current + 1 past observation (default) +--obs_horizon 4 # Use current + 3 past observations +``` + +**Note**: Must match the value used during training. + +## Output + +### Console Output + +During evaluation, you'll see: + +``` +============================================================ +Episode 1/3 +============================================================ + +Episode 1: 100%|████████████████████| 700/700 [02:30<00:00, 4.64it/s] + +✅ Task completed at timestep 453! + +Episode 1 Summary: + Success: True + Success Timestep: 453 + Length: 453 timesteps + Video saved: outputs/eval_vla_episode_0.mp4 +``` + +### Video Output + +Videos are saved to `outputs/eval_vla_episode_{N}.mp4` showing the robot's execution. 
+ +### Metrics + +- **Success**: Whether the environment reward reached 1.0 during the episode +- **Success Timestep**: Timestep at which the task was completed (printed only on success) +- **Length**: Number of timesteps executed (counted from the saved frames, so it is 0 with `--no_video`) + +## Action Smoothing + +The evaluator supports EMA (Exponential Moving Average) smoothing to reduce jitter; it is disabled by default (`use_smoothing=False`): + +```python +# Default smoothing parameters +smooth_method = 'ema' +smooth_alpha = 0.3 # Lower = more smoothing +``` + +To enable or tune smoothing, edit the `VLAEvaluator(...)` construction inside `evaluate_policy()` in `eval_vla.py`: + +```python +evaluator = VLAEvaluator( + agent=agent, + use_smoothing=True, # Enable smoothing (default: False) + # 'ema' is the only implemented method; any other value leaves actions unchanged + smooth_method='ema', # Smoothing method + smooth_alpha=0.5 # Adjust smoothing strength +) +``` + +## Troubleshooting + +### Issue: Checkpoint not found + +``` +FileNotFoundError: Checkpoint not found: checkpoints/vla_model_best.pt +``` + +**Solution**: Ensure you've trained the model and checkpoints exist: +```bash +ls -la checkpoints/ +# Should show: vla_model_best.pt, vla_model_final.pt, etc. +``` + +### Issue: CUDA out of memory + +**Solution**: Use CPU for inference: +```bash +python roboimi/demos/eval_vla.py --ckpt_path checkpoints/vla_model_best.pt --device cpu +``` + +### Issue: Camera names don't match + +**Solution**: Check your HDF5 files for available cameras: +```python +import h5py +with h5py.File('roboimi/demos/dataset/sim_transfer/episode_0.hdf5', 'r') as f: + print(list(f['observations/images'].keys())) + # Output: ['angle', 'r_vis', 'top'] +``` + +Then use the correct camera names in your eval command. + +### Issue: Mismatched obs_horizon + +``` +RuntimeError: Tensor shape mismatch +``` + +**Solution**: Ensure `--obs_horizon` matches the training config (`data.obs_horizon`). 
+ +## Advanced Usage + +### Custom Evaluation Script + +You can also use the evaluator in your own scripts: + +```python +from roboimi.demos.eval_vla import VLAEvaluator, load_checkpoint +from roboimi.envs.double_pos_ctrl_env import make_sim_env + +# Load model +agent = load_checkpoint('checkpoints/vla_model_best.pt') + +# Create evaluator +evaluator = VLAEvaluator( + agent=agent, + device='cuda', + camera_names=['r_vis', 'top'], + num_queries=1, + obs_horizon=2 +) + +# Create environment +env = make_sim_env('sim_transfer') +env.reset() +evaluator.reset() + +# Run episode +obs = env._get_image_obs() +obs['qpos'] = env._get_qpos_obs()['qpos'] + +# Predict and execute action +action = evaluator.predict_action(obs) +env.step_jnt(action) +``` + +### Batch Evaluation + +Evaluate multiple checkpoints: + +```bash +for ckpt in checkpoints/vla_model_step_*.pt; do + echo "Evaluating $ckpt" + python roboimi/demos/eval_vla.py \ + --ckpt_path "$ckpt" \ + --num_episodes 1 \ + --no_video +done +``` + +## Next Steps + +1. **Train your model**: See [RESNET_TRAINING_GUIDE.md](roboimi/vla/RESNET_TRAINING_GUIDE.md) +2. **Evaluate performance**: Use this evaluation script +3. **Analyze results**: Compare different checkpoints +4. 
**Deploy to real robot**: Adapt the evaluator for real robot control + +## References + +- Training Guide: [roboimi/vla/RESNET_TRAINING_GUIDE.md](roboimi/vla/RESNET_TRAINING_GUIDE.md) +- Project Documentation: [CLAUDE.md](CLAUDE.md) +- Original ACT Paper: https://arxiv.org/abs/2304.13705 +- Diffusion Policy: https://diffusion-policy.cs.columbia.edu/ diff --git a/roboimi/vla/agent.py b/roboimi/vla/agent.py index 5684e82..2e6a2ee 100644 --- a/roboimi/vla/agent.py +++ b/roboimi/vla/agent.py @@ -1,8 +1,10 @@ import torch import torch.nn as nn +import numpy as np from typing import Dict, Optional, Any from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead from diffusers.schedulers.scheduling_ddpm import DDPMScheduler +from diffusers.schedulers.scheduling_ddim import DDIMScheduler from roboimi.vla.models.heads.diffusion import ConditionalUnet1D class VLAAgent(nn.Module): @@ -18,6 +20,13 @@ class VLAAgent(nn.Module): num_cams=2, # 视觉输入的摄像头数量 ): super().__init__() + # Store parameters + self.action_dim = action_dim + self.obs_dim = obs_dim + self.pred_horizon = pred_horizon + self.obs_horizon = obs_horizon + self.num_cams = num_cams + self.vision_encoder = vision_backbone single_img_feat_dim = self.vision_encoder.output_dim total_vision_dim = single_img_feat_dim * num_cams * obs_horizon @@ -30,7 +39,15 @@ class VLAAgent(nn.Module): clip_sample=True, prediction_type='epsilon' # 预测噪声 ) - + + # DDIM scheduler for faster inference + self.infer_scheduler = DDIMScheduler( + num_train_timesteps=diffusion_steps, + beta_schedule='squaredcos_cap_v2', + clip_sample=True, + prediction_type='epsilon' + ) + self.noise_pred_net = ConditionalUnet1D( input_dim=action_dim, global_cond_dim=self.global_cond_dim @@ -70,17 +87,11 @@ class VLAAgent(nn.Module): ) # 6. 
网络预测噪声 - # 注意:U-Net 1D 通常期望 channel 在中间: (B, C, T) - # noisy_actions_inp = noisy_actions.permute(0, 2, 1) - pred_noise = self.noise_pred_net( sample=noisy_actions, timestep=timesteps, global_cond=global_cond ) - - # 还原维度 (B, T, C) - pred_noise = pred_noise.permute(0, 2, 1) # 7. 计算 Loss (MSE) loss = nn.functional.mse_loss(pred_noise, noise) @@ -92,24 +103,31 @@ class VLAAgent(nn.Module): @torch.no_grad() def predict_action(self, images, proprioception): B = 1 # 假设单次推理 - + # 1. 提取当前观测特征 (只做一次) visual_features = self.vision_encoder(images).view(B, -1) proprioception = proprioception.view(B, -1) + if hasattr(self, 'qpos_mean') and hasattr(self, 'qpos_std') and self.qpos_mean is not None: + # Convert to tensor for normalization + qpos_mean = torch.from_numpy(self.qpos_mean).float().to(proprioception.device) + qpos_std = torch.from_numpy(self.qpos_std).float().to(proprioception.device) + qpos_mean = qpos_mean.repeat(2) + qpos_std = qpos_std.repeat(2) + # Normalize: (qpos - mean) / std + proprioception = (proprioception - qpos_mean.unsqueeze(0)) / qpos_std.unsqueeze(0) global_cond = torch.cat([visual_features, proprioception], dim=-1) # 2. 初始化纯高斯噪声动作 - # Shape: (B, Horizon, Action_Dim) + # Shape: (B, pred_horizon, action_dim) current_actions = torch.randn( - (B, 16, 7), device=global_cond.device + (B, self.pred_horizon, self.action_dim), device=global_cond.device ) # 3. 
逐步去噪循环 (Reverse Diffusion) - self.noise_scheduler.set_timesteps(10) # 推理时可以用更少步加速 (如 DDIM) + self.infer_scheduler.set_timesteps(10) # DDIM 推理步数 - for t in self.noise_scheduler.timesteps: - # 调整输入格式适应 1D CNN - model_input = current_actions.permute(0, 2, 1) + for t in self.infer_scheduler.timesteps: + model_input = current_actions # 预测噪声 noise_pred = self.noise_pred_net( @@ -117,12 +135,19 @@ class VLAAgent(nn.Module): timestep=t, global_cond=global_cond ) - # noise_pred = noise_pred.permute(0, 2, 1) # 移除噪声,更新 current_actions - current_actions = self.noise_scheduler.step( + current_actions = self.infer_scheduler.step( noise_pred, t, current_actions ).prev_sample - # 4. 输出最终动作序列 + # 4. 反归一化动作 (Denormalize actions) + if hasattr(self, 'action_mean') and hasattr(self, 'action_std') and self.action_mean is not None: + # Convert to numpy for denormalization + action_mean = torch.from_numpy(self.action_mean).float().to(current_actions.device) + action_std = torch.from_numpy(self.action_std).float().to(current_actions.device) + # Denormalize: action * std + mean + current_actions = current_actions * action_std.unsqueeze(0).unsqueeze(0) + action_mean.unsqueeze(0).unsqueeze(0) + + # 5. 
输出最终动作序列 return current_actions # 返回去噪后的干净动作 \ No newline at end of file diff --git a/roboimi/vla/conf/config.yaml b/roboimi/vla/conf/config.yaml index 8b57ad4..dca3f26 100644 --- a/roboimi/vla/conf/config.yaml +++ b/roboimi/vla/conf/config.yaml @@ -4,10 +4,10 @@ defaults: - data: resnet_dataset train: - batch_size: 8 # Batch size for training + batch_size: 32 # Batch size for training lr: 1e-4 # Learning rate - max_steps: 10000 # Maximum training steps + max_steps: 20000 # Maximum training steps log_freq: 100 # Log frequency (steps) - save_freq: 1000 # Save checkpoint frequency (steps) + save_freq: 2000 # Save checkpoint frequency (steps) device: "cuda" # Device: "cuda" or "cpu" num_workers: 8 # DataLoader workers (set to 0 for debugging, 8 for production) \ No newline at end of file diff --git a/roboimi/vla/data/dataset.py b/roboimi/vla/data/dataset.py index 6e9b490..5c3ba8c 100644 --- a/roboimi/vla/data/dataset.py +++ b/roboimi/vla/data/dataset.py @@ -11,7 +11,7 @@ class RobotDiffusionDataset(Dataset): def __init__(self, dataset_dir, pred_horizon=16, - obs_horizon=1, + obs_horizon=2, action_horizon=8, camera_names=['r_vis', 'top'], normalization_type='gaussian'):