debug(inference): 添加推理阶段qpos归一化

This commit is contained in:
gouhanke
2026-02-06 09:00:44 +08:00
parent b0a944f7aa
commit 66009473ad
7 changed files with 859 additions and 121 deletions

532
roboimi/demos/eval_vla.py Normal file
View File

@@ -0,0 +1,532 @@
"""
VLA Policy Evaluation Script
This script evaluates a trained Vision-Language-Action (VLA) policy
in the MuJoCo simulation environment.
Usage:
python roboimi/demos/eval_vla.py --ckpt_path checkpoints/vla_model_best.pt --num_episodes 3
"""
import torch
import numpy as np
import argparse
from pathlib import Path
from typing import Dict, List
from tqdm import tqdm
from roboimi.envs.double_pos_ctrl_env import make_sim_env
from roboimi.utils.act_ex_utils import sample_transfer_pose
from einops import rearrange
class VLAEvaluator:
    """
    VLA policy evaluator for the MuJoCo simulation.

    Maintains rolling observation buffers of length ``obs_horizon`` for
    camera images and qpos, queries the policy every ``num_queries``
    timesteps, and replays the cached action chunk in between queries.
    """

    def __init__(
        self,
        agent: torch.nn.Module,
        device: str = 'cuda',
        camera_names: List[str] = None,
        num_queries: int = 1,
        obs_horizon: int = 2,
        pred_horizon: int = 16,
        use_smoothing: bool = False,
        smooth_method: str = 'ema',
        smooth_alpha: float = 0.3
    ):
        """
        Args:
            agent: Trained VLAAgent.
            device: Device for inference.
            camera_names: Camera names to use; defaults to ['r_vis', 'top'].
                (None default avoids the shared mutable-default pitfall.)
            num_queries: How often to query the policy (in timesteps);
                cached actions are replayed between queries.
            obs_horizon: Number of observations to use as context.
            pred_horizon: Number of future actions to predict.
            use_smoothing: Whether to apply action smoothing.
            smooth_method: Smoothing method ('ema', 'moving_avg', 'lowpass').
            smooth_alpha: Smoothing coefficient (lower = more smoothing).
        """
        if camera_names is None:
            camera_names = ['r_vis', 'top']
        self.agent = agent.to(device)
        self.device = device
        self.camera_names = camera_names
        self.num_queries = num_queries
        self.obs_horizon = obs_horizon
        self.pred_horizon = pred_horizon
        # Optional action smoothing
        self.use_smoothing = use_smoothing
        self.smooth_method = smooth_method
        self.smooth_alpha = smooth_alpha
        self.smoother = ActionSmoother(
            action_dim=16,  # assumes 16-dim actions -- TODO confirm against agent config
            method=smooth_method,
            alpha=smooth_alpha
        ) if use_smoothing else None
        # Rolling observation buffers: one list per camera, one for qpos.
        self.obs_buffer = {
            'images': {cam: [] for cam in camera_names},
            'qpos': []
        }
        self.cached_actions = None  # (pred_horizon, action_dim) numpy array
        self.query_step = 0         # index of the next action in the cached chunk

    def reset(self):
        """Clear buffers, cached actions, and smoother state for a new episode."""
        self.obs_buffer = {
            'images': {cam: [] for cam in self.camera_names},
            'qpos': []
        }
        self.cached_actions = None
        self.query_step = 0
        if self.smoother is not None:
            self.smoother.reset()

    def _get_image_dict(self, obs: Dict) -> Dict[str, torch.Tensor]:
        """
        Extract, normalize, and buffer camera images from an observation.

        Args:
            obs: Environment observation dict; obs['images'][cam] is an
                (H, W, C) array in [0, 255].
        Returns:
            Dict mapping camera name -> tensor of shape (1, obs_horizon, C, H, W).
        """
        images = {}
        for cam_name in self.camera_names:
            # (H, W, C) -> (C, H, W), scaled to [0, 1]
            img = obs['images'][cam_name]
            img = rearrange(img, 'h w c -> c h w')
            img = torch.from_numpy(img / 255.0).float()
            images[cam_name] = img
        image_dict = {}
        for cam_name in self.camera_names:
            cam_images = self.obs_buffer['images'][cam_name]
            cam_images.append(images[cam_name])
            # Left-pad by duplicating the oldest frame until obs_horizon frames exist.
            while len(cam_images) < self.obs_horizon:
                cam_images.insert(0, cam_images[0])
            # Keep only the newest obs_horizon frames.
            if len(cam_images) > self.obs_horizon:
                cam_images = cam_images[-self.obs_horizon:]
            # (obs_horizon, C, H, W) -> (1, obs_horizon, C, H, W)
            image_dict[cam_name] = torch.stack(cam_images, dim=0).unsqueeze(0)
            self.obs_buffer['images'][cam_name] = cam_images[-self.obs_horizon:]
        return image_dict

    def _get_qpos_dict(self, obs: Dict) -> torch.Tensor:
        """
        Extract and buffer proprioception (qpos) from an observation.

        Args:
            obs: Environment observation dict; obs['qpos'] is a numpy array.
        Returns:
            qpos tensor of shape (1, obs_horizon, obs_dim).
        """
        qpos = torch.from_numpy(obs['qpos']).float()
        self.obs_buffer['qpos'].append(qpos)
        # Left-pad by duplicating the oldest frame until obs_horizon frames exist.
        while len(self.obs_buffer['qpos']) < self.obs_horizon:
            self.obs_buffer['qpos'].insert(0, self.obs_buffer['qpos'][0])
        # Keep only the newest obs_horizon frames.
        if len(self.obs_buffer['qpos']) > self.obs_horizon:
            self.obs_buffer['qpos'] = self.obs_buffer['qpos'][-self.obs_horizon:]
        # (obs_horizon, obs_dim) -> (1, obs_horizon, obs_dim)
        return torch.stack(self.obs_buffer['qpos'], dim=0).unsqueeze(0)

    @torch.no_grad()
    def predict_action(self, obs: Dict) -> np.ndarray:
        """
        Predict the next action for the current observation.

        Queries the policy when the cache is empty, exhausted, or every
        ``num_queries`` steps; otherwise replays the cached action chunk.

        Args:
            obs: Current environment observation (images + qpos).
        Returns:
            action: numpy array of shape (action_dim,).
        """
        # 1. Update buffers and build network inputs.
        images = self._get_image_dict(obs)  # Dict[str, (1, obs_horizon, C, H, W)]
        qpos = self._get_qpos_dict(obs)     # (1, obs_horizon, obs_dim)
        # 2. Re-query the policy when needed. The bound check guards against
        #    num_queries exceeding the predicted chunk length (IndexError).
        need_query = (
            self.cached_actions is None
            or self.query_step % self.num_queries == 0
            or self.query_step >= len(self.cached_actions)
        )
        if need_query:
            images = {k: v.to(self.device) for k, v in images.items()}
            qpos = qpos.to(self.device)
            # VLAAgent.predict_action returns (B, pred_horizon, action_dim).
            predicted_actions = self.agent.predict_action(
                images=images,
                proprioception=qpos
            )
            # Cache the chunk as a (pred_horizon, action_dim) numpy array.
            self.cached_actions = predicted_actions.squeeze(0).cpu().numpy()
            self.query_step = 0
        # 3. Take the next action from the cached chunk.
        raw_action = self.cached_actions[self.query_step]
        self.query_step += 1
        # 4. Optional smoothing.
        if self.smoother is not None:
            raw_action = self.smoother.smooth(raw_action)
        return raw_action
class ActionSmoother:
    """
    Smooths a stream of actions to reduce jitter during execution.

    Only the 'ema' (exponential moving average) method is implemented;
    any other method returns the action unchanged.
    """

    def __init__(self, action_dim: int, method: str = 'ema', alpha: float = 0.3):
        self.action_dim = action_dim
        self.method = method
        self.alpha = alpha  # blend weight for the newest action
        self.prev_action = None

    def smooth(self, action: np.ndarray) -> np.ndarray:
        """Return the smoothed action and update internal state."""
        if self.method != 'ema':
            return action
        if self.prev_action is None:
            blended = action
        else:
            blended = self.alpha * action + (1.0 - self.alpha) * self.prev_action
        self.prev_action = blended
        return blended

    def reset(self):
        """Forget the previous action (call at episode start)."""
        self.prev_action = None
def load_checkpoint(
    ckpt_path: str,
    device: str = 'cuda'
) -> torch.nn.Module:
    """
    Load a trained VLA model from a checkpoint file.

    Also attaches dataset statistics (action/qpos mean and std) to the agent
    when a ``dataset_stats.json`` file sits next to the checkpoint, so the
    agent can normalize qpos and denormalize actions at inference time.

    Args:
        ckpt_path: Path to checkpoint file (.pt).
        device: Device to load the model on.
    Returns:
        Loaded VLAAgent in eval mode on ``device``.
    Raises:
        FileNotFoundError: If the checkpoint or the VLA config dir is missing.
    """
    import os
    from roboimi.vla.agent import VLAAgent
    from hydra import initialize_config_dir, compose
    from hydra.utils import instantiate

    ckpt_path = Path(ckpt_path).absolute()
    if not ckpt_path.exists():
        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")

    # Load checkpoint. weights_only=False because the checkpoint may
    # contain non-tensor metadata (e.g. the training step counter).
    print(f"Loading checkpoint from {ckpt_path}")
    checkpoint = torch.load(ckpt_path, map_location=device, weights_only=False)
    print(f"Checkpoint keys: {checkpoint.keys()}")

    # Locate the VLA hydra config directory ('vla/conf'), trying several
    # plausible roots relative to cwd and this script.
    script_dir = Path(__file__).resolve().parent
    current_dir = Path(os.getcwd()).absolute()
    config_dir = None
    if (current_dir / 'vla' / 'conf').exists():
        # Option 1: running from the roboimi directory
        config_dir = current_dir / 'vla' / 'conf'
    elif (current_dir / 'roboimi' / 'vla' / 'conf').exists():
        # Option 2: running from the project root
        config_dir = current_dir / 'roboimi' / 'vla' / 'conf'
    elif (script_dir / '../vla' / 'conf').exists():
        # Option 3: relative to this script's location
        config_dir = (script_dir / '../vla' / 'conf').resolve()
    else:
        # Option 4: search upwards from cwd
        search_start = current_dir
        while search_start != search_start.parent:
            if (search_start / 'vla' / 'conf').exists():
                config_dir = search_start / 'vla' / 'conf'
                break
            search_start = search_start.parent
    if config_dir is None:
        raise FileNotFoundError(
            f"Could not find VLA config directory.\n"
            f"Current directory: {current_dir}\n"
            f"Script location: {script_dir}\n"
            f"Please ensure you're running from the roboimi directory."
        )

    config_abs_path = str(config_dir.absolute())
    print(f"Loading config from {config_abs_path}")

    # Rebuild the agent exactly as it was configured at training time.
    with initialize_config_dir(config_dir=config_abs_path, version_base=None):
        cfg = compose(config_name="config")
    print("Instantiating agent from config...")
    agent = instantiate(cfg.agent)

    # Load model weights; support several checkpoint layouts.
    if 'model_state_dict' in checkpoint:
        agent.load_state_dict(checkpoint['model_state_dict'])
        print(f"✅ Model state loaded (step: {checkpoint.get('step', 'unknown')})")
    elif 'state_dict' in checkpoint:
        agent.load_state_dict(checkpoint['state_dict'])
        print("✅ Model state loaded")
    else:
        # Assume the checkpoint is the state_dict itself.
        agent.load_state_dict(checkpoint)
        print("✅ Model state loaded")

    # Attach dataset statistics for qpos normalization / action denormalization.
    import json
    stats_path = ckpt_path.parent / 'dataset_stats.json'
    if stats_path.exists():
        with open(stats_path, 'r') as f:
            stats = json.load(f)
        # Convert lists to numpy arrays
        agent.action_mean = np.array(stats['action_mean'])
        agent.action_std = np.array(stats['action_std'])
        agent.qpos_mean = np.array(stats['qpos_mean'])
        agent.qpos_std = np.array(stats['qpos_std'])
        print(f"✅ Dataset statistics loaded for denormalization")
    else:
        print(f"⚠️ Warning: {stats_path} not found. Actions will not be denormalized!")
        # Set all four attributes to None so downstream hasattr()/None
        # checks behave consistently (the original left qpos_* unset).
        agent.action_mean = None
        agent.action_std = None
        agent.qpos_mean = None
        agent.qpos_std = None

    agent.eval()
    agent.to(device)
    print(f"✅ Model loaded successfully on {device}")
    return agent
def evaluate_policy(
    agent: torch.nn.Module,
    num_episodes: int = 3,
    max_timesteps: int = 700,
    task_name: str = 'sim_transfer',
    device: str = 'cuda',
    camera_names: List[str] = None,
    num_queries: int = 1,
    obs_horizon: int = 2,
    save_video: bool = True
):
    """
    Evaluate a VLA policy in simulation.

    Args:
        agent: Trained VLAAgent.
        num_episodes: Number of episodes to run.
        max_timesteps: Maximum timesteps per episode.
        task_name: Task name for environment creation.
        device: Device for inference.
        camera_names: Camera names to use; defaults to ['r_vis', 'top'].
            (None default avoids the shared mutable-default pitfall.)
        num_queries: Policy query frequency (timesteps).
        obs_horizon: Observation horizon.
        save_video: Whether to save per-episode videos.
    """
    if camera_names is None:
        camera_names = ['r_vis', 'top']
    # Create evaluator
    evaluator = VLAEvaluator(
        agent=agent,
        device=device,
        camera_names=camera_names,
        num_queries=num_queries,
        obs_horizon=obs_horizon,
        use_smoothing=False,
        smooth_method='ema',
        smooth_alpha=0.3
    )
    # Create environment
    env = make_sim_env(task_name)
    # Run episodes
    for episode_idx in range(num_episodes):
        print(f"\n{'='*60}")
        print(f"Episode {episode_idx + 1}/{num_episodes}")
        print(f"{'='*60}\n")
        # Reset environment and evaluator with a freshly sampled box pose.
        box_pos = sample_transfer_pose()
        env.reset(box_pos)
        evaluator.reset()
        # Storage for visualization
        episode_images = []
        success = False
        success_timestep = 0
        with torch.inference_mode():
            for t in tqdm(range(max_timesteps), desc=f"Episode {episode_idx + 1}"):
                # Get and merge observations (images + qpos).
                obs = env._get_image_obs()
                qpos_obs = env._get_qpos_obs()
                obs['qpos'] = qpos_obs['qpos']
                # Predict and execute action.
                action = evaluator.predict_action(obs)
                env.step_jnt(action)
                # Save images for video
                if save_video:
                    episode_images.append(obs['images'])
                # Render
                env.render()
                # Stop early on success (env.rew == 1.0 marks task completion).
                if env.rew == 1.0:
                    success = True
                    success_timestep = t
                    print(f"\n✅ Task completed at timestep {t}!")
                    break
        # Episode summary
        print(f"\nEpisode {episode_idx + 1} Summary:")
        print(f"  Success: {success}")
        if success:
            print(f"  Success Timestep: {success_timestep}")
        print(f"  Length: {len(episode_images)} timesteps")
        # Save video
        if save_video and episode_images:
            save_video_episode(
                episode_images,
                save_path=f"outputs/eval_vla_episode_{episode_idx}.mp4"
            )
            print(f"  Video saved: outputs/eval_vla_episode_{episode_idx}.mp4")
    print(f"\n{'='*60}")
    print("Evaluation complete!")
    print(f"{'='*60}\n")
def save_video_episode(images: List[Dict], save_path: str, fps: int = 20):
    """
    Write an episode's frames to an MP4 video file.

    Uses the first camera found in the observation dicts. Skips saving
    (with a warning) when opencv-python is not installed.

    Args:
        images: List of per-timestep dicts mapping camera name -> (H, W, C) frame.
        save_path: Path to save the video.
        fps: Frames per second.
    """
    try:
        import cv2
        from tqdm import tqdm

        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        # Visualize with the first camera (e.g. 'r_vis').
        primary_cam = list(images[0].keys())[0]
        height, width, _ = images[0][primary_cam].shape
        writer = cv2.VideoWriter(
            save_path,
            cv2.VideoWriter_fourcc(*'mp4v'),
            fps,
            (width, height),
        )
        for frame_dict in tqdm(images, desc="Saving video"):
            # OpenCV expects BGR channel order.
            writer.write(cv2.cvtColor(frame_dict[primary_cam], cv2.COLOR_RGB2BGR))
        writer.release()
        print(f"Video saved to {save_path}")
    except ImportError:
        print("Warning: opencv-python not installed, skipping video save")
        print("Install with: pip install opencv-python")
def main():
    """CLI entry point: parse arguments, load the checkpoint, run evaluation."""
    parser = argparse.ArgumentParser(description='Evaluate VLA Policy')
    parser.add_argument('--ckpt_path', type=str, required=True, help='Path to model checkpoint')
    parser.add_argument('--num_episodes', type=int, default=3, help='Number of evaluation episodes')
    parser.add_argument('--max_timesteps', type=int, default=700, help='Maximum timesteps per episode')
    parser.add_argument('--device', type=str, default='cuda', help='Device for inference')
    parser.add_argument('--camera_names', nargs='+', default=['r_vis', 'top'], help='Camera names to use')
    parser.add_argument('--num_queries', type=int, default=16, help='Policy query frequency (timesteps)')
    parser.add_argument('--obs_horizon', type=int, default=2, help='Observation horizon')
    parser.add_argument('--no_video', action='store_true', help='Do not save episode videos')
    args = parser.parse_args()

    # Load model
    print(f"Loading model from {args.ckpt_path}...")
    model = load_checkpoint(args.ckpt_path, device=args.device)

    # Evaluate
    evaluate_policy(
        agent=model,
        num_episodes=args.num_episodes,
        max_timesteps=args.max_timesteps,
        device=args.device,
        camera_names=args.camera_names,
        num_queries=args.num_queries,
        obs_horizon=args.obs_horizon,
        save_video=not args.no_video,
    )


if __name__ == '__main__':
    main()

View File

@@ -1,100 +0,0 @@
import sys
import os
import hydra
import torch
import matplotlib.pyplot as plt
import numpy as np
from omegaconf import DictConfig, OmegaConf
from hydra.utils import instantiate
from torch.utils.data import DataLoader
# 确保能导入 roboimi
sys.path.append(os.getcwd())
from roboimi.vla.agent import VLAAgent
def recursive_to_device(data, device):
    """
    Recursively move tensors in a (possibly nested) dict to ``device``.

    Tensors are moved, dicts are traversed, everything else is returned
    unchanged.
    """
    if isinstance(data, torch.Tensor):
        return data.to(device)
    if isinstance(data, dict):
        return {key: recursive_to_device(value, device) for key, value in data.items()}
    return data
@hydra.main(version_base=None, config_path="../../../roboimi/vla/conf", config_name="config")
def main(cfg: DictConfig):
    """Run one reverse-diffusion inference pass on a random dataset sample and plot it."""
    print(">>> 🤖 Starting VLA Inference...")
    device = cfg.train.device
    # 1. Instantiate the agent (architecture must match training exactly).
    #    Config overrides (e.g. forcing freeze=True) could also be applied here.
    agent: VLAAgent = instantiate(cfg.agent)
    agent.to(device)
    agent.eval()  # Important: switch to eval mode
    # 2. Load weights
    ckpt_path = "checkpoints/vla_model_final.pt"
    if not os.path.exists(ckpt_path):
        print(f"❌ Checkpoint not found at {ckpt_path}. Run training first!")
        return
    print(f"Loading weights from {ckpt_path}...")
    # map_location prevents errors when loading GPU weights on a CPU-only machine
    state_dict = torch.load(ckpt_path, map_location=device)
    agent.load_state_dict(state_dict)
    print("✅ Weights loaded successfully.")
    # 3. Prepare test data (take one sample from the dataset)
    dataset = instantiate(cfg.data)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    sample = next(iter(dataloader))
    # Build the inputs (simulating real-time robot operation).
    # Note: sample['actions'] is NOT passed at inference time.
    primary_cam_key = cfg.data.obs_keys[0]
    input_img = sample['obs'][primary_cam_key][:, -1, :, :, :]  # (1, C, H, W)
    agent_input = {
        "obs": {
            "image": input_img.to(device),
            "text": sample["language"]  # placeholder must stay even if text is unused
        }
        # NOTE: omitting 'actions' here routes the agent into its inference branch
    }
    # 4. Run inference (reverse diffusion)
    print("running reverse diffusion (this may take a moment)...")
    with torch.no_grad():
        # This triggers the DiffusionHead inference branch (loop over timesteps)
        outputs = agent(agent_input)
    # 5. Collect results; output shape: (1, chunk_size, action_dim)
    pred_actions = outputs['pred_actions'].cpu().numpy()[0]
    gt_actions = sample['actions'][0].numpy()  # ground truth, for comparison only
    print(f"✅ Generated Action Chunk Shape: {pred_actions.shape}")
    # 6. Visualize the comparison (saves a figure)
    plot_results(pred_actions, gt_actions)
def plot_results(pred, gt):
    """
    Plot predicted vs. ground-truth trajectories for the first few action
    dimensions and save the figure to 'inference_result.png'.
    """
    num_dims = 3  # plot only the first 3 dimensions (e.g. x, y, z)
    plt.figure(figsize=(10, 5))
    for dim in range(num_dims):
        plt.subplot(1, num_dims, dim + 1)
        plt.plot(gt[:, dim], 'g--', label='Ground Truth')
        plt.plot(pred[:, dim], 'b-', label='Diffusion Pred')
        plt.title(f"Action Dim {dim}")
        if dim == 0:
            plt.legend()
        plt.ylim(-1, 1)  # assumes actions are normalized to [-1, 1]
    plt.tight_layout()
    plt.savefig("inference_result.png")
    print("📊 Result plot saved to 'inference_result.png'")


if __name__ == "__main__":
    main()

View File

@@ -1,6 +1,8 @@
import sys import sys
import os import os
import logging import logging
import json
import pickle
import hydra import hydra
import torch import torch
from tqdm import tqdm from tqdm import tqdm
@@ -103,6 +105,46 @@ def main(cfg: DictConfig):
log.error(f"❌ Failed to initialize agent: {e}") log.error(f"❌ Failed to initialize agent: {e}")
raise raise
# =========================================================================
# 2.5. Save Dataset Statistics as JSON
# =========================================================================
log.info("💾 Saving dataset statistics...")
try:
# Get dataset_dir from config
dataset_dir = cfg.data.get('dataset_dir', 'roboimi/demos/dataset/sim_transfer')
stats_path = Path(dataset_dir) / 'data_stats.pkl'
if stats_path.exists():
# Load pickle file
with open(stats_path, 'rb') as f:
stats = pickle.load(f)
# Extract action statistics
action_mean = stats['action']['mean'].tolist() if 'action' in stats else []
action_std = stats['action']['std'].tolist() if 'action' in stats else []
qpos_mean = stats['qpos']['mean'].tolist() if 'qpos' in stats else []
qpos_std = stats['qpos']['std'].tolist() if 'qpos' in stats else []
# Save as JSON
json_stats = {
'action_mean': action_mean,
'action_std': action_std,
'qpos_mean': qpos_mean,
'qpos_std': qpos_std
}
json_path = checkpoint_dir / 'dataset_stats.json'
with open(json_path, 'w') as f:
json.dump(json_stats, f, indent=2)
log.info(f"✅ Dataset statistics saved to {json_path}")
else:
log.warning(f"⚠️ Statistics file not found: {stats_path}")
log.warning("⚠️ Actions will not be denormalized during inference!")
except Exception as e:
log.warning(f"⚠️ Failed to save statistics as JSON: {e}")
log.warning("⚠️ Training will continue, but inference may not work correctly")
# ========================================================================= # =========================================================================
# 3. Setup Optimizer # 3. Setup Optimizer
# ========================================================================= # =========================================================================

View File

@@ -0,0 +1,239 @@
# VLA Evaluation Guide
This guide explains how to evaluate a trained Vision-Language-Action (VLA) policy in the MuJoCo simulation environment.
## Prerequisites
1. **Trained Model**: Train your VLA model first using `train_vla.py`
2. **Checkpoints**: Ensure you have saved model checkpoints in `checkpoints/` directory
3. **Dependencies**: Install required dependencies:
```bash
pip install opencv-python tqdm
```
## Quick Start
### Basic Evaluation
```bash
# Evaluate with default settings (3 episodes)
python roboimi/demos/eval_vla.py \
--ckpt_path checkpoints/vla_model_best.pt
# Evaluate with custom settings
python roboimi/demos/eval_vla.py \
--ckpt_path checkpoints/vla_model_step_5000.pt \
--num_episodes 5 \
--max_timesteps 700 \
--camera_names r_vis top angle \
--num_queries 1 \
--obs_horizon 2
```
### Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `--ckpt_path` | Path to model checkpoint (.pt file) | Required |
| `--num_episodes` | Number of evaluation episodes | 3 |
| `--max_timesteps` | Maximum timesteps per episode | 700 |
| `--device` | Device for inference (`cuda` or `cpu`) | `cuda` |
| `--camera_names` | Camera names to use (space-separated) | `r_vis top` |
| `--num_queries` | Policy query frequency (every N timesteps) | 16 |
| `--obs_horizon` | Observation history length | 2 |
| `--no_video` | Disable video saving | False |
## Usage Details
### Policy Query Frequency
The `--num_queries` parameter controls how often the policy is queried:
- `--num_queries 1`: Query every timestep (most accurate)
- `--num_queries 4`: Query every 4 timesteps (faster, but uses cached actions)
When using cached actions (num_queries > 1), the policy predicts a chunk of actions (pred_horizon=16), and these actions are executed sequentially until the next query.
### Camera Selection
Available cameras depend on your environment:
- `r_vis`: Right arm RealSense camera
- `top`: Top-down view camera
- `angle`: Angled view camera
Use `--camera_names` to specify which cameras to use:
```bash
--camera_names r_vis top # Use 2 cameras
--camera_names r_vis top angle # Use all 3 cameras
```
### Observation Horizon
The `--obs_horizon` parameter determines how many past observations to use as context:
```bash
--obs_horizon 1 # Use only current observation
--obs_horizon 2 # Use current + 1 past observation (default)
--obs_horizon 4 # Use current + 3 past observations
```
**Note**: Must match the value used during training.
## Output
### Console Output
During evaluation, you'll see:
```
============================================================
Episode 1/3
============================================================
Episode 1: 100%|████████████████████| 700/700 [02:30<00:00, 4.64it/s]
✅ Task completed at timestep 453!
Episode 1 Summary:
Success: True
Success Timestep: 453
Length: 453 timesteps
Video saved: outputs/eval_vla_episode_0.mp4
```
### Video Output
Videos are saved to `outputs/eval_vla_episode_{N}.mp4` showing the robot's execution.
### Metrics
- **Success**: Whether the task reached its success condition (reward 1.0)
- **Success Timestep**: Timestep at which success was first achieved
- **Length**: Number of timesteps executed
## Action Smoothing
The evaluator supports optional EMA (Exponential Moving Average) smoothing to reduce jitter (disabled by default via `use_smoothing=False`):
```python
# Default smoothing parameters
smooth_method = 'ema'
smooth_alpha = 0.3 # Lower = more smoothing
```
To disable or modify smoothing, edit the `evaluate_policy()` call in `eval_vla.py`:
```python
evaluator = VLAEvaluator(
agent=agent,
use_smoothing=False, # Disable smoothing
# or
smooth_method='moving_avg', # Use different method
smooth_alpha=0.5 # Adjust smoothing strength
)
```
## Troubleshooting
### Issue: Checkpoint not found
```
FileNotFoundError: Checkpoint not found: checkpoints/vla_model_best.pt
```
**Solution**: Ensure you've trained the model and checkpoints exist:
```bash
ls -la checkpoints/
# Should show: vla_model_best.pt, vla_model_final.pt, etc.
```
### Issue: CUDA out of memory
**Solution**: Use CPU for inference:
```bash
python eval_vla.py --ckpt_path checkpoints/vla_model_best.pt --device cpu
```
### Issue: Camera names don't match
**Solution**: Check your HDF5 files for available cameras:
```python
import h5py
with h5py.File('roboimi/demos/dataset/sim_transfer/episode_0.hdf5', 'r') as f:
print(list(f['observations/images'].keys()))
# Output: ['angle', 'r_vis', 'top']
```
Then use the correct camera names in your eval command.
### Issue: Mismatched obs_horizon
```
RuntimeError: Tensor shape mismatch
```
**Solution**: Ensure `--obs_horizon` matches the training config (`data.obs_horizon`).
## Advanced Usage
### Custom Evaluation Script
You can also use the evaluator in your own scripts:
```python
from roboimi.demos.eval_vla import VLAEvaluator, load_checkpoint
from roboimi.envs.double_pos_ctrl_env import make_sim_env
# Load model
agent = load_checkpoint('checkpoints/vla_model_best.pt')
# Create evaluator
evaluator = VLAEvaluator(
agent=agent,
device='cuda',
camera_names=['r_vis', 'top'],
num_queries=1,
obs_horizon=2
)
# Create environment
env = make_sim_env('sim_transfer')
env.reset()
evaluator.reset()
# Run episode
obs = env._get_image_obs()
obs['qpos'] = env._get_qpos_obs()['qpos']
# Predict and execute action
action = evaluator.predict_action(obs)
env.step_jnt(action)
```
### Batch Evaluation
Evaluate multiple checkpoints:
```bash
for ckpt in checkpoints/vla_model_step_*.pt; do
echo "Evaluating $ckpt"
python roboimi/demos/eval_vla.py \
--ckpt_path "$ckpt" \
--num_episodes 1 \
--no_video
done
```
## Next Steps
1. **Train your model**: See [RESNET_TRAINING_GUIDE.md](roboimi/vla/RESNET_TRAINING_GUIDE.md)
2. **Evaluate performance**: Use this evaluation script
3. **Analyze results**: Compare different checkpoints
4. **Deploy to real robot**: Adapt the evaluator for real robot control
## References
- Training Guide: [roboimi/vla/RESNET_TRAINING_GUIDE.md](roboimi/vla/RESNET_TRAINING_GUIDE.md)
- Project Documentation: [CLAUDE.md](CLAUDE.md)
- Original ACT Paper: https://arxiv.org/abs/2304.13705
- Diffusion Policy: https://diffusion-policy.cs.columbia.edu/

View File

@@ -1,8 +1,10 @@
import torch import torch
import torch.nn as nn import torch.nn as nn
import numpy as np
from typing import Dict, Optional, Any from typing import Dict, Optional, Any
from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
from diffusers.schedulers.scheduling_ddim import DDIMScheduler
from roboimi.vla.models.heads.diffusion import ConditionalUnet1D from roboimi.vla.models.heads.diffusion import ConditionalUnet1D
class VLAAgent(nn.Module): class VLAAgent(nn.Module):
@@ -18,6 +20,13 @@ class VLAAgent(nn.Module):
num_cams=2, # 视觉输入的摄像头数量 num_cams=2, # 视觉输入的摄像头数量
): ):
super().__init__() super().__init__()
# Store parameters
self.action_dim = action_dim
self.obs_dim = obs_dim
self.pred_horizon = pred_horizon
self.obs_horizon = obs_horizon
self.num_cams = num_cams
self.vision_encoder = vision_backbone self.vision_encoder = vision_backbone
single_img_feat_dim = self.vision_encoder.output_dim single_img_feat_dim = self.vision_encoder.output_dim
total_vision_dim = single_img_feat_dim * num_cams * obs_horizon total_vision_dim = single_img_feat_dim * num_cams * obs_horizon
@@ -31,6 +40,14 @@ class VLAAgent(nn.Module):
prediction_type='epsilon' # 预测噪声 prediction_type='epsilon' # 预测噪声
) )
# DDIM scheduler for faster inference
self.infer_scheduler = DDIMScheduler(
num_train_timesteps=diffusion_steps,
beta_schedule='squaredcos_cap_v2',
clip_sample=True,
prediction_type='epsilon'
)
self.noise_pred_net = ConditionalUnet1D( self.noise_pred_net = ConditionalUnet1D(
input_dim=action_dim, input_dim=action_dim,
global_cond_dim=self.global_cond_dim global_cond_dim=self.global_cond_dim
@@ -70,18 +87,12 @@ class VLAAgent(nn.Module):
) )
# 6. 网络预测噪声 # 6. 网络预测噪声
# 注意U-Net 1D 通常期望 channel 在中间: (B, C, T)
# noisy_actions_inp = noisy_actions.permute(0, 2, 1)
pred_noise = self.noise_pred_net( pred_noise = self.noise_pred_net(
sample=noisy_actions, sample=noisy_actions,
timestep=timesteps, timestep=timesteps,
global_cond=global_cond global_cond=global_cond
) )
# 还原维度 (B, T, C)
pred_noise = pred_noise.permute(0, 2, 1)
# 7. 计算 Loss (MSE) # 7. 计算 Loss (MSE)
loss = nn.functional.mse_loss(pred_noise, noise) loss = nn.functional.mse_loss(pred_noise, noise)
return loss return loss
@@ -96,20 +107,27 @@ class VLAAgent(nn.Module):
# 1. 提取当前观测特征 (只做一次) # 1. 提取当前观测特征 (只做一次)
visual_features = self.vision_encoder(images).view(B, -1) visual_features = self.vision_encoder(images).view(B, -1)
proprioception = proprioception.view(B, -1) proprioception = proprioception.view(B, -1)
if hasattr(self, 'qpos_mean') and hasattr(self, 'qpos_std') and self.qpos_mean is not None:
# Convert to tensor for normalization
qpos_mean = torch.from_numpy(self.qpos_mean).float().to(proprioception.device)
qpos_std = torch.from_numpy(self.qpos_std).float().to(proprioception.device)
qpos_mean = qpos_mean.repeat(2)
qpos_std = qpos_std.repeat(2)
# Normalize: (qpos - mean) / std
proprioception = (proprioception - qpos_mean.unsqueeze(0)) / qpos_std.unsqueeze(0)
global_cond = torch.cat([visual_features, proprioception], dim=-1) global_cond = torch.cat([visual_features, proprioception], dim=-1)
# 2. 初始化纯高斯噪声动作 # 2. 初始化纯高斯噪声动作
# Shape: (B, Horizon, Action_Dim) # Shape: (B, pred_horizon, action_dim)
current_actions = torch.randn( current_actions = torch.randn(
(B, 16, 7), device=global_cond.device (B, self.pred_horizon, self.action_dim), device=global_cond.device
) )
# 3. 逐步去噪循环 (Reverse Diffusion) # 3. 逐步去噪循环 (Reverse Diffusion)
self.noise_scheduler.set_timesteps(10) # 推理时可以用更少步加速 (如 DDIM) self.infer_scheduler.set_timesteps(10) # DDIM 推理步数
for t in self.noise_scheduler.timesteps: for t in self.infer_scheduler.timesteps:
# 调整输入格式适应 1D CNN model_input = current_actions
model_input = current_actions.permute(0, 2, 1)
# 预测噪声 # 预测噪声
noise_pred = self.noise_pred_net( noise_pred = self.noise_pred_net(
@@ -117,12 +135,19 @@ class VLAAgent(nn.Module):
timestep=t, timestep=t,
global_cond=global_cond global_cond=global_cond
) )
# noise_pred = noise_pred.permute(0, 2, 1)
# 移除噪声,更新 current_actions # 移除噪声,更新 current_actions
current_actions = self.noise_scheduler.step( current_actions = self.infer_scheduler.step(
noise_pred, t, current_actions noise_pred, t, current_actions
).prev_sample ).prev_sample
# 4. 输出最终动作序列 # 4. 反归一化动作 (Denormalize actions)
if hasattr(self, 'action_mean') and hasattr(self, 'action_std') and self.action_mean is not None:
# Convert to numpy for denormalization
action_mean = torch.from_numpy(self.action_mean).float().to(current_actions.device)
action_std = torch.from_numpy(self.action_std).float().to(current_actions.device)
# Denormalize: action * std + mean
current_actions = current_actions * action_std.unsqueeze(0).unsqueeze(0) + action_mean.unsqueeze(0).unsqueeze(0)
# 5. 输出最终动作序列
return current_actions # 返回去噪后的干净动作 return current_actions # 返回去噪后的干净动作

View File

@@ -4,10 +4,10 @@ defaults:
- data: resnet_dataset - data: resnet_dataset
train: train:
batch_size: 8 # Batch size for training batch_size: 32 # Batch size for training
lr: 1e-4 # Learning rate lr: 1e-4 # Learning rate
max_steps: 10000 # Maximum training steps max_steps: 20000 # Maximum training steps
log_freq: 100 # Log frequency (steps) log_freq: 100 # Log frequency (steps)
save_freq: 1000 # Save checkpoint frequency (steps) save_freq: 2000 # Save checkpoint frequency (steps)
device: "cuda" # Device: "cuda" or "cpu" device: "cuda" # Device: "cuda" or "cpu"
num_workers: 8 # DataLoader workers (set to 0 for debugging, 8 for production) num_workers: 8 # DataLoader workers (set to 0 for debugging, 8 for production)

View File

@@ -11,7 +11,7 @@ class RobotDiffusionDataset(Dataset):
def __init__(self, def __init__(self,
dataset_dir, dataset_dir,
pred_horizon=16, pred_horizon=16,
obs_horizon=1, obs_horizon=2,
action_horizon=8, action_horizon=8,
camera_names=['r_vis', 'top'], camera_names=['r_vis', 'top'],
normalization_type='gaussian'): normalization_type='gaussian'):