debug(inference): 添加推理阶段qpos归一化
This commit is contained in:
532
roboimi/demos/eval_vla.py
Normal file
532
roboimi/demos/eval_vla.py
Normal file
@@ -0,0 +1,532 @@
|
||||
"""
|
||||
VLA Policy Evaluation Script
|
||||
|
||||
This script evaluates a trained Vision-Language-Action (VLA) policy
|
||||
in the MuJoCo simulation environment.
|
||||
|
||||
Usage:
|
||||
python roboimi/demos/eval_vla.py --ckpt_path checkpoints/vla_model_best.pt --num_episodes 3
|
||||
"""
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
from tqdm import tqdm
|
||||
|
||||
from roboimi.envs.double_pos_ctrl_env import make_sim_env
|
||||
from roboimi.utils.act_ex_utils import sample_transfer_pose
|
||||
from einops import rearrange
|
||||
|
||||
|
||||
class VLAEvaluator:
    """
    Runs a trained VLA policy step-by-step in the MuJoCo simulation.

    Maintains a rolling observation buffer of length ``obs_horizon`` (images
    per camera plus proprioception) and a cache of predicted action chunks,
    so the policy is only queried every ``num_queries`` timesteps; cached
    actions are replayed in between.
    """

    def __init__(
        self,
        agent: torch.nn.Module,
        device: str = 'cuda',
        camera_names: List[str] = ['r_vis', 'top'],
        num_queries: int = 1,
        obs_horizon: int = 2,
        pred_horizon: int = 16,
        use_smoothing: bool = False,
        smooth_method: str = 'ema',
        smooth_alpha: float = 0.3
    ):
        """
        Args:
            agent: Trained VLAAgent (must expose ``predict_action``).
            device: Device for inference.
            camera_names: Camera names fed to the policy (never mutated, so
                the shared default list is safe).
            num_queries: How often to query the policy (in timesteps).
            obs_horizon: Number of observations to use as context.
            pred_horizon: Number of future actions the policy predicts.
            use_smoothing: Whether to apply action smoothing.
            smooth_method: Smoothing method ('ema', 'moving_avg', 'lowpass').
            smooth_alpha: Smoothing coefficient.
        """
        self.agent = agent.to(device)
        self.device = device
        self.camera_names = camera_names
        self.num_queries = num_queries
        self.obs_horizon = obs_horizon
        self.pred_horizon = pred_horizon

        # Optional action smoothing (ActionSmoother is only referenced when
        # use_smoothing is True, thanks to lazy evaluation of the conditional).
        self.use_smoothing = use_smoothing
        self.smooth_method = smooth_method
        self.smooth_alpha = smooth_alpha
        self.smoother = ActionSmoother(
            action_dim=16,  # NOTE(review): assumes 16-dim actions — confirm against the agent config
            method=smooth_method,
            alpha=smooth_alpha
        ) if use_smoothing else None

        # Rolling observation history, one list per camera plus qpos.
        self.obs_buffer = {
            'images': {cam: [] for cam in camera_names},
            'qpos': []
        }
        self.cached_actions = None  # (pred_horizon, action_dim) numpy array
        self.query_step = 0         # index of the next cached action to emit

    def reset(self):
        """Reset all per-episode state (buffers, action cache, smoother)."""
        self.obs_buffer = {
            'images': {cam: [] for cam in self.camera_names},
            'qpos': []
        }
        self.cached_actions = None
        self.query_step = 0
        if self.smoother is not None:
            self.smoother.reset()

    def _push_frames(self, frames: List[torch.Tensor], new_frame: torch.Tensor) -> List[torch.Tensor]:
        """
        Append ``new_frame`` to a history list, left-pad by duplicating the
        oldest frame until ``obs_horizon`` entries exist, and return the last
        ``obs_horizon`` entries.
        """
        frames.append(new_frame)
        while len(frames) < self.obs_horizon:
            frames.insert(0, frames[0])
        return frames[-self.obs_horizon:]

    def _get_image_dict(self, obs: Dict) -> Dict[str, torch.Tensor]:
        """
        Extract, normalize, and buffer camera images from an observation.

        Args:
            obs: Environment observation dict with obs['images'][cam] as
                (H, W, C) uint8 arrays.

        Returns:
            Dict mapping camera names to tensors of shape
            (1, obs_horizon, C, H, W), scaled to [0, 1].
        """
        image_dict = {}
        for cam_name in self.camera_names:
            raw = obs['images'][cam_name]
            chw = np.transpose(raw, (2, 0, 1))  # (H, W, C) -> (C, H, W)
            frame = torch.from_numpy(chw / 255.0).float()

            frames = self._push_frames(self.obs_buffer['images'][cam_name], frame)
            self.obs_buffer['images'][cam_name] = frames

            # (obs_horizon, C, H, W) -> (1, obs_horizon, C, H, W)
            image_dict[cam_name] = torch.stack(frames, dim=0).unsqueeze(0)
        return image_dict

    def _get_qpos_dict(self, obs: Dict) -> torch.Tensor:
        """
        Extract and buffer proprioception (qpos) from an observation.

        Args:
            obs: Environment observation dict with obs['qpos'] as a 1-D array.

        Returns:
            qpos tensor of shape (1, obs_horizon, obs_dim).
        """
        frame = torch.from_numpy(obs['qpos']).float()
        frames = self._push_frames(self.obs_buffer['qpos'], frame)
        self.obs_buffer['qpos'] = frames
        return torch.stack(frames, dim=0).unsqueeze(0)

    @torch.no_grad()
    def predict_action(self, obs: Dict) -> np.ndarray:
        """
        Predict the next action, re-querying the policy only when needed.

        Args:
            obs: Current environment observation (images + qpos).

        Returns:
            action: numpy array of shape (action_dim,).
        """
        # 1. Update observation buffers and build policy inputs.
        images = self._get_image_dict(obs)  # Dict[str, (1, obs_horizon, C, H, W)]
        qpos = self._get_qpos_dict(obs)     # (1, obs_horizon, obs_dim)

        # 2. Re-query when: no cache yet, the query interval elapsed, or the
        #    cached chunk is exhausted (fix: without the length check, setting
        #    num_queries > pred_horizon raised an IndexError).
        need_query = (
            self.cached_actions is None
            or self.query_step % self.num_queries == 0
            or self.query_step >= len(self.cached_actions)
        )
        if need_query:
            images = {k: v.to(self.device) for k, v in images.items()}
            qpos = qpos.to(self.device)

            # Agent returns (B, pred_horizon, action_dim).
            predicted_actions = self.agent.predict_action(
                images=images,
                proprioception=qpos
            )
            # Cache as a (pred_horizon, action_dim) numpy array on CPU.
            self.cached_actions = predicted_actions.squeeze(0).cpu().numpy()
            self.query_step = 0

        # 3. Emit the next cached action.
        raw_action = self.cached_actions[self.query_step]
        self.query_step += 1

        # 4. Optional temporal smoothing.
        if self.smoother is not None:
            raw_action = self.smoother.smooth(raw_action)

        return raw_action
|
||||
|
||||
|
||||
class ActionSmoother:
    """Temporal smoothing of predicted actions to reduce execution jitter."""

    def __init__(self, action_dim: int, method: str = 'ema', alpha: float = 0.3):
        # action_dim is kept for API symmetry; smoothing itself is element-wise.
        self.action_dim = action_dim
        self.method = method
        self.alpha = alpha
        self.prev_action = None

    def smooth(self, action: np.ndarray) -> np.ndarray:
        """
        Return the smoothed action.

        Only 'ema' is implemented; any other method passes the action
        through unchanged.
        """
        if self.method != 'ema':
            return action
        blended = (
            action
            if self.prev_action is None
            else self.alpha * action + (1 - self.alpha) * self.prev_action
        )
        self.prev_action = blended
        return blended

    def reset(self):
        """Forget smoothing history (call at episode boundaries)."""
        self.prev_action = None
|
||||
|
||||
|
||||
def _find_vla_config_dir(script_dir: Path, current_dir: Path) -> Path:
    """
    Locate the `vla/conf` Hydra config directory.

    Tries, in order: <cwd>/vla/conf, <cwd>/roboimi/vla/conf, the directory
    next to this script, then every ancestor of the cwd.

    Raises:
        FileNotFoundError: when no candidate directory exists.
    """
    candidates = [
        current_dir / 'vla' / 'conf',
        current_dir / 'roboimi' / 'vla' / 'conf',
        (script_dir / '..' / 'vla' / 'conf').resolve(),
    ]
    for cand in candidates:
        if cand.exists():
            return cand
    for ancestor in [current_dir, *current_dir.parents]:
        cand = ancestor / 'vla' / 'conf'
        if cand.exists():
            return cand
    raise FileNotFoundError(
        f"Could not find VLA config directory.\n"
        f"Current directory: {current_dir}\n"
        f"Script location: {script_dir}\n"
        f"Please ensure you're running from the roboimi directory."
    )


def load_checkpoint(
    ckpt_path: str,
    device: str = 'cuda'
) -> torch.nn.Module:
    """
    Load a trained VLA model from checkpoint.

    Instantiates the agent from the Hydra config found next to the project
    sources, loads the weights, and attaches dataset normalization stats
    (from ``dataset_stats.json`` beside the checkpoint) when available.

    Args:
        ckpt_path: Path to checkpoint file (.pt).
        device: Device to load model on.

    Returns:
        Loaded VLAAgent model in eval mode on ``device``.

    Raises:
        FileNotFoundError: if the checkpoint or config directory is missing.
    """
    from roboimi.vla.agent import VLAAgent  # noqa: F401 (import keeps agent classes registered)
    from hydra import initialize_config_dir, compose
    from hydra.utils import instantiate
    import json
    import os

    ckpt_path = Path(ckpt_path).absolute()
    if not ckpt_path.exists():
        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")

    print(f"Loading checkpoint from {ckpt_path}")
    # weights_only=False: the checkpoint may contain non-tensor objects
    # (step counters, configs) in addition to the state dict.
    checkpoint = torch.load(ckpt_path, map_location=device, weights_only=False)
    print(f"Checkpoint keys: {checkpoint.keys()}")

    # Locate and load the Hydra config, then instantiate the agent.
    script_dir = Path(__file__).resolve().parent
    current_dir = Path(os.getcwd()).absolute()
    config_dir = _find_vla_config_dir(script_dir, current_dir)
    config_abs_path = str(config_dir.absolute())
    print(f"Loading config from {config_abs_path}")

    with initialize_config_dir(config_dir=config_abs_path, version_base=None):
        cfg = compose(config_name="config")

    print("Instantiating agent from config...")
    agent = instantiate(cfg.agent)

    # Load model weights, supporting the three checkpoint layouts we emit.
    if 'model_state_dict' in checkpoint:
        agent.load_state_dict(checkpoint['model_state_dict'])
        print(f"✅ Model state loaded (step: {checkpoint.get('step', 'unknown')})")
    elif 'state_dict' in checkpoint:
        agent.load_state_dict(checkpoint['state_dict'])
        print("✅ Model state loaded")
    else:
        # Assume the checkpoint is the state_dict itself.
        agent.load_state_dict(checkpoint)
        print("✅ Model state loaded")

    # Dataset statistics (written by the training script) for qpos
    # normalization and action denormalization at inference time.
    stats_path = ckpt_path.parent / 'dataset_stats.json'
    if stats_path.exists():
        with open(stats_path, 'r') as f:
            stats = json.load(f)
        agent.action_mean = np.array(stats['action_mean'])
        agent.action_std = np.array(stats['action_std'])
        agent.qpos_mean = np.array(stats['qpos_mean'])
        agent.qpos_std = np.array(stats['qpos_std'])
        print(f"✅ Dataset statistics loaded for denormalization")
    else:
        print(f"⚠️ Warning: {stats_path} not found. Actions will not be denormalized!")
        agent.action_mean = None
        agent.action_std = None
        # Fix: also clear qpos stats so inference-time qpos normalization is
        # skipped consistently (consumers check these via hasattr/None).
        agent.qpos_mean = None
        agent.qpos_std = None

    agent.eval()
    agent.to(device)
    print(f"✅ Model loaded successfully on {device}")

    return agent
|
||||
|
||||
|
||||
def evaluate_policy(
    agent: torch.nn.Module,
    num_episodes: int = 3,
    max_timesteps: int = 700,
    task_name: str = 'sim_transfer',
    device: str = 'cuda',
    camera_names: List[str] = ['r_vis', 'top'],
    num_queries: int = 1,
    obs_horizon: int = 2,
    save_video: bool = True
):
    """
    Roll out a trained VLA policy in simulation for several episodes.

    Args:
        agent: Trained VLAAgent.
        num_episodes: Number of episodes to run.
        max_timesteps: Maximum timesteps per episode.
        task_name: Task name for environment creation.
        device: Device for inference.
        camera_names: Camera names fed to the policy (must match training).
        num_queries: Query the policy every N timesteps (cached in between).
        obs_horizon: Observation history length (must match training).
        save_video: Whether to save an MP4 per episode under outputs/.
    """
    evaluator = VLAEvaluator(
        agent=agent,
        device=device,
        camera_names=camera_names,
        num_queries=num_queries,
        obs_horizon=obs_horizon,
        use_smoothing=False,
        smooth_method='ema',
        smooth_alpha=0.3
    )

    env = make_sim_env(task_name)

    for episode_idx in range(num_episodes):
        print(f"\n{'='*60}")
        print(f"Episode {episode_idx + 1}/{num_episodes}")
        print(f"{'='*60}\n")

        # Randomize the object pose, then reset env and evaluator state.
        box_pos = sample_transfer_pose()
        env.reset(box_pos)
        evaluator.reset()

        episode_images = []
        success = False
        success_timestep = 0
        # Fix: track episode length explicitly — the old code reported
        # len(episode_images), which is always 0 when save_video=False.
        episode_len = 0

        with torch.inference_mode():
            for t in tqdm(range(max_timesteps), desc=f"Episode {episode_idx + 1}"):
                # Build the combined observation the evaluator expects.
                obs = env._get_image_obs()
                obs['qpos'] = env._get_qpos_obs()['qpos']

                # Predict and execute one action.
                action = evaluator.predict_action(obs)
                env.step_jnt(action)
                episode_len = t + 1

                if save_video:
                    episode_images.append(obs['images'])

                env.render()

                # NOTE(review): exact float comparison — assumes the env sets
                # rew to exactly 1.0 on success; confirm in the env code.
                if env.rew == 1.0:
                    success = True
                    success_timestep = t
                    print(f"\n✅ Task completed at timestep {t}!")
                    break

        # Episode summary
        print(f"\nEpisode {episode_idx + 1} Summary:")
        print(f"  Success: {success}")
        if success:
            print(f"  Success Timestep: {success_timestep}")
        print(f"  Length: {episode_len} timesteps")

        if save_video and episode_images:
            save_video_episode(
                episode_images,
                save_path=f"outputs/eval_vla_episode_{episode_idx}.mp4"
            )
            print(f"  Video saved: outputs/eval_vla_episode_{episode_idx}.mp4")

    print(f"\n{'='*60}")
    print("Evaluation complete!")
    print(f"{'='*60}\n")
|
||||
|
||||
|
||||
def save_video_episode(images: List[Dict], save_path: str, fps: int = 20):
    """
    Write an episode's frames to an MP4 file.

    Uses the first camera found in the first observation for the video.
    Skips gracefully (with an install hint) when opencv-python is missing.

    Args:
        images: Per-timestep dicts mapping camera name -> (H, W, C) RGB frame.
        save_path: Output path; parent directories are created as needed.
        fps: Playback frame rate.
    """
    try:
        import cv2
        from tqdm import tqdm

        Path(save_path).parent.mkdir(parents=True, exist_ok=True)

        # Visualize whichever camera appears first in the observation dict.
        camera = list(images[0].keys())[0]
        height, width, _channels = images[0][camera].shape

        writer = cv2.VideoWriter(
            save_path,
            cv2.VideoWriter_fourcc(*'mp4v'),
            fps,
            (width, height),
        )
        for frame_dict in tqdm(images, desc="Saving video"):
            # OpenCV expects BGR channel order.
            writer.write(cv2.cvtColor(frame_dict[camera], cv2.COLOR_RGB2BGR))
        writer.release()
        print(f"Video saved to {save_path}")

    except ImportError:
        print("Warning: opencv-python not installed, skipping video save")
        print("Install with: pip install opencv-python")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: load a checkpoint and run evaluation episodes."""
    parser = argparse.ArgumentParser(description='Evaluate VLA Policy')
    parser.add_argument('--ckpt_path', type=str, required=True, help='Path to model checkpoint')
    parser.add_argument('--num_episodes', type=int, default=3, help='Number of evaluation episodes')
    parser.add_argument('--max_timesteps', type=int, default=700, help='Maximum timesteps per episode')
    parser.add_argument('--device', type=str, default='cuda', help='Device for inference')
    parser.add_argument('--camera_names', nargs='+', default=['r_vis', 'top'], help='Camera names to use')
    parser.add_argument('--num_queries', type=int, default=16, help='Policy query frequency (timesteps)')
    parser.add_argument('--obs_horizon', type=int, default=2, help='Observation horizon')
    parser.add_argument('--no_video', action='store_true', help='Do not save episode videos')
    args = parser.parse_args()

    # Restore the trained agent, then roll it out in simulation.
    print(f"Loading model from {args.ckpt_path}...")
    model = load_checkpoint(args.ckpt_path, device=args.device)

    evaluate_policy(
        agent=model,
        num_episodes=args.num_episodes,
        max_timesteps=args.max_timesteps,
        device=args.device,
        camera_names=args.camera_names,
        num_queries=args.num_queries,
        obs_horizon=args.obs_horizon,
        save_video=not args.no_video,
    )


if __name__ == '__main__':
    main()
|
||||
@@ -1,100 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import hydra
|
||||
import torch
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from omegaconf import DictConfig, OmegaConf
|
||||
from hydra.utils import instantiate
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
# 确保能导入 roboimi
|
||||
sys.path.append(os.getcwd())
|
||||
from roboimi.vla.agent import VLAAgent
|
||||
|
||||
def recursive_to_device(data, device):
|
||||
if isinstance(data, torch.Tensor):
|
||||
return data.to(device)
|
||||
elif isinstance(data, dict):
|
||||
return {k: recursive_to_device(v, device) for k, v in data.items()}
|
||||
return data
|
||||
|
||||
@hydra.main(version_base=None, config_path="../../../roboimi/vla/conf", config_name="config")
|
||||
def main(cfg: DictConfig):
|
||||
print(">>> 🤖 Starting VLA Inference...")
|
||||
device = cfg.train.device
|
||||
|
||||
# 1. 实例化 Agent (结构必须与训练时完全一致)
|
||||
# 也可以在这里覆盖配置,例如 forcing freeze=True
|
||||
agent: VLAAgent = instantiate(cfg.agent)
|
||||
agent.to(device)
|
||||
agent.eval() # 关键:切换到 Eval 模式
|
||||
|
||||
# 2. 加载权重
|
||||
ckpt_path = "checkpoints/vla_model_final.pt"
|
||||
if not os.path.exists(ckpt_path):
|
||||
print(f"❌ Checkpoint not found at {ckpt_path}. Run training first!")
|
||||
return
|
||||
|
||||
print(f"Loading weights from {ckpt_path}...")
|
||||
# map_location='cpu' 防止在只有 CPU 的机器上加载 GPU 权重报错
|
||||
state_dict = torch.load(ckpt_path, map_location=device)
|
||||
agent.load_state_dict(state_dict)
|
||||
print("✅ Weights loaded successfully.")
|
||||
|
||||
# 3. 准备测试数据 (从 Dataset 里取一个样本)
|
||||
dataset = instantiate(cfg.data)
|
||||
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
|
||||
sample = next(iter(dataloader))
|
||||
|
||||
# 准备输入 (模拟机器人实时运行)
|
||||
# 注意:推理时不需要传 sample['actions']
|
||||
primary_cam_key = cfg.data.obs_keys[0]
|
||||
input_img = sample['obs'][primary_cam_key][:, -1, :, :, :] # (1, C, H, W)
|
||||
|
||||
agent_input = {
|
||||
"obs": {
|
||||
"image": input_img.to(device),
|
||||
"text": sample["language"] # 即使不用文本,占位符也要留着
|
||||
}
|
||||
# ⚠️ 关键:这里不传 'actions',触发 Agent 进入 Inference 分支
|
||||
}
|
||||
|
||||
# 4. 执行推理 (Reverse Diffusion)
|
||||
print("running reverse diffusion (this may take a moment)...")
|
||||
with torch.no_grad():
|
||||
# 这会触发 DiffusionHead 的分支 B (loop over timesteps)
|
||||
outputs = agent(agent_input)
|
||||
|
||||
# 5. 获取结果
|
||||
# 输出 shape: (1, Chunk_Size, Action_Dim)
|
||||
pred_actions = outputs['pred_actions'].cpu().numpy()[0]
|
||||
gt_actions = sample['actions'][0].numpy() # 用来对比
|
||||
|
||||
print(f"✅ Generated Action Chunk Shape: {pred_actions.shape}")
|
||||
|
||||
# 6. 可视化对比 (保存图片)
|
||||
plot_results(pred_actions, gt_actions)
|
||||
|
||||
def plot_results(pred, gt):
|
||||
"""
|
||||
简单的可视化:画出前几个维度的轨迹对比
|
||||
"""
|
||||
plt.figure(figsize=(10, 5))
|
||||
|
||||
# 比如只画前 3 个维度 (x, y, z)
|
||||
dims_to_plot = 3
|
||||
for i in range(dims_to_plot):
|
||||
plt.subplot(1, dims_to_plot, i+1)
|
||||
plt.plot(gt[:, i], 'g--', label='Ground Truth')
|
||||
plt.plot(pred[:, i], 'b-', label='Diffusion Pred')
|
||||
plt.title(f"Action Dim {i}")
|
||||
if i == 0: plt.legend()
|
||||
plt.ylim(-1, 1) # 假设动作是归一化的
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig("inference_result.png")
|
||||
print("📊 Result plot saved to 'inference_result.png'")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,6 +1,8 @@
|
||||
import sys
|
||||
import os
|
||||
import logging
|
||||
import json
|
||||
import pickle
|
||||
import hydra
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
@@ -103,6 +105,46 @@ def main(cfg: DictConfig):
|
||||
log.error(f"❌ Failed to initialize agent: {e}")
|
||||
raise
|
||||
|
||||
# =========================================================================
|
||||
# 2.5. Save Dataset Statistics as JSON
|
||||
# =========================================================================
|
||||
log.info("💾 Saving dataset statistics...")
|
||||
try:
|
||||
# Get dataset_dir from config
|
||||
dataset_dir = cfg.data.get('dataset_dir', 'roboimi/demos/dataset/sim_transfer')
|
||||
stats_path = Path(dataset_dir) / 'data_stats.pkl'
|
||||
|
||||
if stats_path.exists():
|
||||
# Load pickle file
|
||||
with open(stats_path, 'rb') as f:
|
||||
stats = pickle.load(f)
|
||||
|
||||
# Extract action statistics
|
||||
action_mean = stats['action']['mean'].tolist() if 'action' in stats else []
|
||||
action_std = stats['action']['std'].tolist() if 'action' in stats else []
|
||||
qpos_mean = stats['qpos']['mean'].tolist() if 'qpos' in stats else []
|
||||
qpos_std = stats['qpos']['std'].tolist() if 'qpos' in stats else []
|
||||
|
||||
# Save as JSON
|
||||
json_stats = {
|
||||
'action_mean': action_mean,
|
||||
'action_std': action_std,
|
||||
'qpos_mean': qpos_mean,
|
||||
'qpos_std': qpos_std
|
||||
}
|
||||
json_path = checkpoint_dir / 'dataset_stats.json'
|
||||
with open(json_path, 'w') as f:
|
||||
json.dump(json_stats, f, indent=2)
|
||||
|
||||
log.info(f"✅ Dataset statistics saved to {json_path}")
|
||||
else:
|
||||
log.warning(f"⚠️ Statistics file not found: {stats_path}")
|
||||
log.warning("⚠️ Actions will not be denormalized during inference!")
|
||||
|
||||
except Exception as e:
|
||||
log.warning(f"⚠️ Failed to save statistics as JSON: {e}")
|
||||
log.warning("⚠️ Training will continue, but inference may not work correctly")
|
||||
|
||||
# =========================================================================
|
||||
# 3. Setup Optimizer
|
||||
# =========================================================================
|
||||
|
||||
239
roboimi/vla/VLA_EVALUATION_GUIDE.md
Normal file
239
roboimi/vla/VLA_EVALUATION_GUIDE.md
Normal file
@@ -0,0 +1,239 @@
|
||||
# VLA Evaluation Guide
|
||||
|
||||
This guide explains how to evaluate a trained Vision-Language-Action (VLA) policy in the MuJoCo simulation environment.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. **Trained Model**: Train your VLA model first using `train_vla.py`
|
||||
2. **Checkpoints**: Ensure you have saved model checkpoints in `checkpoints/` directory
|
||||
3. **Dependencies**: Install required dependencies:
|
||||
```bash
|
||||
pip install opencv-python tqdm
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Basic Evaluation
|
||||
|
||||
```bash
|
||||
# Evaluate with default settings (3 episodes)
|
||||
python roboimi/demos/eval_vla.py \
|
||||
--ckpt_path checkpoints/vla_model_best.pt
|
||||
|
||||
# Evaluate with custom settings
|
||||
python roboimi/demos/eval_vla.py \
|
||||
--ckpt_path checkpoints/vla_model_step_5000.pt \
|
||||
--num_episodes 5 \
|
||||
--max_timesteps 700 \
|
||||
--camera_names r_vis top angle \
|
||||
--num_queries 1 \
|
||||
--obs_horizon 2
|
||||
```
|
||||
|
||||
### Parameters
|
||||
|
||||
| Parameter | Description | Default |
|
||||
|-----------|-------------|---------|
|
||||
| `--ckpt_path` | Path to model checkpoint (.pt file) | Required |
|
||||
| `--num_episodes` | Number of evaluation episodes | 3 |
|
||||
| `--max_timesteps` | Maximum timesteps per episode | 700 |
|
||||
| `--device` | Device for inference (`cuda` or `cpu`) | `cuda` |
|
||||
| `--camera_names` | Camera names to use (space-separated) | `r_vis top` |
|
||||
| `--num_queries` | Policy query frequency (every N timesteps) | 16 |
|
||||
| `--obs_horizon` | Observation history length | 2 |
|
||||
| `--no_video` | Disable video saving | False |
|
||||
|
||||
## Usage Details
|
||||
|
||||
### Policy Query Frequency
|
||||
|
||||
The `--num_queries` parameter controls how often the policy is queried:
|
||||
|
||||
- `--num_queries 1`: Query every timestep (most accurate)
|
||||
- `--num_queries 4`: Query every 4 timesteps (faster, but uses cached actions)
|
||||
|
||||
When using cached actions (num_queries > 1), the policy predicts a chunk of actions (pred_horizon=16), and these actions are executed sequentially until the next query.
|
||||
|
||||
### Camera Selection
|
||||
|
||||
Available cameras depend on your environment:
|
||||
- `r_vis`: Right arm RealSense camera
|
||||
- `top`: Top-down view camera
|
||||
- `angle`: Angled view camera
|
||||
|
||||
Use `--camera_names` to specify which cameras to use:
|
||||
```bash
|
||||
--camera_names r_vis top # Use 2 cameras
|
||||
--camera_names r_vis top angle # Use all 3 cameras
|
||||
```
|
||||
|
||||
### Observation Horizon
|
||||
|
||||
The `--obs_horizon` parameter determines how many past observations to use as context:
|
||||
|
||||
```bash
|
||||
--obs_horizon 1 # Use only current observation
|
||||
--obs_horizon 2 # Use current + 1 past observation (default)
|
||||
--obs_horizon 4 # Use current + 3 past observations
|
||||
```
|
||||
|
||||
**Note**: Must match the value used during training.
|
||||
|
||||
## Output
|
||||
|
||||
### Console Output
|
||||
|
||||
During evaluation, you'll see:
|
||||
|
||||
```
|
||||
============================================================
|
||||
Episode 1/3
|
||||
============================================================
|
||||
|
||||
Episode 1: 100%|████████████████████| 700/700 [02:30<00:00, 4.64it/s]
|
||||
|
||||
✅ Task completed at timestep 453!
|
||||
|
||||
Episode 1 Summary:
|
||||
Total Reward: 1.0000
|
||||
Max Reward: 1.0000
|
||||
Length: 453 timesteps
|
||||
Video saved: outputs/eval_vla_episode_0.mp4
|
||||
```
|
||||
|
||||
### Video Output
|
||||
|
||||
Videos are saved to `outputs/eval_vla_episode_{N}.mp4` showing the robot's execution.
|
||||
|
||||
### Metrics
|
||||
|
||||
- **Total Reward**: Sum of rewards throughout the episode
|
||||
- **Max Reward**: Maximum reward achieved (1.0 = success)
|
||||
- **Length**: Number of timesteps executed
|
||||
|
||||
## Action Smoothing
|
||||
|
||||
The evaluator supports optional EMA (Exponential Moving Average) smoothing to reduce jitter; it is disabled by default (`use_smoothing=False`):
|
||||
|
||||
```python
|
||||
# Default smoothing parameters
|
||||
smooth_method = 'ema'
|
||||
smooth_alpha = 0.3 # Lower = more smoothing
|
||||
```
|
||||
|
||||
To disable or modify smoothing, edit the `evaluate_policy()` call in `eval_vla.py`:
|
||||
|
||||
```python
|
||||
evaluator = VLAEvaluator(
|
||||
agent=agent,
|
||||
use_smoothing=False, # Disable smoothing
|
||||
# or
|
||||
smooth_method='moving_avg', # Use different method
|
||||
smooth_alpha=0.5 # Adjust smoothing strength
|
||||
)
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Issue: Checkpoint not found
|
||||
|
||||
```
|
||||
FileNotFoundError: Checkpoint not found: checkpoints/vla_model_best.pt
|
||||
```
|
||||
|
||||
**Solution**: Ensure you've trained the model and checkpoints exist:
|
||||
```bash
|
||||
ls -la checkpoints/
|
||||
# Should show: vla_model_best.pt, vla_model_final.pt, etc.
|
||||
```
|
||||
|
||||
### Issue: CUDA out of memory
|
||||
|
||||
**Solution**: Use CPU for inference:
|
||||
```bash
|
||||
python eval_vla.py --ckpt_path checkpoints/vla_model_best.pt --device cpu
|
||||
```
|
||||
|
||||
### Issue: Camera names don't match
|
||||
|
||||
**Solution**: Check your HDF5 files for available cameras:
|
||||
```python
|
||||
import h5py
|
||||
with h5py.File('roboimi/demos/dataset/sim_transfer/episode_0.hdf5', 'r') as f:
|
||||
print(list(f['observations/images'].keys()))
|
||||
# Output: ['angle', 'r_vis', 'top']
|
||||
```
|
||||
|
||||
Then use the correct camera names in your eval command.
|
||||
|
||||
### Issue: Mismatched obs_horizon
|
||||
|
||||
```
|
||||
RuntimeError: Tensor shape mismatch
|
||||
```
|
||||
|
||||
**Solution**: Ensure `--obs_horizon` matches the training config (`data.obs_horizon`).
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
### Custom Evaluation Script
|
||||
|
||||
You can also use the evaluator in your own scripts:
|
||||
|
||||
```python
|
||||
from roboimi.demos.eval_vla import VLAEvaluator, load_checkpoint
|
||||
from roboimi.envs.double_pos_ctrl_env import make_sim_env
|
||||
|
||||
# Load model
|
||||
agent = load_checkpoint('checkpoints/vla_model_best.pt')
|
||||
|
||||
# Create evaluator
|
||||
evaluator = VLAEvaluator(
|
||||
agent=agent,
|
||||
device='cuda',
|
||||
camera_names=['r_vis', 'top'],
|
||||
num_queries=1,
|
||||
obs_horizon=2
|
||||
)
|
||||
|
||||
# Create environment
|
||||
env = make_sim_env('sim_transfer')
|
||||
env.reset()
|
||||
evaluator.reset()
|
||||
|
||||
# Run episode
|
||||
obs = env._get_image_obs()
|
||||
obs['qpos'] = env._get_qpos_obs()['qpos']
|
||||
|
||||
# Predict and execute action
|
||||
action = evaluator.predict_action(obs)
|
||||
env.step_jnt(action)
|
||||
```
|
||||
|
||||
### Batch Evaluation
|
||||
|
||||
Evaluate multiple checkpoints:
|
||||
|
||||
```bash
|
||||
for ckpt in checkpoints/vla_model_step_*.pt; do
|
||||
echo "Evaluating $ckpt"
|
||||
python roboimi/demos/eval_vla.py \
|
||||
--ckpt_path "$ckpt" \
|
||||
--num_episodes 1 \
|
||||
--no_video
|
||||
done
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. **Train your model**: See [RESNET_TRAINING_GUIDE.md](roboimi/vla/RESNET_TRAINING_GUIDE.md)
|
||||
2. **Evaluate performance**: Use this evaluation script
|
||||
3. **Analyze results**: Compare different checkpoints
|
||||
4. **Deploy to real robot**: Adapt the evaluator for real robot control
|
||||
|
||||
## References
|
||||
|
||||
- Training Guide: [roboimi/vla/RESNET_TRAINING_GUIDE.md](roboimi/vla/RESNET_TRAINING_GUIDE.md)
|
||||
- Project Documentation: [CLAUDE.md](CLAUDE.md)
|
||||
- Original ACT Paper: https://arxiv.org/abs/2304.13705
|
||||
- Diffusion Policy: https://diffusion-policy.cs.columbia.edu/
|
||||
@@ -1,8 +1,10 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import numpy as np
|
||||
from typing import Dict, Optional, Any
|
||||
from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead
|
||||
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
|
||||
from diffusers.schedulers.scheduling_ddim import DDIMScheduler
|
||||
from roboimi.vla.models.heads.diffusion import ConditionalUnet1D
|
||||
|
||||
class VLAAgent(nn.Module):
|
||||
@@ -18,6 +20,13 @@ class VLAAgent(nn.Module):
|
||||
num_cams=2, # 视觉输入的摄像头数量
|
||||
):
|
||||
super().__init__()
|
||||
# Store parameters
|
||||
self.action_dim = action_dim
|
||||
self.obs_dim = obs_dim
|
||||
self.pred_horizon = pred_horizon
|
||||
self.obs_horizon = obs_horizon
|
||||
self.num_cams = num_cams
|
||||
|
||||
self.vision_encoder = vision_backbone
|
||||
single_img_feat_dim = self.vision_encoder.output_dim
|
||||
total_vision_dim = single_img_feat_dim * num_cams * obs_horizon
|
||||
@@ -31,6 +40,14 @@ class VLAAgent(nn.Module):
|
||||
prediction_type='epsilon' # 预测噪声
|
||||
)
|
||||
|
||||
# DDIM scheduler for faster inference
|
||||
self.infer_scheduler = DDIMScheduler(
|
||||
num_train_timesteps=diffusion_steps,
|
||||
beta_schedule='squaredcos_cap_v2',
|
||||
clip_sample=True,
|
||||
prediction_type='epsilon'
|
||||
)
|
||||
|
||||
self.noise_pred_net = ConditionalUnet1D(
|
||||
input_dim=action_dim,
|
||||
global_cond_dim=self.global_cond_dim
|
||||
@@ -70,18 +87,12 @@ class VLAAgent(nn.Module):
|
||||
)
|
||||
|
||||
# 6. 网络预测噪声
|
||||
# 注意:U-Net 1D 通常期望 channel 在中间: (B, C, T)
|
||||
# noisy_actions_inp = noisy_actions.permute(0, 2, 1)
|
||||
|
||||
pred_noise = self.noise_pred_net(
|
||||
sample=noisy_actions,
|
||||
timestep=timesteps,
|
||||
global_cond=global_cond
|
||||
)
|
||||
|
||||
# 还原维度 (B, T, C)
|
||||
pred_noise = pred_noise.permute(0, 2, 1)
|
||||
|
||||
# 7. 计算 Loss (MSE)
|
||||
loss = nn.functional.mse_loss(pred_noise, noise)
|
||||
return loss
|
||||
@@ -96,20 +107,27 @@ class VLAAgent(nn.Module):
|
||||
# 1. 提取当前观测特征 (只做一次)
|
||||
visual_features = self.vision_encoder(images).view(B, -1)
|
||||
proprioception = proprioception.view(B, -1)
|
||||
if hasattr(self, 'qpos_mean') and hasattr(self, 'qpos_std') and self.qpos_mean is not None:
|
||||
# Convert to tensor for normalization
|
||||
qpos_mean = torch.from_numpy(self.qpos_mean).float().to(proprioception.device)
|
||||
qpos_std = torch.from_numpy(self.qpos_std).float().to(proprioception.device)
|
||||
qpos_mean = qpos_mean.repeat(2)
|
||||
qpos_std = qpos_std.repeat(2)
|
||||
# Normalize: (qpos - mean) / std
|
||||
proprioception = (proprioception - qpos_mean.unsqueeze(0)) / qpos_std.unsqueeze(0)
|
||||
global_cond = torch.cat([visual_features, proprioception], dim=-1)
|
||||
|
||||
# 2. 初始化纯高斯噪声动作
|
||||
# Shape: (B, Horizon, Action_Dim)
|
||||
# Shape: (B, pred_horizon, action_dim)
|
||||
current_actions = torch.randn(
|
||||
(B, 16, 7), device=global_cond.device
|
||||
(B, self.pred_horizon, self.action_dim), device=global_cond.device
|
||||
)
|
||||
|
||||
# 3. 逐步去噪循环 (Reverse Diffusion)
|
||||
self.noise_scheduler.set_timesteps(10) # 推理时可以用更少步加速 (如 DDIM)
|
||||
self.infer_scheduler.set_timesteps(10) # DDIM 推理步数
|
||||
|
||||
for t in self.noise_scheduler.timesteps:
|
||||
# 调整输入格式适应 1D CNN
|
||||
model_input = current_actions.permute(0, 2, 1)
|
||||
for t in self.infer_scheduler.timesteps:
|
||||
model_input = current_actions
|
||||
|
||||
# 预测噪声
|
||||
noise_pred = self.noise_pred_net(
|
||||
@@ -117,12 +135,19 @@ class VLAAgent(nn.Module):
|
||||
timestep=t,
|
||||
global_cond=global_cond
|
||||
)
|
||||
# noise_pred = noise_pred.permute(0, 2, 1)
|
||||
|
||||
# 移除噪声,更新 current_actions
|
||||
current_actions = self.noise_scheduler.step(
|
||||
current_actions = self.infer_scheduler.step(
|
||||
noise_pred, t, current_actions
|
||||
).prev_sample
|
||||
|
||||
# 4. 输出最终动作序列
|
||||
# 4. 反归一化动作 (Denormalize actions)
|
||||
if hasattr(self, 'action_mean') and hasattr(self, 'action_std') and self.action_mean is not None:
|
||||
# Convert to numpy for denormalization
|
||||
action_mean = torch.from_numpy(self.action_mean).float().to(current_actions.device)
|
||||
action_std = torch.from_numpy(self.action_std).float().to(current_actions.device)
|
||||
# Denormalize: action * std + mean
|
||||
current_actions = current_actions * action_std.unsqueeze(0).unsqueeze(0) + action_mean.unsqueeze(0).unsqueeze(0)
|
||||
|
||||
# 5. 输出最终动作序列
|
||||
return current_actions # 返回去噪后的干净动作
|
||||
@@ -4,10 +4,10 @@ defaults:
|
||||
- data: resnet_dataset
|
||||
|
||||
train:
|
||||
batch_size: 8 # Batch size for training
|
||||
batch_size: 32 # Batch size for training
|
||||
lr: 1e-4 # Learning rate
|
||||
max_steps: 10000 # Maximum training steps
|
||||
max_steps: 20000 # Maximum training steps
|
||||
log_freq: 100 # Log frequency (steps)
|
||||
save_freq: 1000 # Save checkpoint frequency (steps)
|
||||
save_freq: 2000 # Save checkpoint frequency (steps)
|
||||
device: "cuda" # Device: "cuda" or "cpu"
|
||||
num_workers: 8 # DataLoader workers (set to 0 for debugging, 8 for production)
|
||||
@@ -11,7 +11,7 @@ class RobotDiffusionDataset(Dataset):
|
||||
def __init__(self,
|
||||
dataset_dir,
|
||||
pred_horizon=16,
|
||||
obs_horizon=1,
|
||||
obs_horizon=2,
|
||||
action_horizon=8,
|
||||
camera_names=['r_vis', 'top'],
|
||||
normalization_type='gaussian'):
|
||||
|
||||
Reference in New Issue
Block a user