From a43a2e3d18c371924eb2aadeaaaf17200a141e2c Mon Sep 17 00:00:00 2001
From: gouhanke <12219217+gouhanke@user.noreply.gitee.com>
Date: Fri, 6 Feb 2026 13:45:35 +0800
Subject: [PATCH] =?UTF-8?q?chore:=20=E5=88=A0=E9=99=A4=E5=A4=9A=E4=BD=99?=
 =?UTF-8?q?=E8=84=9A=E6=9C=AC?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 roboimi/demos/eval_vla.py                    | 532 -------------------
 roboimi/demos/vla_scripts/eval_vla.py        | 328 ++++++++++++
 roboimi/vla/RESNET_TRAINING_GUIDE.md         | 238 ---------
 roboimi/vla/VLA_EVALUATION_GUIDE.md          | 239 ---------
 roboimi/vla/conf/agent/base_siglip.yaml      |  25 -
 roboimi/vla/conf/agent/debug_vla.yaml        |  24 -
 roboimi/vla/conf/agent/default.yaml          |  30 --
 roboimi/vla/conf/agent/resnet_diffusion.yaml |  15 +-
 roboimi/vla/conf/agent/siglip_diffusion.yaml |  24 -
 roboimi/vla/conf/agent/tiny.yaml             |  26 -
 roboimi/vla/conf/backbone/clip.yaml          |   1 -
 roboimi/vla/conf/backbone/resnet.yaml        |   7 +-
 roboimi/vla/conf/backbone/siglip.yaml        |   4 -
 roboimi/vla/conf/config.yaml                 |   3 +-
 roboimi/vla/conf/data/default_dataset.yaml   |  16 -
 roboimi/vla/conf/data/resnet_dataset.yaml    |   6 +-
 roboimi/vla/conf/data/siglip2.yaml           |   8 -
 roboimi/vla/conf/eval/eval.yaml              |  21 +
 roboimi/vla/conf/head/act.yaml               |   1 -
 roboimi/vla/conf/head/diffusion.yaml         |   2 +-
 roboimi/vla/conf/train/debug.yaml            |   1 -
 roboimi/vla/conf/train/gpu.yaml              |   1 -
 roboimi/vla/data/image_transform.py          |  75 ---
 roboimi/vla/data/text_processing.py          |   1 -
 roboimi/vla/models/backbones/__init__.py     |   8 +-
 roboimi/vla/models/backbones/siglip.py       |  62 ---
 roboimi/vla/models/heads/__init__.py         |   4 -
 27 files changed, 366 insertions(+), 1336 deletions(-)
 delete mode 100644 roboimi/demos/eval_vla.py
 create mode 100644 roboimi/demos/vla_scripts/eval_vla.py
 delete mode 100644 roboimi/vla/RESNET_TRAINING_GUIDE.md
 delete mode 100644 roboimi/vla/VLA_EVALUATION_GUIDE.md
 delete mode 100644 roboimi/vla/conf/agent/base_siglip.yaml
 delete mode 100644 roboimi/vla/conf/agent/debug_vla.yaml
 delete mode 100644 roboimi/vla/conf/agent/default.yaml
 delete mode 100644 roboimi/vla/conf/agent/siglip_diffusion.yaml
 delete mode 100644 roboimi/vla/conf/agent/tiny.yaml
 delete mode 100644 roboimi/vla/conf/backbone/clip.yaml
 delete mode 100644 roboimi/vla/conf/backbone/siglip.yaml
 delete mode 100644 roboimi/vla/conf/data/default_dataset.yaml
 delete mode 100644 roboimi/vla/conf/data/siglip2.yaml
 create mode 100644 roboimi/vla/conf/eval/eval.yaml
 delete mode 100644 roboimi/vla/conf/head/act.yaml
 delete mode 100644 roboimi/vla/conf/train/debug.yaml
 delete mode 100644 roboimi/vla/conf/train/gpu.yaml
 delete mode 100644 roboimi/vla/data/image_transform.py
 delete mode 100644 roboimi/vla/data/text_processing.py
 delete mode 100644 roboimi/vla/models/backbones/siglip.py

diff --git a/roboimi/demos/eval_vla.py b/roboimi/demos/eval_vla.py
deleted file mode 100644
index 91df49b..0000000
--- a/roboimi/demos/eval_vla.py
+++ /dev/null
@@ -1,532 +0,0 @@
-"""
-VLA Policy Evaluation Script
-
-This script evaluates a trained Vision-Language-Action (VLA) policy
-in the MuJoCo simulation environment.
-
-Usage:
-    python roboimi/demos/eval_vla.py --ckpt_path checkpoints/vla_model_best.pt --num_episodes 3
-"""
-
-import torch
-import numpy as np
-import argparse
-from pathlib import Path
-from typing import Dict, List
-from tqdm import tqdm
-
-from roboimi.envs.double_pos_ctrl_env import make_sim_env
-from roboimi.utils.act_ex_utils import sample_transfer_pose
-from einops import rearrange
-
-
-class VLAEvaluator:
-    """
-    VLA Policy Evaluator for MuJoCo Simulation
-    """
-
-    def __init__(
-        self,
-        agent: torch.nn.Module,
-        device: str = 'cuda',
-        camera_names: List[str] = ['r_vis', 'top', 'front'],
-        num_queries: int = 1,
-        obs_horizon: int = 2,
-        pred_horizon: int = 16,
-        use_smoothing: bool = False,
-        smooth_method: str = 'ema',
-        smooth_alpha: float = 0.3
-    ):
-        """
-        Args:
-            agent: Trained VLAAgent
-            device: Device for inference
-            camera_names: List of camera names to use
-            num_queries: How often to query the policy (in timesteps)
-            obs_horizon: Number of observations to use as context
-            pred_horizon: Number of future actions to predict
-            use_smoothing: Whether to apply action smoothing
-            smooth_method: Smoothing method ('ema', 'moving_avg', 'lowpass')
-            smooth_alpha: Smoothing coefficient
-        """
-        self.agent = agent.to(device)
-        self.device = device
-        self.camera_names = camera_names
-        self.num_queries = num_queries
-        self.obs_horizon = obs_horizon
-        self.pred_horizon = pred_horizon
-
-        # Action smoothing
-        self.use_smoothing = use_smoothing
-        self.smooth_method = smooth_method
-        self.smooth_alpha = smooth_alpha
-        self.smoother = ActionSmoother(
-            action_dim=16,  # Assuming 16-dim actions
-            method=smooth_method,
-            alpha=smooth_alpha
-        ) if use_smoothing else None
-
-        # Observation buffer for obs_horizon
-        self.obs_buffer = {
-            'images': {cam: [] for cam in camera_names},
-            'qpos': []
-        }
-        self.cached_actions = None
-        self.query_step = 0
-
-    def reset(self):
-        """Reset evaluator state"""
-        self.obs_buffer = {
-            'images': {cam: [] for cam in self.camera_names},
-            'qpos': []
-        }
-        self.cached_actions = None
-        self.query_step = 0
-        if self.smoother is not None:
-            self.smoother.reset()
-
-    def _get_image_dict(self, obs: Dict) -> Dict[str, torch.Tensor]:
-        """
-        Extract and preprocess images from observation
-
-        Args:
-            obs: Environment observation dict
-
-        Returns:
-            Dict mapping camera names to image tensors (B, obs_horizon, C, H, W)
-        """
-        images = {}
-        for cam_name in self.camera_names:
-            # Extract image: (H, W, C) -> (C, H, W)
-            img = obs['images'][cam_name]
-            img = rearrange(img, 'h w c -> c h w')
-            img = torch.from_numpy(img / 255.0).float()
-            images[cam_name] = img  # (C, H, W)
-
-        # Stack to create batch dimension
-        image_dict = {}
-        for cam_name in self.camera_names:
-            # Collect obs_horizon frames
-            cam_images = self.obs_buffer['images'][cam_name]
-            cam_images.append(images[cam_name])
-
-            # Pad to obs_horizon if needed (duplicate first frame)
-            while len(cam_images) < self.obs_horizon:
-                cam_images.insert(0, cam_images[0])
-
-            # Keep only obs_horizon frames
-            if len(cam_images) > self.obs_horizon:
-                cam_images = cam_images[-self.obs_horizon:]
-
-            # Stack: (obs_horizon, C, H, W) -> (1, obs_horizon, C, H, W)
-            img_tensor = torch.stack(cam_images, dim=0).unsqueeze(0)
-            image_dict[cam_name] = img_tensor
-
-            # Update buffer (without padding)
-            self.obs_buffer['images'][cam_name] = cam_images[-self.obs_horizon:]
-
-        return image_dict
-
-    def _get_qpos_dict(self, obs: Dict) -> torch.Tensor:
-        """
-        Extract and preprocess qpos from observation
-
-        Args:
-            obs: Environment observation dict
-
-        Returns:
-            qpos tensor: (1, obs_horizon, obs_dim)
-        """
-        qpos = obs['qpos']
-        qpos = torch.from_numpy(qpos).float()
-
-        # Add to buffer
-        self.obs_buffer['qpos'].append(qpos)
-
-        # Pad to obs_horizon if needed (duplicate first frame)
-        while len(self.obs_buffer['qpos']) < self.obs_horizon:
-            self.obs_buffer['qpos'].insert(0, self.obs_buffer['qpos'][0])
-
-        # Keep only obs_horizon frames
-        if len(self.obs_buffer['qpos']) > self.obs_horizon:
-            self.obs_buffer['qpos'] = self.obs_buffer['qpos'][-self.obs_horizon:]
-
-        # Stack: (obs_horizon, obs_dim) -> (1, obs_horizon, obs_dim)
-        qpos_tensor = torch.stack(self.obs_buffer['qpos'], dim=0).unsqueeze(0)
-
-        return qpos_tensor
-
-    @torch.no_grad()
-    def predict_action(self, obs: Dict) -> np.ndarray:
-        """
-        Predict action using VLA policy
-
-        Args:
-            obs: Current environment observation
-
-        Returns:
-            action: numpy array of shape (action_dim,)
-        """
-        # 1. Prepare observations
-        images = self._get_image_dict(obs)  # Dict[str, (1, obs_horizon, C, H, W)]
-        qpos = self._get_qpos_dict(obs)      # (1, obs_horizon, obs_dim)
-
-        # 2. Check if we need to query the policy
-        if self.cached_actions is None or self.query_step % self.num_queries == 0:
-            # Prepare input for VLA agent
-            # VLAAgent.predict_action expects:
-            # - images: Dict[str, Tensor] with shape (B, obs_horizon, C, H, W)
-            # - proprioception: Tensor with shape (B, obs_horizon, obs_dim)
-
-            # Move to device
-            images = {k: v.to(self.device) for k, v in images.items()}
-            qpos = qpos.to(self.device)
-
-            # Predict actions using VLA agent
-            # Returns: (B, pred_horizon, action_dim)
-            predicted_actions = self.agent.predict_action(
-                images=images,
-                proprioception=qpos
-            )
-
-            # Cache predicted actions (CPU numpy array)
-            self.cached_actions = predicted_actions.squeeze(0).cpu().numpy()  # (pred_horizon, action_dim)
-            self.query_step = 0
-
-        # 3. Get action from cache
-        raw_action = self.cached_actions[self.query_step]
-        self.query_step += 1
-
-        # 4. Apply smoothing if enabled
-        if self.smoother is not None:
-            raw_action = self.smoother.smooth(raw_action)
-
-        return raw_action
-
-
-class ActionSmoother:
-    """Action smoothing for smoother execution"""
-
-    def __init__(self, action_dim: int, method: str = 'ema', alpha: float = 0.3):
-        self.action_dim = action_dim
-        self.method = method
-        self.alpha = alpha
-        self.prev_action = None
-
-    def smooth(self, action: np.ndarray) -> np.ndarray:
-        if self.method == 'ema':
-            if self.prev_action is None:
-                smoothed = action
-            else:
-                smoothed = self.alpha * action + (1 - self.alpha) * self.prev_action
-            self.prev_action = smoothed
-            return smoothed
-        else:
-            return action
-
-    def reset(self):
-        self.prev_action = None
-
-
-def load_checkpoint(
-    ckpt_path: str,
-    device: str = 'cuda'
-) -> torch.nn.Module:
-    """
-    Load trained VLA model from checkpoint
-
-    Args:
-        ckpt_path: Path to checkpoint file (.pt)
-        device: Device to load model on
-
-    Returns:
-        Loaded VLAAgent model
-    """
-    from roboimi.vla.agent import VLAAgent
-    from hydra import initialize_config_dir, compose
-    from pathlib import Path as PathLib
-
-    ckpt_path = PathLib(ckpt_path).absolute()
-    if not ckpt_path.exists():
-        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")
-
-    # Load checkpoint
-    print(f"Loading checkpoint from {ckpt_path}")
-    checkpoint = torch.load(ckpt_path, map_location=device, weights_only=False)
-
-    print(f"Checkpoint keys: {checkpoint.keys()}")
-
-    # Find VLA config directory
-    import os
-
-    # Get script directory
-    script_dir = PathLib(__file__).resolve().parent
-    current_dir = PathLib(os.getcwd()).absolute()
-
-    # Try to find vla/conf directory
-    config_dir = None
-
-    # Option 1: If running from roboimi directory
-    if (current_dir / 'vla' / 'conf').exists():
-        config_dir = current_dir / 'vla' / 'conf'
-    # Option 2: If running from project root
-    elif (current_dir / 'roboimi' / 'vla' / 'conf').exists():
-        config_dir = current_dir / 'roboimi' / 'vla' / 'conf'
-    # Option 3: Relative to script location
-    elif (script_dir / '../vla' / 'conf').exists():
-        config_dir = (script_dir / '../vla' / 'conf').resolve()
-    # Option 4: Search upwards
-    else:
-        search_start = current_dir
-        while search_start != search_start.parent:
-            if (search_start / 'vla' / 'conf').exists():
-                config_dir = search_start / 'vla' / 'conf'
-                break
-            search_start = search_start.parent
-
-    if config_dir is None:
-        raise FileNotFoundError(
-            f"Could not find VLA config directory.\n"
-            f"Current directory: {current_dir}\n"
-            f"Script location: {script_dir}\n"
-            f"Please ensure you're running from the roboimi directory."
-        )
-
-    config_abs_path = str(config_dir.absolute())
-    print(f"Loading config from {config_abs_path}")
-
-    if not PathLib(config_abs_path).exists():
-        raise FileNotFoundError(f"Config directory does not exist: {config_abs_path}")
-    print(f"Loading config from {config_abs_path}")
-
-    # Initialize Hydra with absolute path
-    with initialize_config_dir(config_dir=config_abs_path, version_base=None):
-        cfg = compose(config_name="config")
-
-    # Instantiate agent from config
-    print("Instantiating agent from config...")
-    from hydra.utils import instantiate
-    agent = instantiate(cfg.agent)
-
-    # Load model state
-    if 'model_state_dict' in checkpoint:
-        agent.load_state_dict(checkpoint['model_state_dict'])
-        print(f"✅ Model state loaded (step: {checkpoint.get('step', 'unknown')})")
-    elif 'state_dict' in checkpoint:
-        agent.load_state_dict(checkpoint['state_dict'])
-        print("✅ Model state loaded")
-    else:
-        # Assume checkpoint is the state_dict itself
-        agent.load_state_dict(checkpoint)
-        print("✅ Model state loaded")
-
-    # Load dataset statistics for denormalization
-    import json
-    stats_path = ckpt_path.parent / 'dataset_stats.json'
-    if stats_path.exists():
-        with open(stats_path, 'r') as f:
-            stats = json.load(f)
-        # Convert lists to numpy arrays
-        agent.action_mean = np.array(stats['action_mean'])
-        agent.action_std = np.array(stats['action_std'])
-        agent.qpos_mean = np.array(stats['qpos_mean'])
-        agent.qpos_std = np.array(stats['qpos_std'])
-        print(f"✅ Dataset statistics loaded for denormalization")
-    else:
-        print(f"⚠️  Warning: {stats_path} not found. Actions will not be denormalized!")
-        agent.action_mean = None
-        agent.action_std = None
-
-    agent.eval()
-    agent.to(device)
-
-    print(f"✅ Model loaded successfully on {device}")
-
-    return agent
-
-
-def evaluate_policy(
-    agent: torch.nn.Module,
-    num_episodes: int = 3,
-    max_timesteps: int = 700,
-    task_name: str = 'sim_transfer',
-    device: str = 'cuda',
-    camera_names: List[str] = ['r_vis', 'top', 'front'],
-    num_queries: int = 1,
-    obs_horizon: int = 2,
-    save_video: bool = True
-):
-    """
-    Evaluate VLA policy in simulation
-
-    Args:
-        agent: Trained VLAAgent
-        num_episodes: Number of episodes to run
-        max_timesteps: Maximum timesteps per episode
-        task_name: Task name for environment creation
-        device: Device for inference
-        camera_names: List of camera names
-        num_queries: Policy query frequency
-        obs_horizon: Observation horizon
-        save_video: Whether to save video
-    """
-    # Create evaluator
-    evaluator = VLAEvaluator(
-        agent=agent,
-        device=device,
-        camera_names=camera_names,
-        num_queries=num_queries,
-        obs_horizon=obs_horizon,
-        use_smoothing=False,
-        smooth_method='ema',
-        smooth_alpha=0.3
-    )
-
-    # Create environment
-    env = make_sim_env(task_name)
-
-    # Run episodes
-    for episode_idx in range(num_episodes):
-        print(f"\n{'='*60}")
-        print(f"Episode {episode_idx + 1}/{num_episodes}")
-        print(f"{'='*60}\n")
-
-        # Reset environment and evaluator
-        box_pos = sample_transfer_pose()
-        env.reset(box_pos)
-        evaluator.reset()
-
-        # Storage for visualization
-        episode_images = []
-        success = False
-        success_timestep = 0
-
-        with torch.inference_mode():
-            for t in tqdm(range(max_timesteps), desc=f"Episode {episode_idx + 1}"):
-                # Get observation
-                obs = env._get_image_obs()
-                qpos_obs = env._get_qpos_obs()
-
-                # Merge observations
-                obs['qpos'] = qpos_obs['qpos']
-
-                # Predict action
-                action = evaluator.predict_action(obs)
-
-                # Execute action
-                env.step_jnt(action)
-
-                # Save images for video
-                if save_video:
-                    episode_images.append(obs['images'])
-
-                # Render
-                env.render()
-
-                # Check if episode is done
-                if env.rew == 1.0:  # Success condition
-                    success = True
-                    success_timestep = t
-                    print(f"\n✅ Task completed at timestep {t}!")
-                    break
-
-        # Episode summary
-        print(f"\nEpisode {episode_idx + 1} Summary:")
-        print(f"  Success: {success}")
-        if success:
-            print(f"  Success Timestep: {success_timestep}")
-        print(f"  Length: {len(episode_images)} timesteps")
-
-        # Save video
-        if save_video and episode_images:
-            save_video_episode(
-                episode_images,
-                save_path=f"outputs/eval_vla_episode_{episode_idx}.mp4"
-            )
-            print(f"  Video saved: outputs/eval_vla_episode_{episode_idx}.mp4")
-
-    print(f"\n{'='*60}")
-    print("Evaluation complete!")
-    print(f"{'='*60}\n")
-
-
-def save_video_episode(images: List[Dict], save_path: str, fps: int = 20):
-    """
-    Save episode as video
-
-    Args:
-        images: List of observation dicts containing images
-        save_path: Path to save video
-        fps: Frames per second
-    """
-    try:
-        import cv2
-        from tqdm import tqdm
-
-        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
-
-        # Use first camera (e.g., 'r_vis') for visualization
-        cam_name = list(images[0].keys())[0]
-
-        # Get image size
-        H, W, C = images[0][cam_name].shape
-
-        # Create video writer
-        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
-        video_writer = cv2.VideoWriter(save_path, fourcc, fps, (W, H))
-
-        # Write frames
-        for img_dict in tqdm(images, desc="Saving video"):
-            frame = img_dict[cam_name]
-            # Convert RGB to BGR for OpenCV
-            frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
-            video_writer.write(frame_bgr)
-
-        video_writer.release()
-        print(f"Video saved to {save_path}")
-
-    except ImportError:
-        print("Warning: opencv-python not installed, skipping video save")
-        print("Install with: pip install opencv-python")
-
-
-def main():
-    parser = argparse.ArgumentParser(description='Evaluate VLA Policy')
-    parser.add_argument('--ckpt_path', type=str, required=True,
-                        help='Path to model checkpoint')
-    parser.add_argument('--num_episodes', type=int, default=3,
-                        help='Number of evaluation episodes')
-    parser.add_argument('--max_timesteps', type=int, default=700,
-                        help='Maximum timesteps per episode')
-    parser.add_argument('--device', type=str, default='cuda',
-                        help='Device for inference')
-    parser.add_argument('--camera_names', nargs='+', default=['r_vis', 'top', 'front'],
-                        help='Camera names to use')
-    parser.add_argument('--num_queries', type=int, default=16,
-                        help='Policy query frequency (timesteps)')
-    parser.add_argument('--obs_horizon', type=int, default=2,
-                        help='Observation horizon')
-    parser.add_argument('--no_video', action='store_true',
-                        help='Do not save episode videos')
-
-    args = parser.parse_args()
-
-    # Load model
-    print(f"Loading model from {args.ckpt_path}...")
-    agent = load_checkpoint(args.ckpt_path, device=args.device)
-
-    # Evaluate
-    evaluate_policy(
-        agent=agent,
-        num_episodes=args.num_episodes,
-        max_timesteps=args.max_timesteps,
-        device=args.device,
-        camera_names=args.camera_names,
-        num_queries=args.num_queries,
-        obs_horizon=args.obs_horizon,
-        save_video=not args.no_video
-    )
-
-
-if __name__ == '__main__':
-    main()
diff --git a/roboimi/demos/vla_scripts/eval_vla.py b/roboimi/demos/vla_scripts/eval_vla.py
new file mode 100644
index 0000000..225fe4e
--- /dev/null
+++ b/roboimi/demos/vla_scripts/eval_vla.py
@@ -0,0 +1,328 @@
+"""
+VLA Policy Evaluation Script (Hydra-based)
+
+This script evaluates a trained Vision-Language-Action (VLA) policy
+in the MuJoCo simulation environment.
+
+Usage:
+    python roboimi/demos/vla_scripts/eval_vla.py
+    python roboimi/demos/vla_scripts/eval_vla.py eval.ckpt_path=checkpoints/vla_model_step_8000.pt eval.num_episodes=5
+    python roboimi/demos/vla_scripts/eval_vla.py eval.use_smoothing=true eval.smooth_alpha=0.5
+"""
+
+import sys
+import os
+import json
+import logging
+import torch
+import numpy as np
+import hydra
+from pathlib import Path
+from typing import Dict, List
+from tqdm import tqdm
+from omegaconf import DictConfig, OmegaConf
+from hydra.utils import instantiate
+
+from roboimi.envs.double_pos_ctrl_env import make_sim_env
+from roboimi.utils.act_ex_utils import sample_transfer_pose
+from einops import rearrange
+
+# Ensure correct import path
+sys.path.append(os.getcwd())
+
+log = logging.getLogger(__name__)
+
+
+class VLAEvaluator:
+    """
+    VLA Policy Evaluator for MuJoCo Simulation
+    """
+
+    def __init__(
+        self,
+        agent: torch.nn.Module,
+        device: str = 'cuda',
+        camera_names: List[str] = ['r_vis', 'top', 'front'],
+        num_queries: int = 1,
+        obs_horizon: int = 2,
+        pred_horizon: int = 16,
+        use_smoothing: bool = False,
+        smooth_method: str = 'ema',
+        smooth_alpha: float = 0.3
+    ):
+        self.agent = agent.to(device)
+        self.device = device
+        self.camera_names = camera_names
+        self.num_queries = num_queries
+        self.obs_horizon = obs_horizon
+        self.pred_horizon = pred_horizon
+
+        # Action smoothing
+        self.use_smoothing = use_smoothing
+        self.smooth_method = smooth_method
+        self.smooth_alpha = smooth_alpha
+        self.smoother = ActionSmoother(
+            action_dim=16,
+            method=smooth_method,
+            alpha=smooth_alpha
+        ) if use_smoothing else None
+
+        # Observation buffer for obs_horizon
+        self.obs_buffer = {
+            'images': {cam: [] for cam in camera_names},
+            'qpos': []
+        }
+        self.cached_actions = None
+        self.query_step = 0
+
+    def reset(self):
+        """Reset evaluator state"""
+        self.obs_buffer = {
+            'images': {cam: [] for cam in self.camera_names},
+            'qpos': []
+        }
+        self.cached_actions = None
+        self.query_step = 0
+        if self.smoother is not None:
+            self.smoother.reset()
+
+    def _get_image_dict(self, obs: Dict) -> Dict[str, torch.Tensor]:
+        images = {}
+        for cam_name in self.camera_names:
+            img = obs['images'][cam_name]
+            img = rearrange(img, 'h w c -> c h w')
+            img = torch.from_numpy(img / 255.0).float()
+            images[cam_name] = img
+
+        image_dict = {}
+        for cam_name in self.camera_names:
+            cam_images = self.obs_buffer['images'][cam_name]
+            cam_images.append(images[cam_name])
+
+            while len(cam_images) < self.obs_horizon:
+                cam_images.insert(0, cam_images[0])
+
+            if len(cam_images) > self.obs_horizon:
+                cam_images = cam_images[-self.obs_horizon:]
+
+            img_tensor = torch.stack(cam_images, dim=0).unsqueeze(0)
+            image_dict[cam_name] = img_tensor
+
+            self.obs_buffer['images'][cam_name] = cam_images[-self.obs_horizon:]
+
+        return image_dict
+
+    def _get_qpos_dict(self, obs: Dict) -> torch.Tensor:
+        qpos = obs['qpos']
+        qpos = torch.from_numpy(qpos).float()
+
+        self.obs_buffer['qpos'].append(qpos)
+
+        while len(self.obs_buffer['qpos']) < self.obs_horizon:
+            self.obs_buffer['qpos'].insert(0, self.obs_buffer['qpos'][0])
+
+        if len(self.obs_buffer['qpos']) > self.obs_horizon:
+            self.obs_buffer['qpos'] = self.obs_buffer['qpos'][-self.obs_horizon:]
+
+        qpos_tensor = torch.stack(self.obs_buffer['qpos'], dim=0).unsqueeze(0)
+        return qpos_tensor
+
+    @torch.no_grad()
+    def predict_action(self, obs: Dict) -> np.ndarray:
+        images = self._get_image_dict(obs)
+        qpos = self._get_qpos_dict(obs)
+
+        if self.cached_actions is None or self.query_step % self.num_queries == 0:
+            images = {k: v.to(self.device) for k, v in images.items()}
+            qpos = qpos.to(self.device)
+
+            predicted_actions = self.agent.predict_action(
+                images=images,
+                proprioception=qpos
+            )
+
+            self.cached_actions = predicted_actions.squeeze(0).cpu().numpy()
+            self.query_step = 0
+
+        raw_action = self.cached_actions[self.query_step]
+        self.query_step += 1
+
+        if self.smoother is not None:
+            raw_action = self.smoother.smooth(raw_action)
+
+        return raw_action
+
+
+class ActionSmoother:
+    """Action smoothing for smoother execution"""
+
+    def __init__(self, action_dim: int, method: str = 'ema', alpha: float = 0.3):
+        self.action_dim = action_dim
+        self.method = method
+        self.alpha = alpha
+        self.prev_action = None
+
+    def smooth(self, action: np.ndarray) -> np.ndarray:
+        if self.method == 'ema':
+            if self.prev_action is None:
+                smoothed = action
+            else:
+                smoothed = self.alpha * action + (1 - self.alpha) * self.prev_action
+            self.prev_action = smoothed
+            return smoothed
+        else:
+            return action
+
+    def reset(self):
+        self.prev_action = None
+
+
+def load_checkpoint(
+    ckpt_path: str,
+    agent_cfg: DictConfig,
+    device: str = 'cuda'
+) -> torch.nn.Module:
+    """
+    Load trained VLA model from checkpoint using Hydra agent config.
+
+    Args:
+        ckpt_path: Path to checkpoint file (.pt)
+        agent_cfg: Hydra agent config for instantiation
+        device: Device to load model on
+
+    Returns:
+        Loaded VLAAgent model
+    """
+    from pathlib import Path as PathLib
+
+    ckpt_path = PathLib(ckpt_path).absolute()
+    if not ckpt_path.exists():
+        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")
+
+    log.info(f"Loading checkpoint from {ckpt_path}")
+    checkpoint = torch.load(ckpt_path, map_location=device, weights_only=False)
+    log.info(f"Checkpoint keys: {checkpoint.keys()}")
+
+    # Instantiate agent from Hydra config
+    log.info("Instantiating agent from config...")
+    agent = instantiate(agent_cfg)
+
+    # Load model state
+    if 'model_state_dict' in checkpoint:
+        agent.load_state_dict(checkpoint['model_state_dict'])
+        log.info(f"✅ Model state loaded (step: {checkpoint.get('step', 'unknown')})")
+    elif 'state_dict' in checkpoint:
+        agent.load_state_dict(checkpoint['state_dict'])
+        log.info("✅ Model state loaded")
+    else:
+        agent.load_state_dict(checkpoint)
+        log.info("✅ Model state loaded")
+
+    # Load dataset statistics for denormalization
+    stats_path = ckpt_path.parent / 'dataset_stats.json'
+    if stats_path.exists():
+        with open(stats_path, 'r') as f:
+            stats = json.load(f)
+        agent.action_mean = np.array(stats['action_mean'])
+        agent.action_std = np.array(stats['action_std'])
+        agent.qpos_mean = np.array(stats['qpos_mean'])
+        agent.qpos_std = np.array(stats['qpos_std'])
+        log.info("✅ Dataset statistics loaded for denormalization")
+    else:
+        log.warning(f"⚠️  {stats_path} not found. Actions will not be denormalized!")
+        agent.action_mean = None
+        agent.action_std = None
+
+    agent.eval()
+    agent.to(device)
+
+    log.info(f"✅ Model loaded successfully on {device}")
+    return agent
+
+
+@hydra.main(version_base=None, config_path="../../vla/conf", config_name="config")
+def main(cfg: DictConfig):
+    """
+    VLA Evaluation Script with Hydra Configuration.
+
+    All eval parameters come from vla/conf/eval/eval.yaml, merged into cfg under the `eval` key.
+    Override on command line: python eval_vla.py eval.ckpt_path=... eval.num_episodes=5
+    """
+
+    # Print configuration
+    print("=" * 80)
+    print("VLA Evaluation Configuration:")
+    print("=" * 80)
+    print(OmegaConf.to_yaml(cfg))
+    print("=" * 80)
+
+    eval_cfg = cfg.eval
+    device = eval_cfg.device
+    camera_names = list(eval_cfg.camera_names)
+
+    # Load model
+    log.info(f"🚀 Loading model from {eval_cfg.ckpt_path}...")
+    agent = load_checkpoint(
+        ckpt_path=eval_cfg.ckpt_path,
+        agent_cfg=cfg.agent,
+        device=device
+    )
+
+    # Create evaluator
+    evaluator = VLAEvaluator(
+        agent=agent,
+        device=device,
+        camera_names=camera_names,
+        num_queries=eval_cfg.num_queries,
+        obs_horizon=eval_cfg.obs_horizon,
+        use_smoothing=eval_cfg.use_smoothing,
+        smooth_method=eval_cfg.smooth_method,
+        smooth_alpha=eval_cfg.smooth_alpha
+    )
+
+    # Create environment
+    env = make_sim_env(eval_cfg.task_name)
+
+    # Run episodes
+    for episode_idx in range(eval_cfg.num_episodes):
+        print(f"\n{'='*60}")
+        print(f"Episode {episode_idx + 1}/{eval_cfg.num_episodes}")
+        print(f"{'='*60}\n")
+
+        box_pos = sample_transfer_pose()
+        env.reset(box_pos)
+        evaluator.reset()
+
+        success = False
+        success_timestep = 0
+
+        with torch.inference_mode():
+            for t in tqdm(range(eval_cfg.max_timesteps), desc=f"Episode {episode_idx + 1}"):
+                obs = env._get_image_obs()
+                qpos_obs = env._get_qpos_obs()
+                obs['qpos'] = qpos_obs['qpos']
+
+                action = evaluator.predict_action(obs)
+                env.step_jnt(action)
+
+                env.render()
+
+                if env.rew == 1.0:
+                    success = True
+                    success_timestep = t
+                    print(f"\n✅ Task completed at timestep {t}!")
+                    break
+
+        print(f"\nEpisode {episode_idx + 1} Summary:")
+        print(f"  Success: {success}")
+        if success:
+            print(f"  Success Timestep: {success_timestep}")
+        print(f"  Length: {t + 1} timesteps")
+
+    print(f"\n{'='*60}")
+    print("Evaluation complete!")
+    print(f"{'='*60}\n")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/roboimi/vla/RESNET_TRAINING_GUIDE.md b/roboimi/vla/RESNET_TRAINING_GUIDE.md
deleted file mode 100644
index 8071d4f..0000000
--- a/roboimi/vla/RESNET_TRAINING_GUIDE.md
+++ /dev/null
@@ -1,238 +0,0 @@
-# ResNet VLA Training Guide
-
-This guide explains how to train the VLA agent with ResNet backbone and action_dim=16, obs_dim=16.
-
-## Configuration Overview
-
-### 1. Backbone Configuration
-**File**: `roboimi/vla/conf/backbone/resnet.yaml`
-- Model: microsoft/resnet-18
-- Output dim: 1024 (512 channels × 2 from SpatialSoftmax)
-- Frozen by default for faster training
-
-### 2. Agent Configuration
-**File**: `roboimi/vla/conf/agent/resnet_diffusion.yaml`
-- Vision backbone: ResNet-18 with SpatialSoftmax
-- Action dimension: 16
-- Observation dimension: 16
-- Prediction horizon: 16 steps
-- Observation horizon: 2 steps
-- Diffusion steps: 100
-- Number of cameras: 2
-
-### 3. Dataset Configuration
-**File**: `roboimi/vla/conf/data/resnet_dataset.yaml`
-- Dataset class: RobotDiffusionDataset
-- Prediction horizon: 16
-- Observation horizon: 2
-- Camera names: [r_vis, top]
-- Normalization: gaussian (mean/std)
-
-### 4. Training Configuration
-**File**: `roboimi/vla/conf/config.yaml`
-- Batch size: 8
-- Learning rate: 1e-4
-- Max steps: 10000
-- Log frequency: 100 steps
-- Save frequency: 1000 steps
-- Device: cuda
-- Num workers: 4
-
-## Prerequisites
-
-### 1. Prepare Dataset
-Your dataset should be organized as:
-```
-/path/to/your/dataset/
-├── episode_0.hdf5
-├── episode_1.hdf5
-├── ...
-└── data_stats.pkl
-```
-
-Each HDF5 file should contain:
-```
-episode_N.hdf5
-├── action              # (T, 16) float32
-└── observations/
-    ├── qpos           # (T, 16) float32
-    └── images/
-        ├── r_vis/     # (T, H, W, 3) uint8
-        └── top/       # (T, H, W, 3) uint8
-```
-
-### 2. Generate Dataset Statistics
-Create `data_stats.pkl` with:
-```python
-import pickle
-import numpy as np
-
-stats = {
-    'action': {
-        'mean': np.zeros(16),
-        'std': np.ones(16)
-    },
-    'qpos': {
-        'mean': np.zeros(16),
-        'std': np.ones(16)
-    }
-}
-
-with open('/path/to/your/dataset/data_stats.pkl', 'wb') as f:
-    pickle.dump(stats, f)
-```
-
-Or use the provided script:
-```bash
-python -m roboimi.vla.scripts.calculate_stats --dataset_dir /path/to/your/dataset
-```
-
-## Usage
-
-### 1. Update Dataset Path
-Edit `roboimi/vla/conf/data/resnet_dataset.yaml`:
-```yaml
-dataset_dir: "/path/to/your/dataset"  # CHANGE THIS
-camera_names:
-  - r_vis  # CHANGE TO YOUR CAMERA NAMES
-  - top
-```
-
-### 2. Run Training
-```bash
-# Basic training
-python roboimi/demos/vla_scripts/train_vla.py
-
-# Override configurations
-python roboimi/demos/vla_scripts/train_vla.py train.batch_size=16
-python roboimi/demos/vla_scripts/train_vla.py train.device=cpu
-python roboimi/demos/vla_scripts/train_vla.py train.max_steps=20000
-python roboimi/demos/vla_scripts/train_vla.py data.dataset_dir=/custom/path
-
-# Debug mode (CPU, small batch, few steps)
-python roboimi/demos/vla_scripts/train_vla.py \
-    train.device=cpu \
-    train.batch_size=2 \
-    train.max_steps=10 \
-    train.num_workers=0
-```
-
-### 3. Monitor Training
-Checkpoints are saved to:
-- `checkpoints/vla_model_step_1000.pt` - Periodic checkpoints
-- `checkpoints/vla_model_best.pt` - Best model (lowest loss)
-- `checkpoints/vla_model_final.pt` - Final model
-
-## Architecture Details
-
-### Data Flow
-1. **Input**: Images from multiple cameras + proprioception (qpos)
-2. **Vision Encoder**: ResNet-18 → SpatialSoftmax → (B, T, 1024) per camera
-3. **Feature Concatenation**: All cameras + qpos → Global conditioning
-4. **Diffusion Policy**: 1D U-Net predicts noise on action sequences
-5. **Output**: Clean action sequence (B, 16, 16)
-
-### Training Process
-1. Sample random timestep t from [0, 100]
-2. Add noise to ground truth actions
-3. Predict noise using vision + proprioception conditioning
-4. Compute MSE loss between predicted and actual noise
-5. Backpropagate and update weights
-
-### Inference Process
-1. Extract visual features from current observation
-2. Start with random noise action sequence
-3. Iteratively denoise over 10 steps (DDPM scheduler)
-4. Return clean action sequence
-
-## Common Issues
-
-### Issue: Out of Memory
-**Solution**: Reduce batch size or use CPU
-```bash
-python train_vla.py train.batch_size=4 train.device=cpu
-```
-
-### Issue: Dataset not found
-**Solution**: Check dataset_dir path in config
-```bash
-python train_vla.py data.dataset_dir=/absolute/path/to/dataset
-```
-
-### Issue: Camera names mismatch
-**Solution**: Update camera_names in data config
-```yaml
-# roboimi/vla/conf/data/resnet_dataset.yaml
-camera_names:
-  - your_camera_1
-  - your_camera_2
-```
-
-### Issue: data_stats.pkl missing
-**Solution**: Generate statistics file
-```bash
-python -m roboimi.vla.scripts.calculate_stats --dataset_dir /path/to/dataset
-```
-
-## Model Files Created
-
-```
-roboimi/vla/
-├── conf/
-│   ├── config.yaml (UPDATED)
-│   ├── backbone/
-│   │   └── resnet.yaml (NEW)
-│   ├── agent/
-│   │   └── resnet_diffusion.yaml (NEW)
-│   └── data/
-│       └── resnet_dataset.yaml (NEW)
-├── models/
-│   └── backbones/
-│       ├── __init__.py (UPDATED - added resnet export)
-│       └── resnet.py (EXISTING)
-└── demos/vla_scripts/
-    └── train_vla.py (REWRITTEN)
-```
-
-## Next Steps
-
-1. **Prepare your dataset** in the required HDF5 format
-2. **Update dataset_dir** in `roboimi/vla/conf/data/resnet_dataset.yaml`
-3. **Run training** with `python roboimi/demos/vla_scripts/train_vla.py`
-4. **Monitor checkpoints** in `checkpoints/` directory
-5. **Evaluate** the trained model using the best checkpoint
-
-## Advanced Configuration
-
-### Use Different ResNet Variant
-Edit `roboimi/vla/conf/agent/resnet_diffusion.yaml`:
-```yaml
-vision_backbone:
-  model_name: "microsoft/resnet-50"  # or resnet-34, resnet-101
-```
-
-### Adjust Diffusion Steps
-```yaml
-# More steps = better quality, slower training
-diffusion_steps: 200  # default: 100
-```
-
-### Change Horizons
-```yaml
-pred_horizon: 32  # Predict more future steps
-obs_horizon: 4    # Use more history
-```
-
-### Multi-GPU Training
-```bash
-# Use CUDA device 1
-python train_vla.py train.device=cuda:1
-
-# For multi-GPU, use torch.distributed (requires code modification)
-```
-
-## References
-
-- ResNet Paper: https://arxiv.org/abs/1512.03385
-- Diffusion Policy: https://diffusion-policy.cs.columbia.edu/
-- VLA Framework Documentation: See CLAUDE.md in project root
diff --git a/roboimi/vla/VLA_EVALUATION_GUIDE.md b/roboimi/vla/VLA_EVALUATION_GUIDE.md
deleted file mode 100644
index 655a6a3..0000000
--- a/roboimi/vla/VLA_EVALUATION_GUIDE.md
+++ /dev/null
@@ -1,239 +0,0 @@
-# VLA Evaluation Guide
-
-This guide explains how to evaluate a trained Vision-Language-Action (VLA) policy in the MuJoCo simulation environment.
-
-## Prerequisites
-
-1. **Trained Model**: Train your VLA model first using `train_vla.py`
-2. **Checkpoints**: Ensure you have saved model checkpoints in `checkpoints/` directory
-3. **Dependencies**: Install required dependencies:
-   ```bash
-   pip install opencv-python tqdm
-   ```
-
-## Quick Start
-
-### Basic Evaluation
-
-```bash
-# Evaluate with default settings (3 episodes)
-python roboimi/demos/eval_vla.py \
-    --ckpt_path checkpoints/vla_model_best.pt
-
-# Evaluate with custom settings
-python roboimi/demos/eval_vla.py \
-    --ckpt_path checkpoints/vla_model_step_5000.pt \
-    --num_episodes 5 \
-    --max_timesteps 700 \
-    --camera_names r_vis top angle \
-    --num_queries 1 \
-    --obs_horizon 2
-```
-
-### Parameters
-
-| Parameter | Description | Default |
-|-----------|-------------|---------|
-| `--ckpt_path` | Path to model checkpoint (.pt file) | Required |
-| `--num_episodes` | Number of evaluation episodes | 3 |
-| `--max_timesteps` | Maximum timesteps per episode | 700 |
-| `--device` | Device for inference (`cuda` or `cpu`) | `cuda` |
-| `--camera_names` | Camera names to use (space-separated) | `r_vis top` |
-| `--num_queries` | Policy query frequency (every N timesteps) | 1 |
-| `--obs_horizon` | Observation history length | 2 |
-| `--no_video` | Disable video saving | False |
-
-## Usage Details
-
-### Policy Query Frequency
-
-The `--num_queries` parameter controls how often the policy is queried:
-
-- `--num_queries 1`: Query every timestep (default, most accurate)
-- `--num_queries 4`: Query every 4 timesteps (faster, but uses cached actions)
-
-When using cached actions (num_queries > 1), the policy predicts a chunk of actions (pred_horizon=16), and these actions are executed sequentially until the next query.
-
-### Camera Selection
-
-Available cameras depend on your environment:
-- `r_vis`: Right arm RealSense camera
-- `top`: Top-down view camera
-- `angle`: Angled view camera
-
-Use `--camera_names` to specify which cameras to use:
-```bash
---camera_names r_vis top      # Use 2 cameras
---camera_names r_vis top angle # Use all 3 cameras
-```
-
-### Observation Horizon
-
-The `--obs_horizon` parameter determines how many past observations to use as context:
-
-```bash
---obs_horizon 1  # Use only current observation
---obs_horizon 2  # Use current + 1 past observation (default)
---obs_horizon 4  # Use current + 3 past observations
-```
-
-**Note**: Must match the value used during training.
-
-## Output
-
-### Console Output
-
-During evaluation, you'll see:
-
-```
-============================================================
-Episode 1/3
-============================================================
-
-Episode 1: 100%|████████████████████| 700/700 [02:30<00:00,  4.64it/s]
-
-✅ Task completed at timestep 453!
-
-Episode 1 Summary:
-  Total Reward: 1.0000
-  Max Reward: 1.0000
-  Length: 453 timesteps
-  Video saved: outputs/eval_vla_episode_0.mp4
-```
-
-### Video Output
-
-Videos are saved to `outputs/eval_vla_episode_{N}.mp4` showing the robot's execution.
-
-### Metrics
-
-- **Total Reward**: Sum of rewards throughout the episode
-- **Max Reward**: Maximum reward achieved (1.0 = success)
-- **Length**: Number of timesteps executed
-
-## Action Smoothing
-
-The evaluator includes EMA (Exponential Moving Average) smoothing by default to reduce jitter:
-
-```python
-# Default smoothing parameters
-smooth_method = 'ema'
-smooth_alpha = 0.3  # Lower = more smoothing
-```
-
-To disable or modify smoothing, edit the `evaluate_policy()` call in `eval_vla.py`:
-
-```python
-evaluator = VLAEvaluator(
-    agent=agent,
-    use_smoothing=False,  # Disable smoothing
-    # or
-    smooth_method='moving_avg',  # Use different method
-    smooth_alpha=0.5  # Adjust smoothing strength
-)
-```
-
-## Troubleshooting
-
-### Issue: Checkpoint not found
-
-```
-FileNotFoundError: Checkpoint not found: checkpoints/vla_model_best.pt
-```
-
-**Solution**: Ensure you've trained the model and checkpoints exist:
-```bash
-ls -la checkpoints/
-# Should show: vla_model_best.pt, vla_model_final.pt, etc.
-```
-
-### Issue: CUDA out of memory
-
-**Solution**: Use CPU for inference:
-```bash
-python eval_vla.py --ckpt_path checkpoints/vla_model_best.pt --device cpu
-```
-
-### Issue: Camera names don't match
-
-**Solution**: Check your HDF5 files for available cameras:
-```python
-import h5py
-with h5py.File('roboimi/demos/dataset/sim_transfer/episode_0.hdf5', 'r') as f:
-    print(list(f['observations/images'].keys()))
-    # Output: ['angle', 'r_vis', 'top']
-```
-
-Then use the correct camera names in your eval command.
-
-### Issue: Mismatched obs_horizon
-
-```
-RuntimeError: Tensor shape mismatch
-```
-
-**Solution**: Ensure `--obs_horizon` matches the training config (`data.obs_horizon`).
-
-## Advanced Usage
-
-### Custom Evaluation Script
-
-You can also use the evaluator in your own scripts:
-
-```python
-from roboimi.demos.eval_vla import VLAEvaluator, load_checkpoint
-from roboimi.envs.double_pos_ctrl_env import make_sim_env
-
-# Load model
-agent = load_checkpoint('checkpoints/vla_model_best.pt')
-
-# Create evaluator
-evaluator = VLAEvaluator(
-    agent=agent,
-    device='cuda',
-    camera_names=['r_vis', 'top'],
-    num_queries=1,
-    obs_horizon=2
-)
-
-# Create environment
-env = make_sim_env('sim_transfer')
-env.reset()
-evaluator.reset()
-
-# Run episode
-obs = env._get_image_obs()
-obs['qpos'] = env._get_qpos_obs()['qpos']
-
-# Predict and execute action
-action = evaluator.predict_action(obs)
-env.step_jnt(action)
-```
-
-### Batch Evaluation
-
-Evaluate multiple checkpoints:
-
-```bash
-for ckpt in checkpoints/vla_model_step_*.pt; do
-    echo "Evaluating $ckpt"
-    python roboimi/demos/eval_vla.py \
-        --ckpt_path "$ckpt" \
-        --num_episodes 1 \
-        --no_video
-done
-```
-
-## Next Steps
-
-1. **Train your model**: See [RESNET_TRAINING_GUIDE.md](roboimi/vla/RESNET_TRAINING_GUIDE.md)
-2. **Evaluate performance**: Use this evaluation script
-3. **Analyze results**: Compare different checkpoints
-4. **Deploy to real robot**: Adapt the evaluator for real robot control
-
-## References
-
-- Training Guide: [roboimi/vla/RESNET_TRAINING_GUIDE.md](roboimi/vla/RESNET_TRAINING_GUIDE.md)
-- Project Documentation: [CLAUDE.md](CLAUDE.md)
-- Original ACT Paper: https://arxiv.org/abs/2304.13705
-- Diffusion Policy: https://diffusion-policy.cs.columbia.edu/
diff --git a/roboimi/vla/conf/agent/base_siglip.yaml b/roboimi/vla/conf/agent/base_siglip.yaml
deleted file mode 100644
index e9231b4..0000000
--- a/roboimi/vla/conf/agent/base_siglip.yaml
+++ /dev/null
@@ -1,25 +0,0 @@
-# @package agent
-_target_: roboimi.vla.agent.VLAAgent
-
-# --- Real Vision Backbone ---
-backbone:
-  _target_: roboimi.vla.models.backbones.siglip.SigLIPBackbone
-  # Google SigLIP (SOTA Vision Encoder)
-  # 第一次运行会自动下载 (~1.5GB)
-  model_name: "google/siglip-so400m-patch14-384"
-  freeze: true  # 初始阶段冻结视觉层，只训练 Head
-  embed_dim: 1152  # SigLIP so400m-patch14-384 的 hidden_size
-
-# --- Adapter ---
-projector:
-  _target_: roboimi.vla.models.projectors.mlp.MLPProjector
-  # 自动读取 SigLIP 的 1152 维
-  input_dim: ${..backbone.embed_dim}
-  output_dim: 384  # 压缩到 384 或 512 给 Policy 用
-
-# --- Policy Head ---
-head:
-  _target_: roboimi.vla.models.heads.debug.DebugHead
-  input_dim: ${..projector.output_dim}
-  action_dim: 16
-  chunk_size: 16
\ No newline at end of file
diff --git a/roboimi/vla/conf/agent/debug_vla.yaml b/roboimi/vla/conf/agent/debug_vla.yaml
deleted file mode 100644
index f8962ab..0000000
--- a/roboimi/vla/conf/agent/debug_vla.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-_target_: roboimi.vla.agent.VLAAgent
-
-# 1. Backbone Configuration
-backbone:
-  _target_: roboimi.vla.models.backbones.debug.DebugBackbone
-  embed_dim: 768  # Variable A
-  seq_len: 10
-
-# 2. Projector Configuration
-projector:
-  _target_: roboimi.vla.models.projectors.mlp.MLPProjector
-  # Dependency Injection via Interpolation:
-  # Takes 'embed_dim' from the sibling 'backbone' config above.
-  input_dim: ${..backbone.embed_dim} 
-  output_dim: 512 # Variable B (The bottleneck size)
-
-# 3. Head Configuration
-head:
-  _target_: roboimi.vla.models.heads.debug.DebugHead
-  # Dependency Injection via Interpolation:
-  # Takes 'output_dim' from the sibling 'projector' config above.
-  input_dim: ${..projector.output_dim}
-  action_dim: 7  # (x,y,z, r,p,y, gripper)
-  chunk_size: 16
\ No newline at end of file
diff --git a/roboimi/vla/conf/agent/default.yaml b/roboimi/vla/conf/agent/default.yaml
deleted file mode 100644
index 9ddde09..0000000
--- a/roboimi/vla/conf/agent/default.yaml
+++ /dev/null
@@ -1,30 +0,0 @@
-# @package _global_
-defaults:
-  # 1. 将 backbone 配置挂载到 agent.vlm_backbone 节点
-  - /backbone@vlm_backbone: siglip
-  
-  # 2. 将 projector 配置挂载到 agent.img_projector 节点 (新增)
-  - /projector@img_projector: mlp
-  
-  # 3. 将 head 配置挂载到 agent.action_head 节点
-  - /head@action_head: diffusion
-  
-  # 4. 允许当前文件覆盖上述配置
-  - _self_
-
-_target_: roboimi.vla.agent.VLAAgent
-
-# 核心超参数：单一真值源
-state_dim: 14
-embed_dim: 512
-
-# --- 参数一致性绑定 (Interpolation) ---
-
-# 强制 Projector 输出维度 = Agent 嵌入维度
-img_projector:
-  input_dim: ${..vlm_backbone.output_dim} # 自动获取 backbone 的输出维度
-  output_dim: ${..embed_dim}              # 引用上方的 embed_dim
-
-# 强制 Head 输入维度 = Agent 嵌入维度
-action_head:
-  input_dim: ${..embed_dim}               # 引用上方的 embed_dim
\ No newline at end of file
diff --git a/roboimi/vla/conf/agent/resnet_diffusion.yaml b/roboimi/vla/conf/agent/resnet_diffusion.yaml
index 61d76a2..4851b5f 100644
--- a/roboimi/vla/conf/agent/resnet_diffusion.yaml
+++ b/roboimi/vla/conf/agent/resnet_diffusion.yaml
@@ -8,15 +8,18 @@ vision_backbone:
   freeze: true
 
 # Action and Observation Dimensions
-action_dim: 16  # Robot action dimension
-obs_dim: 16     # Proprioception dimension (qpos)
+action_dim: 16
+obs_dim: 16
 
-# Prediction Horizons
-pred_horizon: 16   # How many future actions to predict
-obs_horizon: 2     # How many historical observations to use
+# Prediction and Observation Horizons
+pred_horizon: 16
+obs_horizon: 2
 
 # Diffusion Parameters
 diffusion_steps: 100  # Number of diffusion timesteps for training
 
 # Camera Configuration
-num_cams: 3  # Number of cameras (e.g., r_vis, top)
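+# num_cams must equal the number of entries in data.camera_names.
+# OmegaConf has no built-in length resolver (`oc.len` does not exist), so deriving it
+# would need a custom resolver via OmegaConf.register_new_resolver; kept in sync by hand here.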
+num_cams: 3  # len(data.camera_names) = 3
diff --git a/roboimi/vla/conf/agent/siglip_diffusion.yaml b/roboimi/vla/conf/agent/siglip_diffusion.yaml
deleted file mode 100644
index cd0089f..0000000
--- a/roboimi/vla/conf/agent/siglip_diffusion.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-# @package agent
-_target_: roboimi.vla.agent.VLAAgent
-
-# 1. Vision
-backbone:
-  _target_: roboimi.vla.models.backbones.siglip.SigLIPBackbone
-  model_name: "google/siglip-so400m-patch14-384"
-  embed_dim: 1152
-  freeze: true
-
-# 2. Adapter
-projector:
-  _target_: roboimi.vla.models.projectors.mlp.MLPProjector
-  input_dim: ${..backbone.embed_dim}
-  output_dim: 256 # 压缩给 Diffusion 用
-
-# 3. Diffusion Policy Head
-head:
-  _target_: roboimi.vla.models.heads.diffusion.DiffusionHead
-  input_dim: ${..projector.output_dim}
-  action_dim: 16
-  chunk_size: 16
-  n_timesteps: 50 # 训练用100，这里调试用50快一点
-  hidden_dim: 256
\ No newline at end of file
diff --git a/roboimi/vla/conf/agent/tiny.yaml b/roboimi/vla/conf/agent/tiny.yaml
deleted file mode 100644
index 83518c4..0000000
--- a/roboimi/vla/conf/agent/tiny.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-# 调试用小模型
-# @package agent
-_target_: roboimi.vla.agent.VLAAgent
-
-# --- 1. Backbone (VLM) ---
-backbone:
-  _target_: roboimi.vla.models.backbones.debug.DebugBackbone
-  embed_dim: 768   # 定义源头维度
-  seq_len: 10
-
-# --- 2. Projector (Adapter) ---
-projector:
-  _target_: roboimi.vla.models.projectors.mlp.MLPProjector
-  # 【关键】依赖注入：自动读取 backbone 的 embed_dim
-  input_dim: ${..backbone.embed_dim}
-  output_dim: 128  # 瓶颈层维度 (Tiny scale)
-
-# --- 3. Head (Policy) ---
-head:
-  _target_: roboimi.vla.models.heads.debug.DebugHead
-  input_dim: ${..projector.output_dim}
-  
-  # 【关键修改】改为 16 以匹配你的 Sim 数据
-  action_dim: 16  
-
-  chunk_size: 16
\ No newline at end of file
diff --git a/roboimi/vla/conf/backbone/clip.yaml b/roboimi/vla/conf/backbone/clip.yaml
deleted file mode 100644
index b6cf693..0000000
--- a/roboimi/vla/conf/backbone/clip.yaml
+++ /dev/null
@@ -1 +0,0 @@
-# CLIP Backbone 配置
diff --git a/roboimi/vla/conf/backbone/resnet.yaml b/roboimi/vla/conf/backbone/resnet.yaml
index 584eddd..487577d 100644
--- a/roboimi/vla/conf/backbone/resnet.yaml
+++ b/roboimi/vla/conf/backbone/resnet.yaml
@@ -2,9 +2,4 @@
 _target_: roboimi.vla.models.backbones.resnet.ResNetBackbone
 
 model_name: "microsoft/resnet-18"
-freeze: true
-
-# Output dimension calculation:
-# ResNet-18 final layer has 512 channels
-# After SpatialSoftmax: 512 * 2 = 1024 (x,y coordinates per channel)
-# output_dim: 1024
+freeze: true
\ No newline at end of file
diff --git a/roboimi/vla/conf/backbone/siglip.yaml b/roboimi/vla/conf/backbone/siglip.yaml
deleted file mode 100644
index 306bd12..0000000
--- a/roboimi/vla/conf/backbone/siglip.yaml
+++ /dev/null
@@ -1,4 +0,0 @@
-_target_: roboimi.vla.models.backbones.SigLIPBackbone
-model_name: "google/siglip-so400m-patch14-384"
-frozen: true
-output_dim: 1152  # SigLIP Large 的特征维度，需显式声明供 Projector 引用
\ No newline at end of file
diff --git a/roboimi/vla/conf/config.yaml b/roboimi/vla/conf/config.yaml
index 0b18727..d724b77 100644
--- a/roboimi/vla/conf/config.yaml
+++ b/roboimi/vla/conf/config.yaml
@@ -1,7 +1,8 @@
 defaults:
-  - _self_
   - agent: resnet_diffusion
   - data: resnet_dataset
+  - eval: eval
+  - _self_
 
 train:
   batch_size: 16       # Batch size for training
diff --git a/roboimi/vla/conf/data/default_dataset.yaml b/roboimi/vla/conf/data/default_dataset.yaml
deleted file mode 100644
index 6b52e13..0000000
--- a/roboimi/vla/conf/data/default_dataset.yaml
+++ /dev/null
@@ -1,16 +0,0 @@
-_target_: roboimi.vla.data.dataset.VLADataset
-dataset_dir: "/path/to/your/roboimi/demos/dataset/collected_data"
-pred_horizon: 16
-obs_horizon: 2
-
-# 这里展示了 Hydra 的嵌套实例化：Transform 作为参数传入
-transform:
-  _target_: roboimi.vla.data.image_transforms.VLAImageProcessor
-  size: [224, 224]
-  mean: [0.5, 0.5, 0.5] # SigLIP/CLIP 常用归一化
-  std: [0.5, 0.5, 0.5]
-
-# 如果需要 Tokenizer
-tokenizer: null 
-#  _target_: roboimi.vla.data.text_processing.SimpleTokenizer
-#  max_length: 77
\ No newline at end of file
diff --git a/roboimi/vla/conf/data/resnet_dataset.yaml b/roboimi/vla/conf/data/resnet_dataset.yaml
index 62b0d5e..73b7435 100644
--- a/roboimi/vla/conf/data/resnet_dataset.yaml
+++ b/roboimi/vla/conf/data/resnet_dataset.yaml
@@ -4,9 +4,9 @@ _target_: roboimi.vla.data.dataset.RobotDiffusionDataset
 # Dataset Directory (CHANGE THIS TO YOUR DATA PATH)
 dataset_dir: "roboimi/demos/dataset/sim_transfer"  # Path to your dataset directory
 
-# Horizon Parameters
-pred_horizon: 16  # Prediction horizon (matches agent.pred_horizon)
-obs_horizon: 2    # Observation horizon (matches agent.obs_horizon)
+# Horizon Parameters — interpolated from the agent config (Hydra/OmegaConf) so both stay consistent
+pred_horizon: ${agent.pred_horizon}
+obs_horizon: ${agent.obs_horizon}
 action_horizon: 8 # Action execution horizon (used during evaluation)
 
 # Camera Names (CHANGE THIS TO MATCH YOUR CAMERAS)
diff --git a/roboimi/vla/conf/data/siglip2.yaml b/roboimi/vla/conf/data/siglip2.yaml
deleted file mode 100644
index 65ec0e9..0000000
--- a/roboimi/vla/conf/data/siglip2.yaml
+++ /dev/null
@@ -1,8 +0,0 @@
-_target_: roboimi.vla.data.dataset.RobotDiffusionDataset
-
-dataset_dir: "/home/d51/workspace/work/robo-imi-act/roboimi/demos/dataset/sim_transfer"
-pred_horizon: 16
-obs_horizon: 1
-action_horizon: 8
-camera_names: ['r_vis', 'top', 'front'] # ['angle', 'r_vis', 'top']
-normalization_type: 'gaussian' # 'min_max' or 'gaussian'
\ No newline at end of file
diff --git a/roboimi/vla/conf/eval/eval.yaml b/roboimi/vla/conf/eval/eval.yaml
new file mode 100644
index 0000000..10456f2
--- /dev/null
+++ b/roboimi/vla/conf/eval/eval.yaml
@@ -0,0 +1,21 @@
+# @package eval
+# Evaluation Configuration
+ckpt_path: "checkpoints/vla_model_best.pt"  # Path to model checkpoint
+num_episodes: 3           # Number of evaluation episodes
+max_timesteps: 700        # Maximum timesteps per episode
+device: ${train.device}   # same device as training
+task_name: "sim_transfer" # Task name for environment creation
+
+# Policy execution — values are interpolated from the agent config to stay consistent
+num_queries: ${agent.pred_horizon}  # re-query the policy after each prediction of pred_horizon steps
+obs_horizon: ${agent.obs_horizon}
+
+# Camera names — interpolated from the data config to stay consistent
+camera_names: ${data.camera_names}
+
+# Action smoothing
+use_smoothing: false
+smooth_method: "ema"
+smooth_alpha: 0.3
+
+
diff --git a/roboimi/vla/conf/head/act.yaml b/roboimi/vla/conf/head/act.yaml
deleted file mode 100644
index e4ecbb0..0000000
--- a/roboimi/vla/conf/head/act.yaml
+++ /dev/null
@@ -1 +0,0 @@
-# ACT-VAE Head 配置
diff --git a/roboimi/vla/conf/head/diffusion.yaml b/roboimi/vla/conf/head/diffusion.yaml
index a442fe5..2934c94 100644
--- a/roboimi/vla/conf/head/diffusion.yaml
+++ b/roboimi/vla/conf/head/diffusion.yaml
@@ -1,7 +1,7 @@
 _target_: roboimi.vla.models.heads.DiffusionActionHead
 
 # 显式声明必填参数
-input_dim: ???         # 【修复】必须存在，等待 agent/default.yaml 填充
+input_dim: ???         # Mandatory value (OmegaConf "???"): must be set by the config that composes this head
 action_dim: 7          
 obs_horizon: 2         
 pred_horizon: 16       
diff --git a/roboimi/vla/conf/train/debug.yaml b/roboimi/vla/conf/train/debug.yaml
deleted file mode 100644
index 3a8f68f..0000000
--- a/roboimi/vla/conf/train/debug.yaml
+++ /dev/null
@@ -1 +0,0 @@
-# Debug 训练超参数
diff --git a/roboimi/vla/conf/train/gpu.yaml b/roboimi/vla/conf/train/gpu.yaml
deleted file mode 100644
index 5f39934..0000000
--- a/roboimi/vla/conf/train/gpu.yaml
+++ /dev/null
@@ -1 +0,0 @@
-# GPU 训练超参数
diff --git a/roboimi/vla/data/image_transform.py b/roboimi/vla/data/image_transform.py
deleted file mode 100644
index 14a3ea1..0000000
--- a/roboimi/vla/data/image_transform.py
+++ /dev/null
@@ -1,75 +0,0 @@
-# 图像预处理
-import torch
-import numpy as np
-import torchvision.transforms as T
-from PIL import Image
-from typing import Union, List
-
-class VLAImageProcessor:
-    """
-    VLA 图像预处理器，专为 SigLIP/CLIP 等 ViT 架构设计。
-    功能：
-    1. Numpy (HWC) -> Tensor (CHW)
-    2. Resize (e.g., 384x384)
-    3. Normalize (SigLIP: mean=0.5, std=0.5)
-    4. Data Augmentation (训练时开启颜色抖动)
-    """
-    def __init__(
-        self, 
-        resolution: int = 384, 
-        mean: List[float] = [0.5, 0.5, 0.5], 
-        std: List[float] = [0.5, 0.5, 0.5],
-        enable_augmentation: bool = True,
-        aug_strength: float = 0.1  # 增强强度，0.1~0.2 比较安全
-    ):
-        self.resolution = resolution
-        self.enable_augmentation = enable_augmentation
-        
-        # --- 1. 基础处理 (所有模式通用) ---
-        # 注意：这里我们分步定义，因为增强通常在 PIL 阶段做比较快
-        self.resize = T.Resize((resolution, resolution), interpolation=T.InterpolationMode.BICUBIC, antialias=True)
-        self.to_tensor = T.ToTensor()
-        self.normalize = T.Normalize(mean=mean, std=std)
-
-        # --- 2. 数据增强 (仅训练用) ---
-        # 机器人学习通常不做 RandomCrop (会丢失绝对坐标信息)，主要做颜色增强
-        if enable_augmentation:
-            self.aug = T.ColorJitter(
-                brightness=aug_strength, 
-                contrast=aug_strength, 
-                saturation=aug_strength, 
-                hue=aug_strength / 2
-            )
-        else:
-            self.aug = torch.nn.Identity()
-
-    def __call__(self, img: Union[np.ndarray, Image.Image, torch.Tensor]) -> torch.Tensor:
-        """
-        Args:
-            img: (H, W, C) uint8 numpy array (from HDF5) OR PIL Image
-        Returns:
-            tensor: (C, H, W) float32, Normalized
-        """
-        # 1. 统一转为 PIL Image (方便做 Resize 和 Jitter)
-        if isinstance(img, np.ndarray):
-            img = Image.fromarray(img)
-        elif isinstance(img, torch.Tensor):
-            # 假设 Tensor 是 CHW，转回 PIL 比较麻烦，通常 HDF5 出来都是 numpy
-            pass 
-
-        # 2. 数据增强 (如果开启)
-        if self.enable_augmentation:
-            img = self.aug(img)
-
-        # 3. 调整尺寸
-        img = self.resize(img)
-
-        # 4. 转张量 & 归一化
-        # ToTensor 会把 [0, 255] -> [0.0, 1.0]
-        tensor = self.to_tensor(img)
-        tensor = self.normalize(tensor)
-
-        return tensor
-
-    def __repr__(self):
-        return f"VLAImageProcessor(res={self.resolution}, aug={self.enable_augmentation})"
\ No newline at end of file
diff --git a/roboimi/vla/data/text_processing.py b/roboimi/vla/data/text_processing.py
deleted file mode 100644
index ecd3c3c..0000000
--- a/roboimi/vla/data/text_processing.py
+++ /dev/null
@@ -1 +0,0 @@
-# 文本 Tokenizer 包装
diff --git a/roboimi/vla/models/backbones/__init__.py b/roboimi/vla/models/backbones/__init__.py
index 2f36dcd..ce1b27e 100644
--- a/roboimi/vla/models/backbones/__init__.py
+++ b/roboimi/vla/models/backbones/__init__.py
@@ -1,10 +1,4 @@
 # Backbone models
-from .siglip import SigLIPBackbone
 from .resnet import ResNetBackbone
-# from .clip import CLIPBackbone
-# from .dinov2 import DinoV2Backbone
 
-__all__ = ["SigLIPBackbone", "ResNetBackbone"]
-
-# from .debug import DebugBackbone
-# __all__ = ["DebugBackbone"]
\ No newline at end of file
+__all__ = ["ResNetBackbone"]
diff --git a/roboimi/vla/models/backbones/siglip.py b/roboimi/vla/models/backbones/siglip.py
deleted file mode 100644
index ef7aa19..0000000
--- a/roboimi/vla/models/backbones/siglip.py
+++ /dev/null
@@ -1,62 +0,0 @@
-# SigLIP Backbone 实现
-import torch
-import torch.nn as nn
-from transformers import AutoModel, AutoProcessor, SiglipVisionModel
-from typing import Dict, Optional
-from roboimi.vla.core.interfaces import VLABackbone
-
-class SigLIPBackbone(VLABackbone):
-    """
-    Wraps Google's SigLIP Vision Encoder.
-    HuggingFace ID example: "google/siglip-so400m-patch14-384"
-    """
-    def __init__(
-        self,
-        model_name: str = "google/siglip-so400m-patch14-384",
-        freeze: bool = True,
-        embed_dim: Optional[int] = None
-    ):
-        super().__init__()
-        print(f"Loading SigLIP: {model_name} ...")
-
-        # 加载视觉部分 (Vision Model only)
-        # 我们不需要 Text Tower，因为 SigLIP 是对齐好的，只用 Vision Tower 抽特征即可
-        self.vision_model = SiglipVisionModel.from_pretrained(model_name)
-
-        # 优先使用配置传入的 embed_dim，否则自动获取
-        if embed_dim is not None:
-            self._embed_dim = embed_dim
-            print(f"✓ Using configured embed_dim: {embed_dim}")
-        else:
-            # 自动获取维度 (SigLIP so400m 通常是 1152)
-            self._embed_dim = self.vision_model.config.hidden_size
-            print(f"✓ Auto-detected embed_dim: {self._embed_dim}")
-        
-        if freeze:
-            self._freeze_parameters()
-
-    def _freeze_parameters(self):
-        print("❄️ Freezing Vision Backbone parameters")
-        for param in self.vision_model.parameters():
-            param.requires_grad = False
-        self.vision_model.eval()
-
-    def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor:
-        """
-        Args:
-            obs['image']: (B, C, H, W) normalized tensor
-        Returns:
-            features: (B, Seq_Len, Embed_Dim)
-        """
-        images = obs['image']
-        
-        # SigLIP 期望输入是 (B, C, H, W)
-        # HuggingFace 的 VisionModel 输出是一个 BaseModelOutputWithPooling
-        # last_hidden_state shape: (B, Num_Patches, Embed_Dim)
-        outputs = self.vision_model(pixel_values=images)
-        
-        return outputs.last_hidden_state
-
-    @property
-    def embed_dim(self) -> int:
-        return self._embed_dim
\ No newline at end of file
diff --git a/roboimi/vla/models/heads/__init__.py b/roboimi/vla/models/heads/__init__.py
index 4260dba..7a32179 100644
--- a/roboimi/vla/models/heads/__init__.py
+++ b/roboimi/vla/models/heads/__init__.py
@@ -1,8 +1,4 @@
 # # Action Head models
 from .diffusion import ConditionalUnet1D
-# from .act import ACTHead
 
 __all__ = ["ConditionalUnet1D"]
-
-# from .debug import DebugHead
-# __all__ = ["DebugHead"]
\ No newline at end of file