debug(inference): 添加推理阶段qpos归一化

This commit is contained in:
gouhanke
2026-02-06 09:00:44 +08:00
parent b0a944f7aa
commit 66009473ad
7 changed files with 859 additions and 121 deletions

532
roboimi/demos/eval_vla.py Normal file
View File

@@ -0,0 +1,532 @@
"""
VLA Policy Evaluation Script
This script evaluates a trained Vision-Language-Action (VLA) policy
in the MuJoCo simulation environment.
Usage:
python roboimi/demos/eval_vla.py --ckpt_path checkpoints/vla_model_best.pt --num_episodes 3
"""
import argparse
from pathlib import Path
from typing import Dict, List, Optional

import numpy as np
import torch
from einops import rearrange
from tqdm import tqdm

from roboimi.envs.double_pos_ctrl_env import make_sim_env
from roboimi.utils.act_ex_utils import sample_transfer_pose
class VLAEvaluator:
    """
    VLA Policy Evaluator for MuJoCo Simulation.

    Maintains a rolling buffer of the last `obs_horizon` observations,
    queries the agent every `num_queries` timesteps for a chunk of
    `pred_horizon` actions, and replays the cached chunk in between queries.
    """
    def __init__(
        self,
        agent: torch.nn.Module,
        device: str = 'cuda',
        camera_names: Optional[List[str]] = None,
        num_queries: int = 1,
        obs_horizon: int = 2,
        pred_horizon: int = 16,
        use_smoothing: bool = False,
        smooth_method: str = 'ema',
        smooth_alpha: float = 0.3
    ):
        """
        Args:
            agent: Trained VLAAgent
            device: Device for inference
            camera_names: List of camera names to use (default: ['r_vis', 'top'])
            num_queries: How often to query the policy (in timesteps)
            obs_horizon: Number of observations to use as context
            pred_horizon: Number of future actions to predict
            use_smoothing: Whether to apply action smoothing
            smooth_method: Smoothing method ('ema', 'moving_avg', 'lowpass')
            smooth_alpha: Smoothing coefficient
        """
        self.agent = agent.to(device)
        self.device = device
        # Copy the caller's list; a mutable default argument would be shared
        # across all instances.
        self.camera_names = list(camera_names) if camera_names is not None else ['r_vis', 'top']
        self.num_queries = num_queries
        self.obs_horizon = obs_horizon
        self.pred_horizon = pred_horizon
        # Action smoothing
        self.use_smoothing = use_smoothing
        self.smooth_method = smooth_method
        self.smooth_alpha = smooth_alpha
        self.smoother = ActionSmoother(
            action_dim=16,  # NOTE(review): assumes 16-dim actions — confirm against agent config
            method=smooth_method,
            alpha=smooth_alpha
        ) if use_smoothing else None
        # Observation buffer holding the last `obs_horizon` frames per modality
        self.obs_buffer = {
            'images': {cam: [] for cam in self.camera_names},
            'qpos': []
        }
        self.cached_actions = None  # (pred_horizon, action_dim) numpy array, or None
        self.query_step = 0  # index into cached_actions since the last policy query

    def reset(self):
        """Reset evaluator state (call at the start of each episode)."""
        self.obs_buffer = {
            'images': {cam: [] for cam in self.camera_names},
            'qpos': []
        }
        self.cached_actions = None
        self.query_step = 0
        if self.smoother is not None:
            self.smoother.reset()

    def _get_image_dict(self, obs: Dict) -> Dict[str, torch.Tensor]:
        """
        Extract and preprocess images from the observation.

        Args:
            obs: Environment observation dict; obs['images'][cam] is an
                 (H, W, C) uint8 array.

        Returns:
            Dict mapping camera names to image tensors of shape
            (1, obs_horizon, C, H, W), scaled to [0, 1].
        """
        images = {}
        for cam_name in self.camera_names:
            # (H, W, C) -> (C, H, W); np.transpose keeps this block free of
            # the einops dependency while producing the same layout.
            img = np.transpose(obs['images'][cam_name], (2, 0, 1))
            images[cam_name] = torch.from_numpy(img / 255.0).float()
        image_dict = {}
        for cam_name in self.camera_names:
            cam_images = self.obs_buffer['images'][cam_name]
            cam_images.append(images[cam_name])
            # Pad to obs_horizon at episode start by duplicating the first frame
            while len(cam_images) < self.obs_horizon:
                cam_images.insert(0, cam_images[0])
            # Keep only the most recent obs_horizon frames
            if len(cam_images) > self.obs_horizon:
                cam_images = cam_images[-self.obs_horizon:]
            # (obs_horizon, C, H, W) -> (1, obs_horizon, C, H, W)
            image_dict[cam_name] = torch.stack(cam_images, dim=0).unsqueeze(0)
            # Persist the trimmed history for the next timestep
            self.obs_buffer['images'][cam_name] = cam_images[-self.obs_horizon:]
        return image_dict

    def _get_qpos_dict(self, obs: Dict) -> torch.Tensor:
        """
        Extract and preprocess qpos from the observation.

        Args:
            obs: Environment observation dict containing obs['qpos'].

        Returns:
            qpos tensor of shape (1, obs_horizon, obs_dim).
        """
        qpos = torch.from_numpy(obs['qpos']).float()
        self.obs_buffer['qpos'].append(qpos)
        # Pad to obs_horizon at episode start by duplicating the first frame
        while len(self.obs_buffer['qpos']) < self.obs_horizon:
            self.obs_buffer['qpos'].insert(0, self.obs_buffer['qpos'][0])
        # Keep only the most recent obs_horizon frames
        if len(self.obs_buffer['qpos']) > self.obs_horizon:
            self.obs_buffer['qpos'] = self.obs_buffer['qpos'][-self.obs_horizon:]
        # (obs_horizon, obs_dim) -> (1, obs_horizon, obs_dim)
        return torch.stack(self.obs_buffer['qpos'], dim=0).unsqueeze(0)

    @torch.no_grad()
    def predict_action(self, obs: Dict) -> np.ndarray:
        """
        Predict the action for the current timestep.

        Re-queries the policy every `num_queries` steps; in between, actions
        come from the cached predicted chunk.

        Args:
            obs: Current environment observation (must contain 'images' and 'qpos').

        Returns:
            action: numpy array of shape (action_dim,)
        """
        # 1. Update observation buffers and build model inputs
        images = self._get_image_dict(obs)  # Dict[str, (1, obs_horizon, C, H, W)]
        qpos = self._get_qpos_dict(obs)     # (1, obs_horizon, obs_dim)
        # 2. Query the policy when the cache is empty or stale
        if self.cached_actions is None or self.query_step % self.num_queries == 0:
            images = {k: v.to(self.device) for k, v in images.items()}
            qpos = qpos.to(self.device)
            # VLAAgent.predict_action returns (B, pred_horizon, action_dim)
            predicted_actions = self.agent.predict_action(
                images=images,
                proprioception=qpos
            )
            self.cached_actions = predicted_actions.squeeze(0).cpu().numpy()
            self.query_step = 0
        # 3. Take the next action from the cache. Clamp the index so that
        # num_queries > pred_horizon cannot index past the predicted chunk
        # (the last action is repeated until the next query instead of crashing).
        idx = min(self.query_step, len(self.cached_actions) - 1)
        raw_action = self.cached_actions[idx]
        self.query_step += 1
        # 4. Optional smoothing
        if self.smoother is not None:
            raw_action = self.smoother.smooth(raw_action)
        return raw_action
class ActionSmoother:
    """Smooths a stream of actions to reduce jitter during execution."""

    def __init__(self, action_dim: int, method: str = 'ema', alpha: float = 0.3):
        # action_dim is stored for interface compatibility; EMA is element-wise
        self.action_dim = action_dim
        self.method = method
        self.alpha = alpha
        self.prev_action = None

    def smooth(self, action: np.ndarray) -> np.ndarray:
        """Return the smoothed action; non-'ema' methods pass through unchanged."""
        if self.method != 'ema':
            return action
        previous = self.prev_action
        if previous is None:
            blended = action
        else:
            blended = self.alpha * action + (1 - self.alpha) * previous
        self.prev_action = blended
        return blended

    def reset(self):
        """Forget the smoothing history (call at the start of each episode)."""
        self.prev_action = None
def load_checkpoint(
    ckpt_path: str,
    device: str = 'cuda'
) -> torch.nn.Module:
    """
    Load a trained VLA model from a checkpoint.

    Locates the Hydra config directory (trying several likely locations),
    instantiates the agent from config, loads the weights, and attaches
    dataset statistics (if present next to the checkpoint) so the agent can
    normalize qpos and denormalize actions at inference time.

    Args:
        ckpt_path: Path to checkpoint file (.pt)
        device: Device to load model on

    Returns:
        Loaded VLAAgent model, in eval mode, on `device`.

    Raises:
        FileNotFoundError: If the checkpoint or the VLA config directory
            cannot be found.
    """
    from roboimi.vla.agent import VLAAgent
    from hydra import initialize_config_dir, compose
    from pathlib import Path as PathLib
    import os
    import json

    ckpt_path = PathLib(ckpt_path).absolute()
    if not ckpt_path.exists():
        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")

    # Load checkpoint
    print(f"Loading checkpoint from {ckpt_path}")
    checkpoint = torch.load(ckpt_path, map_location=device, weights_only=False)
    print(f"Checkpoint keys: {checkpoint.keys()}")

    # Find the VLA config directory
    script_dir = PathLib(__file__).resolve().parent
    current_dir = PathLib(os.getcwd()).absolute()
    config_dir = None
    # Option 1: running from the roboimi directory
    if (current_dir / 'vla' / 'conf').exists():
        config_dir = current_dir / 'vla' / 'conf'
    # Option 2: running from the project root
    elif (current_dir / 'roboimi' / 'vla' / 'conf').exists():
        config_dir = current_dir / 'roboimi' / 'vla' / 'conf'
    # Option 3: relative to this script's location
    elif (script_dir / '../vla' / 'conf').exists():
        config_dir = (script_dir / '../vla' / 'conf').resolve()
    # Option 4: search upwards from the working directory
    else:
        search_start = current_dir
        while search_start != search_start.parent:
            if (search_start / 'vla' / 'conf').exists():
                config_dir = search_start / 'vla' / 'conf'
                break
            search_start = search_start.parent
    if config_dir is None:
        raise FileNotFoundError(
            f"Could not find VLA config directory.\n"
            f"Current directory: {current_dir}\n"
            f"Script location: {script_dir}\n"
            f"Please ensure you're running from the roboimi directory."
        )

    # config_dir was found via .exists() above, so no re-check is needed
    config_abs_path = str(config_dir.absolute())
    print(f"Loading config from {config_abs_path}")

    # Initialize Hydra with an absolute path and instantiate the agent
    with initialize_config_dir(config_dir=config_abs_path, version_base=None):
        cfg = compose(config_name="config")
    print("Instantiating agent from config...")
    from hydra.utils import instantiate
    agent = instantiate(cfg.agent)

    # Load model state (support several checkpoint layouts)
    if 'model_state_dict' in checkpoint:
        agent.load_state_dict(checkpoint['model_state_dict'])
        print(f"✅ Model state loaded (step: {checkpoint.get('step', 'unknown')})")
    elif 'state_dict' in checkpoint:
        agent.load_state_dict(checkpoint['state_dict'])
        print("✅ Model state loaded")
    else:
        # Assume the checkpoint is the state_dict itself
        agent.load_state_dict(checkpoint)
        print("✅ Model state loaded")

    # Load dataset statistics for qpos normalization / action denormalization
    stats_path = ckpt_path.parent / 'dataset_stats.json'
    if stats_path.exists():
        with open(stats_path, 'r') as f:
            stats = json.load(f)
        # Convert lists to numpy arrays
        agent.action_mean = np.array(stats['action_mean'])
        agent.action_std = np.array(stats['action_std'])
        agent.qpos_mean = np.array(stats['qpos_mean'])
        agent.qpos_std = np.array(stats['qpos_std'])
        print(f"✅ Dataset statistics loaded for denormalization")
    else:
        print(f"⚠️ Warning: {stats_path} not found. Actions will not be denormalized!")
        # Set all four attributes so downstream `is not None` checks behave
        # consistently for both actions and qpos.
        agent.action_mean = None
        agent.action_std = None
        agent.qpos_mean = None
        agent.qpos_std = None

    agent.eval()
    agent.to(device)
    print(f"✅ Model loaded successfully on {device}")
    return agent
def evaluate_policy(
    agent: torch.nn.Module,
    num_episodes: int = 3,
    max_timesteps: int = 700,
    task_name: str = 'sim_transfer',
    device: str = 'cuda',
    camera_names: Optional[List[str]] = None,
    num_queries: int = 1,
    obs_horizon: int = 2,
    save_video: bool = True
):
    """
    Evaluate a VLA policy in simulation.

    Args:
        agent: Trained VLAAgent
        num_episodes: Number of episodes to run
        max_timesteps: Maximum timesteps per episode
        task_name: Task name for environment creation
        device: Device for inference
        camera_names: List of camera names (default: ['r_vis', 'top'])
        num_queries: Policy query frequency
        obs_horizon: Observation horizon
        save_video: Whether to save video
    """
    # Avoid a shared mutable default argument
    if camera_names is None:
        camera_names = ['r_vis', 'top']
    # Create evaluator
    evaluator = VLAEvaluator(
        agent=agent,
        device=device,
        camera_names=camera_names,
        num_queries=num_queries,
        obs_horizon=obs_horizon,
        use_smoothing=False,
        smooth_method='ema',
        smooth_alpha=0.3
    )
    # Create environment
    env = make_sim_env(task_name)
    # Run episodes
    for episode_idx in range(num_episodes):
        print(f"\n{'='*60}")
        print(f"Episode {episode_idx + 1}/{num_episodes}")
        print(f"{'='*60}\n")
        # Reset environment and evaluator with a fresh object pose
        box_pos = sample_transfer_pose()
        env.reset(box_pos)
        evaluator.reset()
        # Storage for visualization
        episode_images = []
        success = False
        success_timestep = 0
        steps_executed = 0
        with torch.inference_mode():
            for t in tqdm(range(max_timesteps), desc=f"Episode {episode_idx + 1}"):
                # Get observation
                obs = env._get_image_obs()
                qpos_obs = env._get_qpos_obs()
                # Merge observations
                obs['qpos'] = qpos_obs['qpos']
                # Predict action
                action = evaluator.predict_action(obs)
                # Execute action
                env.step_jnt(action)
                steps_executed = t + 1
                # Save images for video
                if save_video:
                    episode_images.append(obs['images'])
                # Render
                env.render()
                # Check success (reward 1.0 marks task completion)
                if env.rew == 1.0:
                    success = True
                    success_timestep = t
                    print(f"\n✅ Task completed at timestep {t}!")
                    break
        # Episode summary
        print(f"\nEpisode {episode_idx + 1} Summary:")
        print(f"  Success: {success}")
        if success:
            print(f"  Success Timestep: {success_timestep}")
        # Report executed steps, not len(episode_images), which is always 0
        # when save_video=False.
        print(f"  Length: {steps_executed} timesteps")
        # Save video
        if save_video and episode_images:
            save_video_episode(
                episode_images,
                save_path=f"outputs/eval_vla_episode_{episode_idx}.mp4"
            )
            print(f"  Video saved: outputs/eval_vla_episode_{episode_idx}.mp4")
    print(f"\n{'='*60}")
    print("Evaluation complete!")
    print(f"{'='*60}\n")
def save_video_episode(images: List[Dict], save_path: str, fps: int = 20):
    """
    Save an episode's camera frames as a video.

    Uses the first camera found in the observation dicts for visualization.
    Skips saving (with a warning) if opencv-python is not installed or no
    frames were collected.

    Args:
        images: List of per-timestep dicts mapping camera name to an
                (H, W, C) RGB frame
        save_path: Path to save the video (parent directories are created)
        fps: Frames per second
    """
    if not images:
        # Guard: an empty list would otherwise raise IndexError below
        print("Warning: no frames to save, skipping video")
        return
    try:
        import cv2
        from tqdm import tqdm
    except ImportError:
        print("Warning: opencv-python not installed, skipping video save")
        print("Install with: pip install opencv-python")
        return
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)
    # Use first camera (e.g., 'r_vis') for visualization
    cam_name = list(images[0].keys())[0]
    # Get image size
    H, W, C = images[0][cam_name].shape
    # Create video writer
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')
    video_writer = cv2.VideoWriter(save_path, fourcc, fps, (W, H))
    try:
        # Write frames
        for img_dict in tqdm(images, desc="Saving video"):
            frame = img_dict[cam_name]
            # Convert RGB to BGR for OpenCV
            frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            video_writer.write(frame_bgr)
    finally:
        # Always release the writer, even if a frame conversion fails
        video_writer.release()
    print(f"Video saved to {save_path}")
def main():
    """Command-line entry point: parse arguments, load a checkpoint, run evaluation."""
    parser = argparse.ArgumentParser(description='Evaluate VLA Policy')
    parser.add_argument('--ckpt_path', type=str, required=True, help='Path to model checkpoint')
    parser.add_argument('--num_episodes', type=int, default=3, help='Number of evaluation episodes')
    parser.add_argument('--max_timesteps', type=int, default=700, help='Maximum timesteps per episode')
    parser.add_argument('--device', type=str, default='cuda', help='Device for inference')
    parser.add_argument('--camera_names', nargs='+', default=['r_vis', 'top'], help='Camera names to use')
    parser.add_argument('--num_queries', type=int, default=16, help='Policy query frequency (timesteps)')
    parser.add_argument('--obs_horizon', type=int, default=2, help='Observation horizon')
    parser.add_argument('--no_video', action='store_true', help='Do not save episode videos')
    args = parser.parse_args()

    # Load the trained agent
    print(f"Loading model from {args.ckpt_path}...")
    agent = load_checkpoint(args.ckpt_path, device=args.device)

    # Run the evaluation loop
    evaluate_policy(
        agent=agent,
        num_episodes=args.num_episodes,
        max_timesteps=args.max_timesteps,
        device=args.device,
        camera_names=args.camera_names,
        num_queries=args.num_queries,
        obs_horizon=args.obs_horizon,
        save_video=not args.no_video,
    )


if __name__ == '__main__':
    main()

View File

@@ -1,100 +0,0 @@
import sys
import os
import hydra
import torch
import matplotlib.pyplot as plt
import numpy as np
from omegaconf import DictConfig, OmegaConf
from hydra.utils import instantiate
from torch.utils.data import DataLoader
# 确保能导入 roboimi
sys.path.append(os.getcwd())
from roboimi.vla.agent import VLAAgent
def recursive_to_device(data, device):
    """Recursively move tensors (possibly nested inside dicts) onto `device`.

    Values that are neither tensors nor dicts are returned unchanged.
    """
    if isinstance(data, dict):
        return {key: recursive_to_device(value, device) for key, value in data.items()}
    if isinstance(data, torch.Tensor):
        return data.to(device)
    return data
@hydra.main(version_base=None, config_path="../../../roboimi/vla/conf", config_name="config")
def main(cfg: DictConfig):
    """Run one VLA diffusion-inference pass on a dataset sample and plot the result."""
    print(">>> 🤖 Starting VLA Inference...")
    device = cfg.train.device
    # 1. Instantiate the agent (architecture must exactly match training)
    #    Config values can also be overridden here, e.g. forcing freeze=True
    agent: VLAAgent = instantiate(cfg.agent)
    agent.to(device)
    agent.eval()  # Important: switch to eval mode
    # 2. Load weights
    ckpt_path = "checkpoints/vla_model_final.pt"
    if not os.path.exists(ckpt_path):
        print(f"❌ Checkpoint not found at {ckpt_path}. Run training first!")
        return
    print(f"Loading weights from {ckpt_path}...")
    # map_location keeps GPU-trained weights loadable on CPU-only machines
    state_dict = torch.load(ckpt_path, map_location=device)
    agent.load_state_dict(state_dict)
    print("✅ Weights loaded successfully.")
    # 3. Prepare test data (grab one sample from the dataset)
    dataset = instantiate(cfg.data)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    sample = next(iter(dataloader))
    # Build the model input (simulating live robot observations)
    # Note: sample['actions'] is deliberately NOT passed at inference time
    primary_cam_key = cfg.data.obs_keys[0]
    input_img = sample['obs'][primary_cam_key][:, -1, :, :, :]  # (1, C, H, W)
    agent_input = {
        "obs": {
            "image": input_img.to(device),
            "text": sample["language"]  # placeholder must stay even if text is unused
        }
        # ⚠️ Key point: omitting 'actions' routes the agent into its inference branch
    }
    # 4. Run inference (reverse diffusion)
    print("running reverse diffusion (this may take a moment)...")
    with torch.no_grad():
        # This triggers the DiffusionHead's branch B (loop over timesteps)
        outputs = agent(agent_input)
    # 5. Collect results
    # Output shape: (1, Chunk_Size, Action_Dim)
    pred_actions = outputs['pred_actions'].cpu().numpy()[0]
    gt_actions = sample['actions'][0].numpy()  # kept for comparison against the prediction
    print(f"✅ Generated Action Chunk Shape: {pred_actions.shape}")
    # 6. Visualize the comparison (saved as an image)
    plot_results(pred_actions, gt_actions)
def plot_results(pred, gt):
    """
    Quick visualization: overlay the predicted and ground-truth trajectories
    for the first few action dimensions and save the figure to disk.
    """
    num_dims = 3  # plot only the first 3 dimensions (e.g. x, y, z)
    plt.figure(figsize=(10, 5))
    for dim in range(num_dims):
        plt.subplot(1, num_dims, dim + 1)
        plt.plot(gt[:, dim], 'g--', label='Ground Truth')
        plt.plot(pred[:, dim], 'b-', label='Diffusion Pred')
        plt.title(f"Action Dim {dim}")
        if dim == 0:
            plt.legend()
        plt.ylim(-1, 1)  # actions are assumed to be normalized
    plt.tight_layout()
    plt.savefig("inference_result.png")
    print("📊 Result plot saved to 'inference_result.png'")


if __name__ == "__main__":
    main()

View File

@@ -1,6 +1,8 @@
import sys
import os
import logging
import json
import pickle
import hydra
import torch
from tqdm import tqdm
@@ -103,6 +105,46 @@ def main(cfg: DictConfig):
log.error(f"❌ Failed to initialize agent: {e}")
raise
# =========================================================================
# 2.5. Save Dataset Statistics as JSON
# =========================================================================
log.info("💾 Saving dataset statistics...")
try:
# Get dataset_dir from config
dataset_dir = cfg.data.get('dataset_dir', 'roboimi/demos/dataset/sim_transfer')
stats_path = Path(dataset_dir) / 'data_stats.pkl'
if stats_path.exists():
# Load pickle file
with open(stats_path, 'rb') as f:
stats = pickle.load(f)
# Extract action statistics
action_mean = stats['action']['mean'].tolist() if 'action' in stats else []
action_std = stats['action']['std'].tolist() if 'action' in stats else []
qpos_mean = stats['qpos']['mean'].tolist() if 'qpos' in stats else []
qpos_std = stats['qpos']['std'].tolist() if 'qpos' in stats else []
# Save as JSON
json_stats = {
'action_mean': action_mean,
'action_std': action_std,
'qpos_mean': qpos_mean,
'qpos_std': qpos_std
}
json_path = checkpoint_dir / 'dataset_stats.json'
with open(json_path, 'w') as f:
json.dump(json_stats, f, indent=2)
log.info(f"✅ Dataset statistics saved to {json_path}")
else:
log.warning(f"⚠️ Statistics file not found: {stats_path}")
log.warning("⚠️ Actions will not be denormalized during inference!")
except Exception as e:
log.warning(f"⚠️ Failed to save statistics as JSON: {e}")
log.warning("⚠️ Training will continue, but inference may not work correctly")
# =========================================================================
# 3. Setup Optimizer
# =========================================================================

View File

@@ -0,0 +1,239 @@
# VLA Evaluation Guide
This guide explains how to evaluate a trained Vision-Language-Action (VLA) policy in the MuJoCo simulation environment.
## Prerequisites
1. **Trained Model**: Train your VLA model first using `train_vla.py`
2. **Checkpoints**: Ensure you have saved model checkpoints in `checkpoints/` directory
3. **Dependencies**: Install required dependencies:
```bash
pip install opencv-python tqdm
```
## Quick Start
### Basic Evaluation
```bash
# Evaluate with default settings (3 episodes)
python roboimi/demos/eval_vla.py \
--ckpt_path checkpoints/vla_model_best.pt
# Evaluate with custom settings
python roboimi/demos/eval_vla.py \
--ckpt_path checkpoints/vla_model_step_5000.pt \
--num_episodes 5 \
--max_timesteps 700 \
--camera_names r_vis top angle \
--num_queries 1 \
--obs_horizon 2
```
### Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `--ckpt_path` | Path to model checkpoint (.pt file) | Required |
| `--num_episodes` | Number of evaluation episodes | 3 |
| `--max_timesteps` | Maximum timesteps per episode | 700 |
| `--device` | Device for inference (`cuda` or `cpu`) | `cuda` |
| `--camera_names` | Camera names to use (space-separated) | `r_vis top` |
| `--num_queries` | Policy query frequency (every N timesteps) | 16 |
| `--obs_horizon` | Observation history length | 2 |
| `--no_video` | Disable video saving | False |
## Usage Details
### Policy Query Frequency
The `--num_queries` parameter controls how often the policy is queried:
- `--num_queries 1`: Query every timestep (most accurate)
- `--num_queries 16`: Query every 16 timesteps (the script default; cached actions are executed between queries)
When using cached actions (num_queries > 1), the policy predicts a chunk of actions (pred_horizon=16), and these actions are executed sequentially until the next query.
### Camera Selection
Available cameras depend on your environment:
- `r_vis`: Right arm RealSense camera
- `top`: Top-down view camera
- `angle`: Angled view camera
Use `--camera_names` to specify which cameras to use:
```bash
--camera_names r_vis top # Use 2 cameras
--camera_names r_vis top angle # Use all 3 cameras
```
### Observation Horizon
The `--obs_horizon` parameter determines how many past observations to use as context:
```bash
--obs_horizon 1 # Use only current observation
--obs_horizon 2 # Use current + 1 past observation (default)
--obs_horizon 4 # Use current + 3 past observations
```
**Note**: Must match the value used during training.
## Output
### Console Output
During evaluation, you'll see:
```
============================================================
Episode 1/3
============================================================
Episode 1: 100%|████████████████████| 700/700 [02:30<00:00, 4.64it/s]
✅ Task completed at timestep 453!
Episode 1 Summary:
  Success: True
  Success Timestep: 453
  Length: 453 timesteps
Video saved: outputs/eval_vla_episode_0.mp4
```
### Video Output
Videos are saved to `outputs/eval_vla_episode_{N}.mp4` showing the robot's execution.
### Metrics
- **Success**: Whether the environment reward reached 1.0 (task completed)
- **Success Timestep**: Timestep at which the task was first completed
- **Length**: Number of timesteps executed
## Action Smoothing
The evaluator supports EMA (Exponential Moving Average) smoothing to reduce jitter; it is disabled by default (`use_smoothing=False` in `evaluate_policy`):
```python
# Default smoothing parameters
smooth_method = 'ema'
smooth_alpha = 0.3 # Lower = more smoothing
```
To disable or modify smoothing, edit the `evaluate_policy()` call in `eval_vla.py`:
```python
evaluator = VLAEvaluator(
agent=agent,
use_smoothing=False, # Disable smoothing
# or
smooth_method='moving_avg', # Use different method
smooth_alpha=0.5 # Adjust smoothing strength
)
```
## Troubleshooting
### Issue: Checkpoint not found
```
FileNotFoundError: Checkpoint not found: checkpoints/vla_model_best.pt
```
**Solution**: Ensure you've trained the model and checkpoints exist:
```bash
ls -la checkpoints/
# Should show: vla_model_best.pt, vla_model_final.pt, etc.
```
### Issue: CUDA out of memory
**Solution**: Use CPU for inference:
```bash
python eval_vla.py --ckpt_path checkpoints/vla_model_best.pt --device cpu
```
### Issue: Camera names don't match
**Solution**: Check your HDF5 files for available cameras:
```python
import h5py
with h5py.File('roboimi/demos/dataset/sim_transfer/episode_0.hdf5', 'r') as f:
print(list(f['observations/images'].keys()))
# Output: ['angle', 'r_vis', 'top']
```
Then use the correct camera names in your eval command.
### Issue: Mismatched obs_horizon
```
RuntimeError: Tensor shape mismatch
```
**Solution**: Ensure `--obs_horizon` matches the training config (`data.obs_horizon`).
## Advanced Usage
### Custom Evaluation Script
You can also use the evaluator in your own scripts:
```python
from roboimi.demos.eval_vla import VLAEvaluator, load_checkpoint
from roboimi.envs.double_pos_ctrl_env import make_sim_env
# Load model
agent = load_checkpoint('checkpoints/vla_model_best.pt')
# Create evaluator
evaluator = VLAEvaluator(
agent=agent,
device='cuda',
camera_names=['r_vis', 'top'],
num_queries=1,
obs_horizon=2
)
# Create environment
env = make_sim_env('sim_transfer')
env.reset()
evaluator.reset()
# Run episode
obs = env._get_image_obs()
obs['qpos'] = env._get_qpos_obs()['qpos']
# Predict and execute action
action = evaluator.predict_action(obs)
env.step_jnt(action)
```
### Batch Evaluation
Evaluate multiple checkpoints:
```bash
for ckpt in checkpoints/vla_model_step_*.pt; do
echo "Evaluating $ckpt"
python roboimi/demos/eval_vla.py \
--ckpt_path "$ckpt" \
--num_episodes 1 \
--no_video
done
```
## Next Steps
1. **Train your model**: See [RESNET_TRAINING_GUIDE.md](roboimi/vla/RESNET_TRAINING_GUIDE.md)
2. **Evaluate performance**: Use this evaluation script
3. **Analyze results**: Compare different checkpoints
4. **Deploy to real robot**: Adapt the evaluator for real robot control
## References
- Training Guide: [roboimi/vla/RESNET_TRAINING_GUIDE.md](roboimi/vla/RESNET_TRAINING_GUIDE.md)
- Project Documentation: [CLAUDE.md](CLAUDE.md)
- Original ACT Paper: https://arxiv.org/abs/2304.13705
- Diffusion Policy: https://diffusion-policy.cs.columbia.edu/

View File

@@ -1,8 +1,10 @@
import torch
import torch.nn as nn
import numpy as np
from typing import Dict, Optional, Any
from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
from diffusers.schedulers.scheduling_ddim import DDIMScheduler
from roboimi.vla.models.heads.diffusion import ConditionalUnet1D
class VLAAgent(nn.Module):
@@ -18,6 +20,13 @@ class VLAAgent(nn.Module):
num_cams=2, # 视觉输入的摄像头数量
):
super().__init__()
# Store parameters
self.action_dim = action_dim
self.obs_dim = obs_dim
self.pred_horizon = pred_horizon
self.obs_horizon = obs_horizon
self.num_cams = num_cams
self.vision_encoder = vision_backbone
single_img_feat_dim = self.vision_encoder.output_dim
total_vision_dim = single_img_feat_dim * num_cams * obs_horizon
@@ -31,6 +40,14 @@ class VLAAgent(nn.Module):
prediction_type='epsilon' # 预测噪声
)
# DDIM scheduler for faster inference
self.infer_scheduler = DDIMScheduler(
num_train_timesteps=diffusion_steps,
beta_schedule='squaredcos_cap_v2',
clip_sample=True,
prediction_type='epsilon'
)
self.noise_pred_net = ConditionalUnet1D(
input_dim=action_dim,
global_cond_dim=self.global_cond_dim
@@ -70,18 +87,12 @@ class VLAAgent(nn.Module):
)
# 6. 网络预测噪声
# 注意U-Net 1D 通常期望 channel 在中间: (B, C, T)
# noisy_actions_inp = noisy_actions.permute(0, 2, 1)
pred_noise = self.noise_pred_net(
sample=noisy_actions,
timestep=timesteps,
global_cond=global_cond
)
# 还原维度 (B, T, C)
pred_noise = pred_noise.permute(0, 2, 1)
# 7. 计算 Loss (MSE)
loss = nn.functional.mse_loss(pred_noise, noise)
return loss
@@ -96,20 +107,27 @@ class VLAAgent(nn.Module):
# 1. 提取当前观测特征 (只做一次)
visual_features = self.vision_encoder(images).view(B, -1)
proprioception = proprioception.view(B, -1)
if hasattr(self, 'qpos_mean') and hasattr(self, 'qpos_std') and self.qpos_mean is not None:
# Convert to tensor for normalization
qpos_mean = torch.from_numpy(self.qpos_mean).float().to(proprioception.device)
qpos_std = torch.from_numpy(self.qpos_std).float().to(proprioception.device)
qpos_mean = qpos_mean.repeat(2)
qpos_std = qpos_std.repeat(2)
# Normalize: (qpos - mean) / std
proprioception = (proprioception - qpos_mean.unsqueeze(0)) / qpos_std.unsqueeze(0)
global_cond = torch.cat([visual_features, proprioception], dim=-1)
# 2. 初始化纯高斯噪声动作
# Shape: (B, Horizon, Action_Dim)
# Shape: (B, pred_horizon, action_dim)
current_actions = torch.randn(
(B, 16, 7), device=global_cond.device
(B, self.pred_horizon, self.action_dim), device=global_cond.device
)
# 3. 逐步去噪循环 (Reverse Diffusion)
self.noise_scheduler.set_timesteps(10) # 推理时可以用更少步加速 (如 DDIM)
self.infer_scheduler.set_timesteps(10) # DDIM 推理步数
for t in self.noise_scheduler.timesteps:
# 调整输入格式适应 1D CNN
model_input = current_actions.permute(0, 2, 1)
for t in self.infer_scheduler.timesteps:
model_input = current_actions
# 预测噪声
noise_pred = self.noise_pred_net(
@@ -117,12 +135,19 @@ class VLAAgent(nn.Module):
timestep=t,
global_cond=global_cond
)
# noise_pred = noise_pred.permute(0, 2, 1)
# 移除噪声,更新 current_actions
current_actions = self.noise_scheduler.step(
current_actions = self.infer_scheduler.step(
noise_pred, t, current_actions
).prev_sample
# 4. 输出最终动作序列
# 4. 反归一化动作 (Denormalize actions)
if hasattr(self, 'action_mean') and hasattr(self, 'action_std') and self.action_mean is not None:
# Convert to numpy for denormalization
action_mean = torch.from_numpy(self.action_mean).float().to(current_actions.device)
action_std = torch.from_numpy(self.action_std).float().to(current_actions.device)
# Denormalize: action * std + mean
current_actions = current_actions * action_std.unsqueeze(0).unsqueeze(0) + action_mean.unsqueeze(0).unsqueeze(0)
# 5. 输出最终动作序列
return current_actions # 返回去噪后的干净动作

View File

@@ -4,10 +4,10 @@ defaults:
- data: resnet_dataset
train:
batch_size: 8 # Batch size for training
batch_size: 32 # Batch size for training
lr: 1e-4 # Learning rate
max_steps: 10000 # Maximum training steps
max_steps: 20000 # Maximum training steps
log_freq: 100 # Log frequency (steps)
save_freq: 1000 # Save checkpoint frequency (steps)
save_freq: 2000 # Save checkpoint frequency (steps)
device: "cuda" # Device: "cuda" or "cpu"
num_workers: 8 # DataLoader workers (set to 0 for debugging, 8 for production)

View File

@@ -11,7 +11,7 @@ class RobotDiffusionDataset(Dataset):
def __init__(self,
dataset_dir,
pred_horizon=16,
obs_horizon=1,
obs_horizon=2,
action_horizon=8,
camera_names=['r_vis', 'top'],
normalization_type='gaussian'):