debug(inference): 添加推理阶段qpos归一化

This commit is contained in:
gouhanke
2026-02-06 09:00:44 +08:00
parent b0a944f7aa
commit 66009473ad
7 changed files with 859 additions and 121 deletions

532
roboimi/demos/eval_vla.py Normal file
View File

@@ -0,0 +1,532 @@
"""
VLA Policy Evaluation Script
This script evaluates a trained Vision-Language-Action (VLA) policy
in the MuJoCo simulation environment.
Usage:
python roboimi/demos/eval_vla.py --ckpt_path checkpoints/vla_model_best.pt --num_episodes 3
"""
import torch
import numpy as np
import argparse
from pathlib import Path
from typing import Dict, List
from tqdm import tqdm
from roboimi.envs.double_pos_ctrl_env import make_sim_env
from roboimi.utils.act_ex_utils import sample_transfer_pose
from einops import rearrange
class VLAEvaluator:
    """
    VLA policy evaluator for the MuJoCo simulation.

    Maintains rolling observation buffers of length ``obs_horizon`` for
    camera images and qpos, queries the policy every ``num_queries``
    timesteps, and replays the cached action chunk in between queries.
    """

    def __init__(
        self,
        agent: torch.nn.Module,
        device: str = 'cuda',
        camera_names: List[str] = None,
        num_queries: int = 1,
        obs_horizon: int = 2,
        pred_horizon: int = 16,
        use_smoothing: bool = False,
        smooth_method: str = 'ema',
        smooth_alpha: float = 0.3
    ):
        """
        Args:
            agent: Trained VLAAgent.
            device: Device for inference.
            camera_names: Camera names to use; defaults to ['r_vis', 'top'].
                (None default avoids the shared mutable-default pitfall.)
            num_queries: How often to query the policy (in timesteps);
                cached actions are replayed between queries.
            obs_horizon: Number of observations to use as context.
            pred_horizon: Number of future actions to predict.
            use_smoothing: Whether to apply action smoothing.
            smooth_method: Smoothing method ('ema', 'moving_avg', 'lowpass').
            smooth_alpha: Smoothing coefficient (lower = more smoothing).
        """
        if camera_names is None:
            camera_names = ['r_vis', 'top']
        self.agent = agent.to(device)
        self.device = device
        self.camera_names = camera_names
        self.num_queries = num_queries
        self.obs_horizon = obs_horizon
        self.pred_horizon = pred_horizon
        # Optional action smoothing
        self.use_smoothing = use_smoothing
        self.smooth_method = smooth_method
        self.smooth_alpha = smooth_alpha
        self.smoother = ActionSmoother(
            action_dim=16,  # assumes 16-dim actions -- TODO confirm against agent config
            method=smooth_method,
            alpha=smooth_alpha
        ) if use_smoothing else None
        # Rolling observation buffers: one list per camera, one for qpos.
        self.obs_buffer = {
            'images': {cam: [] for cam in camera_names},
            'qpos': []
        }
        self.cached_actions = None  # (pred_horizon, action_dim) numpy array
        self.query_step = 0         # index of the next action in the cached chunk

    def reset(self):
        """Clear buffers, cached actions, and smoother state for a new episode."""
        self.obs_buffer = {
            'images': {cam: [] for cam in self.camera_names},
            'qpos': []
        }
        self.cached_actions = None
        self.query_step = 0
        if self.smoother is not None:
            self.smoother.reset()

    def _get_image_dict(self, obs: Dict) -> Dict[str, torch.Tensor]:
        """
        Extract, normalize, and buffer camera images from an observation.

        Args:
            obs: Environment observation dict; obs['images'][cam] is an
                (H, W, C) array in [0, 255].
        Returns:
            Dict mapping camera name -> tensor of shape (1, obs_horizon, C, H, W).
        """
        images = {}
        for cam_name in self.camera_names:
            # (H, W, C) -> (C, H, W), scaled to [0, 1]
            img = obs['images'][cam_name]
            img = rearrange(img, 'h w c -> c h w')
            img = torch.from_numpy(img / 255.0).float()
            images[cam_name] = img
        image_dict = {}
        for cam_name in self.camera_names:
            cam_images = self.obs_buffer['images'][cam_name]
            cam_images.append(images[cam_name])
            # Left-pad by duplicating the oldest frame until obs_horizon frames exist.
            while len(cam_images) < self.obs_horizon:
                cam_images.insert(0, cam_images[0])
            # Keep only the newest obs_horizon frames.
            if len(cam_images) > self.obs_horizon:
                cam_images = cam_images[-self.obs_horizon:]
            # (obs_horizon, C, H, W) -> (1, obs_horizon, C, H, W)
            image_dict[cam_name] = torch.stack(cam_images, dim=0).unsqueeze(0)
            self.obs_buffer['images'][cam_name] = cam_images[-self.obs_horizon:]
        return image_dict

    def _get_qpos_dict(self, obs: Dict) -> torch.Tensor:
        """
        Extract and buffer proprioception (qpos) from an observation.

        Args:
            obs: Environment observation dict; obs['qpos'] is a numpy array.
        Returns:
            qpos tensor of shape (1, obs_horizon, obs_dim).
        """
        qpos = torch.from_numpy(obs['qpos']).float()
        self.obs_buffer['qpos'].append(qpos)
        # Left-pad by duplicating the oldest frame until obs_horizon frames exist.
        while len(self.obs_buffer['qpos']) < self.obs_horizon:
            self.obs_buffer['qpos'].insert(0, self.obs_buffer['qpos'][0])
        # Keep only the newest obs_horizon frames.
        if len(self.obs_buffer['qpos']) > self.obs_horizon:
            self.obs_buffer['qpos'] = self.obs_buffer['qpos'][-self.obs_horizon:]
        # (obs_horizon, obs_dim) -> (1, obs_horizon, obs_dim)
        return torch.stack(self.obs_buffer['qpos'], dim=0).unsqueeze(0)

    @torch.no_grad()
    def predict_action(self, obs: Dict) -> np.ndarray:
        """
        Predict the next action for the current observation.

        Queries the policy when the cache is empty, exhausted, or every
        ``num_queries`` steps; otherwise replays the cached action chunk.

        Args:
            obs: Current environment observation (images + qpos).
        Returns:
            action: numpy array of shape (action_dim,).
        """
        # 1. Update buffers and build network inputs.
        images = self._get_image_dict(obs)  # Dict[str, (1, obs_horizon, C, H, W)]
        qpos = self._get_qpos_dict(obs)     # (1, obs_horizon, obs_dim)
        # 2. Re-query the policy when needed. The bound check guards against
        #    num_queries exceeding the predicted chunk length (IndexError).
        need_query = (
            self.cached_actions is None
            or self.query_step % self.num_queries == 0
            or self.query_step >= len(self.cached_actions)
        )
        if need_query:
            images = {k: v.to(self.device) for k, v in images.items()}
            qpos = qpos.to(self.device)
            # VLAAgent.predict_action returns (B, pred_horizon, action_dim).
            predicted_actions = self.agent.predict_action(
                images=images,
                proprioception=qpos
            )
            # Cache the chunk as a (pred_horizon, action_dim) numpy array.
            self.cached_actions = predicted_actions.squeeze(0).cpu().numpy()
            self.query_step = 0
        # 3. Take the next action from the cached chunk.
        raw_action = self.cached_actions[self.query_step]
        self.query_step += 1
        # 4. Optional smoothing.
        if self.smoother is not None:
            raw_action = self.smoother.smooth(raw_action)
        return raw_action
class ActionSmoother:
    """
    Smooths a stream of actions to reduce jitter during execution.

    Only the 'ema' (exponential moving average) method is implemented;
    any other method returns the action unchanged.
    """

    def __init__(self, action_dim: int, method: str = 'ema', alpha: float = 0.3):
        self.action_dim = action_dim
        self.method = method
        self.alpha = alpha  # blend weight for the newest action
        self.prev_action = None

    def smooth(self, action: np.ndarray) -> np.ndarray:
        """Return the smoothed action and update internal state."""
        if self.method != 'ema':
            return action
        if self.prev_action is None:
            blended = action
        else:
            blended = self.alpha * action + (1.0 - self.alpha) * self.prev_action
        self.prev_action = blended
        return blended

    def reset(self):
        """Forget the previous action (call at episode start)."""
        self.prev_action = None
def load_checkpoint(
    ckpt_path: str,
    device: str = 'cuda'
) -> torch.nn.Module:
    """
    Load a trained VLA model from a checkpoint file.

    Also attaches dataset statistics (action/qpos mean and std) to the agent
    when a ``dataset_stats.json`` file sits next to the checkpoint, so the
    agent can normalize qpos and denormalize actions at inference time.

    Args:
        ckpt_path: Path to checkpoint file (.pt).
        device: Device to load the model on.
    Returns:
        Loaded VLAAgent in eval mode on ``device``.
    Raises:
        FileNotFoundError: If the checkpoint or the VLA config dir is missing.
    """
    import os
    from roboimi.vla.agent import VLAAgent
    from hydra import initialize_config_dir, compose
    from hydra.utils import instantiate

    ckpt_path = Path(ckpt_path).absolute()
    if not ckpt_path.exists():
        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")

    # Load checkpoint. weights_only=False because the checkpoint may
    # contain non-tensor metadata (e.g. the training step counter).
    print(f"Loading checkpoint from {ckpt_path}")
    checkpoint = torch.load(ckpt_path, map_location=device, weights_only=False)
    print(f"Checkpoint keys: {checkpoint.keys()}")

    # Locate the VLA hydra config directory ('vla/conf'), trying several
    # plausible roots relative to cwd and this script.
    script_dir = Path(__file__).resolve().parent
    current_dir = Path(os.getcwd()).absolute()
    config_dir = None
    if (current_dir / 'vla' / 'conf').exists():
        # Option 1: running from the roboimi directory
        config_dir = current_dir / 'vla' / 'conf'
    elif (current_dir / 'roboimi' / 'vla' / 'conf').exists():
        # Option 2: running from the project root
        config_dir = current_dir / 'roboimi' / 'vla' / 'conf'
    elif (script_dir / '../vla' / 'conf').exists():
        # Option 3: relative to this script's location
        config_dir = (script_dir / '../vla' / 'conf').resolve()
    else:
        # Option 4: search upwards from cwd
        search_start = current_dir
        while search_start != search_start.parent:
            if (search_start / 'vla' / 'conf').exists():
                config_dir = search_start / 'vla' / 'conf'
                break
            search_start = search_start.parent
    if config_dir is None:
        raise FileNotFoundError(
            f"Could not find VLA config directory.\n"
            f"Current directory: {current_dir}\n"
            f"Script location: {script_dir}\n"
            f"Please ensure you're running from the roboimi directory."
        )

    config_abs_path = str(config_dir.absolute())
    print(f"Loading config from {config_abs_path}")

    # Rebuild the agent exactly as it was configured at training time.
    with initialize_config_dir(config_dir=config_abs_path, version_base=None):
        cfg = compose(config_name="config")
    print("Instantiating agent from config...")
    agent = instantiate(cfg.agent)

    # Load model weights; support several checkpoint layouts.
    if 'model_state_dict' in checkpoint:
        agent.load_state_dict(checkpoint['model_state_dict'])
        print(f"✅ Model state loaded (step: {checkpoint.get('step', 'unknown')})")
    elif 'state_dict' in checkpoint:
        agent.load_state_dict(checkpoint['state_dict'])
        print("✅ Model state loaded")
    else:
        # Assume the checkpoint is the state_dict itself.
        agent.load_state_dict(checkpoint)
        print("✅ Model state loaded")

    # Attach dataset statistics for qpos normalization / action denormalization.
    import json
    stats_path = ckpt_path.parent / 'dataset_stats.json'
    if stats_path.exists():
        with open(stats_path, 'r') as f:
            stats = json.load(f)
        # Convert lists to numpy arrays
        agent.action_mean = np.array(stats['action_mean'])
        agent.action_std = np.array(stats['action_std'])
        agent.qpos_mean = np.array(stats['qpos_mean'])
        agent.qpos_std = np.array(stats['qpos_std'])
        print(f"✅ Dataset statistics loaded for denormalization")
    else:
        print(f"⚠️ Warning: {stats_path} not found. Actions will not be denormalized!")
        # Set all four attributes to None so downstream hasattr()/None
        # checks behave consistently (the original left qpos_* unset).
        agent.action_mean = None
        agent.action_std = None
        agent.qpos_mean = None
        agent.qpos_std = None

    agent.eval()
    agent.to(device)
    print(f"✅ Model loaded successfully on {device}")
    return agent
def evaluate_policy(
    agent: torch.nn.Module,
    num_episodes: int = 3,
    max_timesteps: int = 700,
    task_name: str = 'sim_transfer',
    device: str = 'cuda',
    camera_names: List[str] = None,
    num_queries: int = 1,
    obs_horizon: int = 2,
    save_video: bool = True
):
    """
    Evaluate a VLA policy in simulation.

    Args:
        agent: Trained VLAAgent.
        num_episodes: Number of episodes to run.
        max_timesteps: Maximum timesteps per episode.
        task_name: Task name for environment creation.
        device: Device for inference.
        camera_names: Camera names to use; defaults to ['r_vis', 'top'].
            (None default avoids the shared mutable-default pitfall.)
        num_queries: Policy query frequency (timesteps).
        obs_horizon: Observation horizon.
        save_video: Whether to save per-episode videos.
    """
    if camera_names is None:
        camera_names = ['r_vis', 'top']
    # Create evaluator
    evaluator = VLAEvaluator(
        agent=agent,
        device=device,
        camera_names=camera_names,
        num_queries=num_queries,
        obs_horizon=obs_horizon,
        use_smoothing=False,
        smooth_method='ema',
        smooth_alpha=0.3
    )
    # Create environment
    env = make_sim_env(task_name)
    # Run episodes
    for episode_idx in range(num_episodes):
        print(f"\n{'='*60}")
        print(f"Episode {episode_idx + 1}/{num_episodes}")
        print(f"{'='*60}\n")
        # Reset environment and evaluator with a freshly sampled box pose.
        box_pos = sample_transfer_pose()
        env.reset(box_pos)
        evaluator.reset()
        # Storage for visualization
        episode_images = []
        success = False
        success_timestep = 0
        with torch.inference_mode():
            for t in tqdm(range(max_timesteps), desc=f"Episode {episode_idx + 1}"):
                # Get and merge observations (images + qpos).
                obs = env._get_image_obs()
                qpos_obs = env._get_qpos_obs()
                obs['qpos'] = qpos_obs['qpos']
                # Predict and execute action.
                action = evaluator.predict_action(obs)
                env.step_jnt(action)
                # Save images for video
                if save_video:
                    episode_images.append(obs['images'])
                # Render
                env.render()
                # Stop early on success (env.rew == 1.0 marks task completion).
                if env.rew == 1.0:
                    success = True
                    success_timestep = t
                    print(f"\n✅ Task completed at timestep {t}!")
                    break
        # Episode summary
        print(f"\nEpisode {episode_idx + 1} Summary:")
        print(f"  Success: {success}")
        if success:
            print(f"  Success Timestep: {success_timestep}")
        print(f"  Length: {len(episode_images)} timesteps")
        # Save video
        if save_video and episode_images:
            save_video_episode(
                episode_images,
                save_path=f"outputs/eval_vla_episode_{episode_idx}.mp4"
            )
            print(f"  Video saved: outputs/eval_vla_episode_{episode_idx}.mp4")
    print(f"\n{'='*60}")
    print("Evaluation complete!")
    print(f"{'='*60}\n")
def save_video_episode(images: List[Dict], save_path: str, fps: int = 20):
    """
    Write an episode's frames to an MP4 video file.

    Uses the first camera found in the observation dicts. Skips saving
    (with a warning) when opencv-python is not installed.

    Args:
        images: List of per-timestep dicts mapping camera name -> (H, W, C) frame.
        save_path: Path to save the video.
        fps: Frames per second.
    """
    try:
        import cv2
        from tqdm import tqdm

        Path(save_path).parent.mkdir(parents=True, exist_ok=True)
        # Visualize with the first camera (e.g. 'r_vis').
        primary_cam = list(images[0].keys())[0]
        height, width, _ = images[0][primary_cam].shape
        writer = cv2.VideoWriter(
            save_path,
            cv2.VideoWriter_fourcc(*'mp4v'),
            fps,
            (width, height),
        )
        for frame_dict in tqdm(images, desc="Saving video"):
            # OpenCV expects BGR channel order.
            writer.write(cv2.cvtColor(frame_dict[primary_cam], cv2.COLOR_RGB2BGR))
        writer.release()
        print(f"Video saved to {save_path}")
    except ImportError:
        print("Warning: opencv-python not installed, skipping video save")
        print("Install with: pip install opencv-python")
def main():
    """CLI entry point: parse arguments, load the checkpoint, run evaluation."""
    parser = argparse.ArgumentParser(description='Evaluate VLA Policy')
    parser.add_argument('--ckpt_path', type=str, required=True, help='Path to model checkpoint')
    parser.add_argument('--num_episodes', type=int, default=3, help='Number of evaluation episodes')
    parser.add_argument('--max_timesteps', type=int, default=700, help='Maximum timesteps per episode')
    parser.add_argument('--device', type=str, default='cuda', help='Device for inference')
    parser.add_argument('--camera_names', nargs='+', default=['r_vis', 'top'], help='Camera names to use')
    parser.add_argument('--num_queries', type=int, default=16, help='Policy query frequency (timesteps)')
    parser.add_argument('--obs_horizon', type=int, default=2, help='Observation horizon')
    parser.add_argument('--no_video', action='store_true', help='Do not save episode videos')
    args = parser.parse_args()

    # Load model
    print(f"Loading model from {args.ckpt_path}...")
    model = load_checkpoint(args.ckpt_path, device=args.device)

    # Evaluate
    evaluate_policy(
        agent=model,
        num_episodes=args.num_episodes,
        max_timesteps=args.max_timesteps,
        device=args.device,
        camera_names=args.camera_names,
        num_queries=args.num_queries,
        obs_horizon=args.obs_horizon,
        save_video=not args.no_video,
    )


if __name__ == '__main__':
    main()

View File

@@ -1,100 +0,0 @@
import sys
import os
import hydra
import torch
import matplotlib.pyplot as plt
import numpy as np
from omegaconf import DictConfig, OmegaConf
from hydra.utils import instantiate
from torch.utils.data import DataLoader
# 确保能导入 roboimi
sys.path.append(os.getcwd())
from roboimi.vla.agent import VLAAgent
def recursive_to_device(data, device):
    """
    Recursively move tensors in a (possibly nested) dict to ``device``.

    Tensors are moved, dicts are traversed, everything else is returned
    unchanged.
    """
    if isinstance(data, torch.Tensor):
        return data.to(device)
    if isinstance(data, dict):
        return {key: recursive_to_device(value, device) for key, value in data.items()}
    return data
@hydra.main(version_base=None, config_path="../../../roboimi/vla/conf", config_name="config")
def main(cfg: DictConfig):
    """Run one reverse-diffusion inference pass on a random dataset sample and plot it."""
    print(">>> 🤖 Starting VLA Inference...")
    device = cfg.train.device
    # 1. Instantiate the agent (architecture must match training exactly).
    #    Config overrides (e.g. forcing freeze=True) could also be applied here.
    agent: VLAAgent = instantiate(cfg.agent)
    agent.to(device)
    agent.eval()  # Important: switch to eval mode
    # 2. Load weights
    ckpt_path = "checkpoints/vla_model_final.pt"
    if not os.path.exists(ckpt_path):
        print(f"❌ Checkpoint not found at {ckpt_path}. Run training first!")
        return
    print(f"Loading weights from {ckpt_path}...")
    # map_location prevents errors when loading GPU weights on a CPU-only machine
    state_dict = torch.load(ckpt_path, map_location=device)
    agent.load_state_dict(state_dict)
    print("✅ Weights loaded successfully.")
    # 3. Prepare test data (take one sample from the dataset)
    dataset = instantiate(cfg.data)
    dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    sample = next(iter(dataloader))
    # Build the inputs (simulating real-time robot operation).
    # Note: sample['actions'] is NOT passed at inference time.
    primary_cam_key = cfg.data.obs_keys[0]
    input_img = sample['obs'][primary_cam_key][:, -1, :, :, :]  # (1, C, H, W)
    agent_input = {
        "obs": {
            "image": input_img.to(device),
            "text": sample["language"]  # placeholder must stay even if text is unused
        }
        # NOTE: omitting 'actions' here routes the agent into its inference branch
    }
    # 4. Run inference (reverse diffusion)
    print("running reverse diffusion (this may take a moment)...")
    with torch.no_grad():
        # This triggers the DiffusionHead inference branch (loop over timesteps)
        outputs = agent(agent_input)
    # 5. Collect results; output shape: (1, chunk_size, action_dim)
    pred_actions = outputs['pred_actions'].cpu().numpy()[0]
    gt_actions = sample['actions'][0].numpy()  # ground truth, for comparison only
    print(f"✅ Generated Action Chunk Shape: {pred_actions.shape}")
    # 6. Visualize the comparison (saves a figure)
    plot_results(pred_actions, gt_actions)
def plot_results(pred, gt):
    """
    Plot predicted vs. ground-truth trajectories for the first few action
    dimensions and save the figure to 'inference_result.png'.
    """
    num_dims = 3  # plot only the first 3 dimensions (e.g. x, y, z)
    plt.figure(figsize=(10, 5))
    for dim in range(num_dims):
        plt.subplot(1, num_dims, dim + 1)
        plt.plot(gt[:, dim], 'g--', label='Ground Truth')
        plt.plot(pred[:, dim], 'b-', label='Diffusion Pred')
        plt.title(f"Action Dim {dim}")
        if dim == 0:
            plt.legend()
        plt.ylim(-1, 1)  # assumes actions are normalized to [-1, 1]
    plt.tight_layout()
    plt.savefig("inference_result.png")
    print("📊 Result plot saved to 'inference_result.png'")


if __name__ == "__main__":
    main()

View File

@@ -1,6 +1,8 @@
import sys import sys
import os import os
import logging import logging
import json
import pickle
import hydra import hydra
import torch import torch
from tqdm import tqdm from tqdm import tqdm
@@ -103,6 +105,46 @@ def main(cfg: DictConfig):
log.error(f"❌ Failed to initialize agent: {e}") log.error(f"❌ Failed to initialize agent: {e}")
raise raise
# =========================================================================
# 2.5. Save Dataset Statistics as JSON
# =========================================================================
log.info("💾 Saving dataset statistics...")
try:
# Get dataset_dir from config
dataset_dir = cfg.data.get('dataset_dir', 'roboimi/demos/dataset/sim_transfer')
stats_path = Path(dataset_dir) / 'data_stats.pkl'
if stats_path.exists():
# Load pickle file
with open(stats_path, 'rb') as f:
stats = pickle.load(f)
# Extract action statistics
action_mean = stats['action']['mean'].tolist() if 'action' in stats else []
action_std = stats['action']['std'].tolist() if 'action' in stats else []
qpos_mean = stats['qpos']['mean'].tolist() if 'qpos' in stats else []
qpos_std = stats['qpos']['std'].tolist() if 'qpos' in stats else []
# Save as JSON
json_stats = {
'action_mean': action_mean,
'action_std': action_std,
'qpos_mean': qpos_mean,
'qpos_std': qpos_std
}
json_path = checkpoint_dir / 'dataset_stats.json'
with open(json_path, 'w') as f:
json.dump(json_stats, f, indent=2)
log.info(f"✅ Dataset statistics saved to {json_path}")
else:
log.warning(f"⚠️ Statistics file not found: {stats_path}")
log.warning("⚠️ Actions will not be denormalized during inference!")
except Exception as e:
log.warning(f"⚠️ Failed to save statistics as JSON: {e}")
log.warning("⚠️ Training will continue, but inference may not work correctly")
# ========================================================================= # =========================================================================
# 3. Setup Optimizer # 3. Setup Optimizer
# ========================================================================= # =========================================================================

View File

@@ -0,0 +1,239 @@
# VLA Evaluation Guide
This guide explains how to evaluate a trained Vision-Language-Action (VLA) policy in the MuJoCo simulation environment.
## Prerequisites
1. **Trained Model**: Train your VLA model first using `train_vla.py`
2. **Checkpoints**: Ensure you have saved model checkpoints in `checkpoints/` directory
3. **Dependencies**: Install required dependencies:
```bash
pip install opencv-python tqdm
```
## Quick Start
### Basic Evaluation
```bash
# Evaluate with default settings (3 episodes)
python roboimi/demos/eval_vla.py \
--ckpt_path checkpoints/vla_model_best.pt
# Evaluate with custom settings
python roboimi/demos/eval_vla.py \
--ckpt_path checkpoints/vla_model_step_5000.pt \
--num_episodes 5 \
--max_timesteps 700 \
--camera_names r_vis top angle \
--num_queries 1 \
--obs_horizon 2
```
### Parameters
| Parameter | Description | Default |
|-----------|-------------|---------|
| `--ckpt_path` | Path to model checkpoint (.pt file) | Required |
| `--num_episodes` | Number of evaluation episodes | 3 |
| `--max_timesteps` | Maximum timesteps per episode | 700 |
| `--device` | Device for inference (`cuda` or `cpu`) | `cuda` |
| `--camera_names` | Camera names to use (space-separated) | `r_vis top` |
| `--num_queries` | Policy query frequency (every N timesteps) | 16 |
| `--obs_horizon` | Observation history length | 2 |
| `--no_video` | Disable video saving | False |
## Usage Details
### Policy Query Frequency
The `--num_queries` parameter controls how often the policy is queried:
- `--num_queries 1`: Query every timestep (most accurate)
- `--num_queries 4`: Query every 4 timesteps (faster, but uses cached actions)
When using cached actions (num_queries > 1), the policy predicts a chunk of actions (pred_horizon=16), and these actions are executed sequentially until the next query.
### Camera Selection
Available cameras depend on your environment:
- `r_vis`: Right arm RealSense camera
- `top`: Top-down view camera
- `angle`: Angled view camera
Use `--camera_names` to specify which cameras to use:
```bash
--camera_names r_vis top # Use 2 cameras
--camera_names r_vis top angle # Use all 3 cameras
```
### Observation Horizon
The `--obs_horizon` parameter determines how many past observations to use as context:
```bash
--obs_horizon 1 # Use only current observation
--obs_horizon 2 # Use current + 1 past observation (default)
--obs_horizon 4 # Use current + 3 past observations
```
**Note**: Must match the value used during training.
## Output
### Console Output
During evaluation, you'll see:
```
============================================================
Episode 1/3
============================================================
Episode 1: 100%|████████████████████| 700/700 [02:30<00:00, 4.64it/s]
✅ Task completed at timestep 453!
Episode 1 Summary:
Success: True
Success Timestep: 453
Length: 453 timesteps
Video saved: outputs/eval_vla_episode_0.mp4
```
### Video Output
Videos are saved to `outputs/eval_vla_episode_{N}.mp4` showing the robot's execution.
### Metrics
- **Success**: Whether the task reached its success condition (reward 1.0)
- **Success Timestep**: Timestep at which success was first achieved
- **Length**: Number of timesteps executed
## Action Smoothing
The evaluator supports optional EMA (Exponential Moving Average) smoothing to reduce jitter (disabled by default via `use_smoothing=False`):
```python
# Default smoothing parameters
smooth_method = 'ema'
smooth_alpha = 0.3 # Lower = more smoothing
```
To disable or modify smoothing, edit the `evaluate_policy()` call in `eval_vla.py`:
```python
evaluator = VLAEvaluator(
agent=agent,
use_smoothing=False, # Disable smoothing
# or
smooth_method='moving_avg', # Use different method
smooth_alpha=0.5 # Adjust smoothing strength
)
```
## Troubleshooting
### Issue: Checkpoint not found
```
FileNotFoundError: Checkpoint not found: checkpoints/vla_model_best.pt
```
**Solution**: Ensure you've trained the model and checkpoints exist:
```bash
ls -la checkpoints/
# Should show: vla_model_best.pt, vla_model_final.pt, etc.
```
### Issue: CUDA out of memory
**Solution**: Use CPU for inference:
```bash
python eval_vla.py --ckpt_path checkpoints/vla_model_best.pt --device cpu
```
### Issue: Camera names don't match
**Solution**: Check your HDF5 files for available cameras:
```python
import h5py
with h5py.File('roboimi/demos/dataset/sim_transfer/episode_0.hdf5', 'r') as f:
print(list(f['observations/images'].keys()))
# Output: ['angle', 'r_vis', 'top']
```
Then use the correct camera names in your eval command.
### Issue: Mismatched obs_horizon
```
RuntimeError: Tensor shape mismatch
```
**Solution**: Ensure `--obs_horizon` matches the training config (`data.obs_horizon`).
## Advanced Usage
### Custom Evaluation Script
You can also use the evaluator in your own scripts:
```python
from roboimi.demos.eval_vla import VLAEvaluator, load_checkpoint
from roboimi.envs.double_pos_ctrl_env import make_sim_env
# Load model
agent = load_checkpoint('checkpoints/vla_model_best.pt')
# Create evaluator
evaluator = VLAEvaluator(
agent=agent,
device='cuda',
camera_names=['r_vis', 'top'],
num_queries=1,
obs_horizon=2
)
# Create environment
env = make_sim_env('sim_transfer')
env.reset()
evaluator.reset()
# Run episode
obs = env._get_image_obs()
obs['qpos'] = env._get_qpos_obs()['qpos']
# Predict and execute action
action = evaluator.predict_action(obs)
env.step_jnt(action)
```
### Batch Evaluation
Evaluate multiple checkpoints:
```bash
for ckpt in checkpoints/vla_model_step_*.pt; do
echo "Evaluating $ckpt"
python roboimi/demos/eval_vla.py \
--ckpt_path "$ckpt" \
--num_episodes 1 \
--no_video
done
```
## Next Steps
1. **Train your model**: See [RESNET_TRAINING_GUIDE.md](roboimi/vla/RESNET_TRAINING_GUIDE.md)
2. **Evaluate performance**: Use this evaluation script
3. **Analyze results**: Compare different checkpoints
4. **Deploy to real robot**: Adapt the evaluator for real robot control
## References
- Training Guide: [roboimi/vla/RESNET_TRAINING_GUIDE.md](roboimi/vla/RESNET_TRAINING_GUIDE.md)
- Project Documentation: [CLAUDE.md](CLAUDE.md)
- Original ACT Paper: https://arxiv.org/abs/2304.13705
- Diffusion Policy: https://diffusion-policy.cs.columbia.edu/

View File

@@ -1,8 +1,10 @@
import torch import torch
import torch.nn as nn import torch.nn as nn
import numpy as np
from typing import Dict, Optional, Any from typing import Dict, Optional, Any
from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
from diffusers.schedulers.scheduling_ddim import DDIMScheduler
from roboimi.vla.models.heads.diffusion import ConditionalUnet1D from roboimi.vla.models.heads.diffusion import ConditionalUnet1D
class VLAAgent(nn.Module): class VLAAgent(nn.Module):
@@ -18,6 +20,13 @@ class VLAAgent(nn.Module):
num_cams=2, # 视觉输入的摄像头数量 num_cams=2, # 视觉输入的摄像头数量
): ):
super().__init__() super().__init__()
# Store parameters
self.action_dim = action_dim
self.obs_dim = obs_dim
self.pred_horizon = pred_horizon
self.obs_horizon = obs_horizon
self.num_cams = num_cams
self.vision_encoder = vision_backbone self.vision_encoder = vision_backbone
single_img_feat_dim = self.vision_encoder.output_dim single_img_feat_dim = self.vision_encoder.output_dim
total_vision_dim = single_img_feat_dim * num_cams * obs_horizon total_vision_dim = single_img_feat_dim * num_cams * obs_horizon
@@ -31,6 +40,14 @@ class VLAAgent(nn.Module):
prediction_type='epsilon' # 预测噪声 prediction_type='epsilon' # 预测噪声
) )
# DDIM scheduler for faster inference
self.infer_scheduler = DDIMScheduler(
num_train_timesteps=diffusion_steps,
beta_schedule='squaredcos_cap_v2',
clip_sample=True,
prediction_type='epsilon'
)
self.noise_pred_net = ConditionalUnet1D( self.noise_pred_net = ConditionalUnet1D(
input_dim=action_dim, input_dim=action_dim,
global_cond_dim=self.global_cond_dim global_cond_dim=self.global_cond_dim
@@ -70,18 +87,12 @@ class VLAAgent(nn.Module):
) )
# 6. 网络预测噪声 # 6. 网络预测噪声
# 注意U-Net 1D 通常期望 channel 在中间: (B, C, T)
# noisy_actions_inp = noisy_actions.permute(0, 2, 1)
pred_noise = self.noise_pred_net( pred_noise = self.noise_pred_net(
sample=noisy_actions, sample=noisy_actions,
timestep=timesteps, timestep=timesteps,
global_cond=global_cond global_cond=global_cond
) )
# 还原维度 (B, T, C)
pred_noise = pred_noise.permute(0, 2, 1)
# 7. 计算 Loss (MSE) # 7. 计算 Loss (MSE)
loss = nn.functional.mse_loss(pred_noise, noise) loss = nn.functional.mse_loss(pred_noise, noise)
return loss return loss
@@ -96,20 +107,27 @@ class VLAAgent(nn.Module):
# 1. 提取当前观测特征 (只做一次) # 1. 提取当前观测特征 (只做一次)
visual_features = self.vision_encoder(images).view(B, -1) visual_features = self.vision_encoder(images).view(B, -1)
proprioception = proprioception.view(B, -1) proprioception = proprioception.view(B, -1)
if hasattr(self, 'qpos_mean') and hasattr(self, 'qpos_std') and self.qpos_mean is not None:
# Convert to tensor for normalization
qpos_mean = torch.from_numpy(self.qpos_mean).float().to(proprioception.device)
qpos_std = torch.from_numpy(self.qpos_std).float().to(proprioception.device)
qpos_mean = qpos_mean.repeat(2)
qpos_std = qpos_std.repeat(2)
# Normalize: (qpos - mean) / std
proprioception = (proprioception - qpos_mean.unsqueeze(0)) / qpos_std.unsqueeze(0)
global_cond = torch.cat([visual_features, proprioception], dim=-1) global_cond = torch.cat([visual_features, proprioception], dim=-1)
# 2. 初始化纯高斯噪声动作 # 2. 初始化纯高斯噪声动作
# Shape: (B, Horizon, Action_Dim) # Shape: (B, pred_horizon, action_dim)
current_actions = torch.randn( current_actions = torch.randn(
(B, 16, 7), device=global_cond.device (B, self.pred_horizon, self.action_dim), device=global_cond.device
) )
# 3. 逐步去噪循环 (Reverse Diffusion) # 3. 逐步去噪循环 (Reverse Diffusion)
self.noise_scheduler.set_timesteps(10) # 推理时可以用更少步加速 (如 DDIM) self.infer_scheduler.set_timesteps(10) # DDIM 推理步数
for t in self.noise_scheduler.timesteps: for t in self.infer_scheduler.timesteps:
# 调整输入格式适应 1D CNN model_input = current_actions
model_input = current_actions.permute(0, 2, 1)
# 预测噪声 # 预测噪声
noise_pred = self.noise_pred_net( noise_pred = self.noise_pred_net(
@@ -117,12 +135,19 @@ class VLAAgent(nn.Module):
timestep=t, timestep=t,
global_cond=global_cond global_cond=global_cond
) )
# noise_pred = noise_pred.permute(0, 2, 1)
# 移除噪声,更新 current_actions # 移除噪声,更新 current_actions
current_actions = self.noise_scheduler.step( current_actions = self.infer_scheduler.step(
noise_pred, t, current_actions noise_pred, t, current_actions
).prev_sample ).prev_sample
# 4. 输出最终动作序列 # 4. 反归一化动作 (Denormalize actions)
if hasattr(self, 'action_mean') and hasattr(self, 'action_std') and self.action_mean is not None:
# Convert to numpy for denormalization
action_mean = torch.from_numpy(self.action_mean).float().to(current_actions.device)
action_std = torch.from_numpy(self.action_std).float().to(current_actions.device)
# Denormalize: action * std + mean
current_actions = current_actions * action_std.unsqueeze(0).unsqueeze(0) + action_mean.unsqueeze(0).unsqueeze(0)
# 5. 输出最终动作序列
return current_actions # 返回去噪后的干净动作 return current_actions # 返回去噪后的干净动作

View File

@@ -4,10 +4,10 @@ defaults:
- data: resnet_dataset - data: resnet_dataset
train: train:
batch_size: 8 # Batch size for training batch_size: 32 # Batch size for training
lr: 1e-4 # Learning rate lr: 1e-4 # Learning rate
max_steps: 10000 # Maximum training steps max_steps: 20000 # Maximum training steps
log_freq: 100 # Log frequency (steps) log_freq: 100 # Log frequency (steps)
save_freq: 1000 # Save checkpoint frequency (steps) save_freq: 2000 # Save checkpoint frequency (steps)
device: "cuda" # Device: "cuda" or "cpu" device: "cuda" # Device: "cuda" or "cpu"
num_workers: 8 # DataLoader workers (set to 0 for debugging, 8 for production) num_workers: 8 # DataLoader workers (set to 0 for debugging, 8 for production)

View File

@@ -11,7 +11,7 @@ class RobotDiffusionDataset(Dataset):
def __init__(self, def __init__(self,
dataset_dir, dataset_dir,
pred_horizon=16, pred_horizon=16,
obs_horizon=1, obs_horizon=2,
action_horizon=8, action_horizon=8,
camera_names=['r_vis', 'top'], camera_names=['r_vis', 'top'],
normalization_type='gaussian'): normalization_type='gaussian'):