debug(inference): 添加推理阶段qpos归一化
This commit is contained in:
532
roboimi/demos/eval_vla.py
Normal file
532
roboimi/demos/eval_vla.py
Normal file
@@ -0,0 +1,532 @@
|
||||
"""
|
||||
VLA Policy Evaluation Script
|
||||
|
||||
This script evaluates a trained Vision-Language-Action (VLA) policy
|
||||
in the MuJoCo simulation environment.
|
||||
|
||||
Usage:
|
||||
python roboimi/demos/eval_vla.py --ckpt_path checkpoints/vla_model_best.pt --num_episodes 3
|
||||
"""
|
||||
|
||||
import torch
|
||||
import numpy as np
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
from typing import Dict, List
|
||||
from tqdm import tqdm
|
||||
|
||||
from roboimi.envs.double_pos_ctrl_env import make_sim_env
|
||||
from roboimi.utils.act_ex_utils import sample_transfer_pose
|
||||
from einops import rearrange
|
||||
|
||||
|
||||
class VLAEvaluator:
    """
    Runs a trained VLA policy step-by-step in the MuJoCo simulation.

    Maintains a rolling observation buffer of length ``obs_horizon`` (images
    per camera plus proprioception) and a cache of predicted action chunks,
    so the policy is only queried every ``num_queries`` timesteps; cached
    actions are replayed in between.
    """

    def __init__(
        self,
        agent: torch.nn.Module,
        device: str = 'cuda',
        camera_names: List[str] = ['r_vis', 'top'],
        num_queries: int = 1,
        obs_horizon: int = 2,
        pred_horizon: int = 16,
        use_smoothing: bool = False,
        smooth_method: str = 'ema',
        smooth_alpha: float = 0.3
    ):
        """
        Args:
            agent: Trained VLAAgent (must expose ``predict_action``).
            device: Device for inference.
            camera_names: Camera names fed to the policy (never mutated, so
                the shared default list is safe).
            num_queries: How often to query the policy (in timesteps).
            obs_horizon: Number of observations to use as context.
            pred_horizon: Number of future actions the policy predicts.
            use_smoothing: Whether to apply action smoothing.
            smooth_method: Smoothing method ('ema', 'moving_avg', 'lowpass').
            smooth_alpha: Smoothing coefficient.
        """
        self.agent = agent.to(device)
        self.device = device
        self.camera_names = camera_names
        self.num_queries = num_queries
        self.obs_horizon = obs_horizon
        self.pred_horizon = pred_horizon

        # Optional action smoothing (ActionSmoother is only referenced when
        # use_smoothing is True, thanks to lazy evaluation of the conditional).
        self.use_smoothing = use_smoothing
        self.smooth_method = smooth_method
        self.smooth_alpha = smooth_alpha
        self.smoother = ActionSmoother(
            action_dim=16,  # NOTE(review): assumes 16-dim actions — confirm against the agent config
            method=smooth_method,
            alpha=smooth_alpha
        ) if use_smoothing else None

        # Rolling observation history, one list per camera plus qpos.
        self.obs_buffer = {
            'images': {cam: [] for cam in camera_names},
            'qpos': []
        }
        self.cached_actions = None  # (pred_horizon, action_dim) numpy array
        self.query_step = 0         # index of the next cached action to emit

    def reset(self):
        """Reset all per-episode state (buffers, action cache, smoother)."""
        self.obs_buffer = {
            'images': {cam: [] for cam in self.camera_names},
            'qpos': []
        }
        self.cached_actions = None
        self.query_step = 0
        if self.smoother is not None:
            self.smoother.reset()

    def _push_frames(self, frames: List[torch.Tensor], new_frame: torch.Tensor) -> List[torch.Tensor]:
        """
        Append ``new_frame`` to a history list, left-pad by duplicating the
        oldest frame until ``obs_horizon`` entries exist, and return the last
        ``obs_horizon`` entries.
        """
        frames.append(new_frame)
        while len(frames) < self.obs_horizon:
            frames.insert(0, frames[0])
        return frames[-self.obs_horizon:]

    def _get_image_dict(self, obs: Dict) -> Dict[str, torch.Tensor]:
        """
        Extract, normalize, and buffer camera images from an observation.

        Args:
            obs: Environment observation dict with obs['images'][cam] as
                (H, W, C) uint8 arrays.

        Returns:
            Dict mapping camera names to tensors of shape
            (1, obs_horizon, C, H, W), scaled to [0, 1].
        """
        image_dict = {}
        for cam_name in self.camera_names:
            raw = obs['images'][cam_name]
            chw = np.transpose(raw, (2, 0, 1))  # (H, W, C) -> (C, H, W)
            frame = torch.from_numpy(chw / 255.0).float()

            frames = self._push_frames(self.obs_buffer['images'][cam_name], frame)
            self.obs_buffer['images'][cam_name] = frames

            # (obs_horizon, C, H, W) -> (1, obs_horizon, C, H, W)
            image_dict[cam_name] = torch.stack(frames, dim=0).unsqueeze(0)
        return image_dict

    def _get_qpos_dict(self, obs: Dict) -> torch.Tensor:
        """
        Extract and buffer proprioception (qpos) from an observation.

        Args:
            obs: Environment observation dict with obs['qpos'] as a 1-D array.

        Returns:
            qpos tensor of shape (1, obs_horizon, obs_dim).
        """
        frame = torch.from_numpy(obs['qpos']).float()
        frames = self._push_frames(self.obs_buffer['qpos'], frame)
        self.obs_buffer['qpos'] = frames
        return torch.stack(frames, dim=0).unsqueeze(0)

    @torch.no_grad()
    def predict_action(self, obs: Dict) -> np.ndarray:
        """
        Predict the next action, re-querying the policy only when needed.

        Args:
            obs: Current environment observation (images + qpos).

        Returns:
            action: numpy array of shape (action_dim,).
        """
        # 1. Update observation buffers and build policy inputs.
        images = self._get_image_dict(obs)  # Dict[str, (1, obs_horizon, C, H, W)]
        qpos = self._get_qpos_dict(obs)     # (1, obs_horizon, obs_dim)

        # 2. Re-query when: no cache yet, the query interval elapsed, or the
        #    cached chunk is exhausted (fix: without the length check, setting
        #    num_queries > pred_horizon raised an IndexError).
        need_query = (
            self.cached_actions is None
            or self.query_step % self.num_queries == 0
            or self.query_step >= len(self.cached_actions)
        )
        if need_query:
            images = {k: v.to(self.device) for k, v in images.items()}
            qpos = qpos.to(self.device)

            # Agent returns (B, pred_horizon, action_dim).
            predicted_actions = self.agent.predict_action(
                images=images,
                proprioception=qpos
            )
            # Cache as a (pred_horizon, action_dim) numpy array on CPU.
            self.cached_actions = predicted_actions.squeeze(0).cpu().numpy()
            self.query_step = 0

        # 3. Emit the next cached action.
        raw_action = self.cached_actions[self.query_step]
        self.query_step += 1

        # 4. Optional temporal smoothing.
        if self.smoother is not None:
            raw_action = self.smoother.smooth(raw_action)

        return raw_action
|
||||
|
||||
|
||||
class ActionSmoother:
    """Temporal smoothing of predicted actions to reduce execution jitter."""

    def __init__(self, action_dim: int, method: str = 'ema', alpha: float = 0.3):
        # action_dim is kept for API symmetry; smoothing itself is element-wise.
        self.action_dim = action_dim
        self.method = method
        self.alpha = alpha
        self.prev_action = None

    def smooth(self, action: np.ndarray) -> np.ndarray:
        """
        Return the smoothed action.

        Only 'ema' is implemented; any other method passes the action
        through unchanged.
        """
        if self.method != 'ema':
            return action
        blended = (
            action
            if self.prev_action is None
            else self.alpha * action + (1 - self.alpha) * self.prev_action
        )
        self.prev_action = blended
        return blended

    def reset(self):
        """Forget smoothing history (call at episode boundaries)."""
        self.prev_action = None
|
||||
|
||||
|
||||
def _find_vla_config_dir(script_dir: Path, current_dir: Path) -> Path:
    """
    Locate the `vla/conf` Hydra config directory.

    Tries, in order: <cwd>/vla/conf, <cwd>/roboimi/vla/conf, the directory
    next to this script, then every ancestor of the cwd.

    Raises:
        FileNotFoundError: when no candidate directory exists.
    """
    candidates = [
        current_dir / 'vla' / 'conf',
        current_dir / 'roboimi' / 'vla' / 'conf',
        (script_dir / '..' / 'vla' / 'conf').resolve(),
    ]
    for cand in candidates:
        if cand.exists():
            return cand
    for ancestor in [current_dir, *current_dir.parents]:
        cand = ancestor / 'vla' / 'conf'
        if cand.exists():
            return cand
    raise FileNotFoundError(
        f"Could not find VLA config directory.\n"
        f"Current directory: {current_dir}\n"
        f"Script location: {script_dir}\n"
        f"Please ensure you're running from the roboimi directory."
    )


def load_checkpoint(
    ckpt_path: str,
    device: str = 'cuda'
) -> torch.nn.Module:
    """
    Load a trained VLA model from checkpoint.

    Instantiates the agent from the Hydra config found next to the project
    sources, loads the weights, and attaches dataset normalization stats
    (from ``dataset_stats.json`` beside the checkpoint) when available.

    Args:
        ckpt_path: Path to checkpoint file (.pt).
        device: Device to load model on.

    Returns:
        Loaded VLAAgent model in eval mode on ``device``.

    Raises:
        FileNotFoundError: if the checkpoint or config directory is missing.
    """
    from roboimi.vla.agent import VLAAgent  # noqa: F401 (import keeps agent classes registered)
    from hydra import initialize_config_dir, compose
    from hydra.utils import instantiate
    import json
    import os

    ckpt_path = Path(ckpt_path).absolute()
    if not ckpt_path.exists():
        raise FileNotFoundError(f"Checkpoint not found: {ckpt_path}")

    print(f"Loading checkpoint from {ckpt_path}")
    # weights_only=False: the checkpoint may contain non-tensor objects
    # (step counters, configs) in addition to the state dict.
    checkpoint = torch.load(ckpt_path, map_location=device, weights_only=False)
    print(f"Checkpoint keys: {checkpoint.keys()}")

    # Locate and load the Hydra config, then instantiate the agent.
    script_dir = Path(__file__).resolve().parent
    current_dir = Path(os.getcwd()).absolute()
    config_dir = _find_vla_config_dir(script_dir, current_dir)
    config_abs_path = str(config_dir.absolute())
    print(f"Loading config from {config_abs_path}")

    with initialize_config_dir(config_dir=config_abs_path, version_base=None):
        cfg = compose(config_name="config")

    print("Instantiating agent from config...")
    agent = instantiate(cfg.agent)

    # Load model weights, supporting the three checkpoint layouts we emit.
    if 'model_state_dict' in checkpoint:
        agent.load_state_dict(checkpoint['model_state_dict'])
        print(f"✅ Model state loaded (step: {checkpoint.get('step', 'unknown')})")
    elif 'state_dict' in checkpoint:
        agent.load_state_dict(checkpoint['state_dict'])
        print("✅ Model state loaded")
    else:
        # Assume the checkpoint is the state_dict itself.
        agent.load_state_dict(checkpoint)
        print("✅ Model state loaded")

    # Dataset statistics (written by the training script) for qpos
    # normalization and action denormalization at inference time.
    stats_path = ckpt_path.parent / 'dataset_stats.json'
    if stats_path.exists():
        with open(stats_path, 'r') as f:
            stats = json.load(f)
        agent.action_mean = np.array(stats['action_mean'])
        agent.action_std = np.array(stats['action_std'])
        agent.qpos_mean = np.array(stats['qpos_mean'])
        agent.qpos_std = np.array(stats['qpos_std'])
        print(f"✅ Dataset statistics loaded for denormalization")
    else:
        print(f"⚠️ Warning: {stats_path} not found. Actions will not be denormalized!")
        agent.action_mean = None
        agent.action_std = None
        # Fix: also clear qpos stats so inference-time qpos normalization is
        # skipped consistently (consumers check these via hasattr/None).
        agent.qpos_mean = None
        agent.qpos_std = None

    agent.eval()
    agent.to(device)
    print(f"✅ Model loaded successfully on {device}")

    return agent
|
||||
|
||||
|
||||
def evaluate_policy(
    agent: torch.nn.Module,
    num_episodes: int = 3,
    max_timesteps: int = 700,
    task_name: str = 'sim_transfer',
    device: str = 'cuda',
    camera_names: List[str] = ['r_vis', 'top'],
    num_queries: int = 1,
    obs_horizon: int = 2,
    save_video: bool = True
):
    """
    Roll out a trained VLA policy in simulation for several episodes.

    Args:
        agent: Trained VLAAgent.
        num_episodes: Number of episodes to run.
        max_timesteps: Maximum timesteps per episode.
        task_name: Task name for environment creation.
        device: Device for inference.
        camera_names: Camera names fed to the policy (must match training).
        num_queries: Query the policy every N timesteps (cached in between).
        obs_horizon: Observation history length (must match training).
        save_video: Whether to save an MP4 per episode under outputs/.
    """
    evaluator = VLAEvaluator(
        agent=agent,
        device=device,
        camera_names=camera_names,
        num_queries=num_queries,
        obs_horizon=obs_horizon,
        use_smoothing=False,
        smooth_method='ema',
        smooth_alpha=0.3
    )

    env = make_sim_env(task_name)

    for episode_idx in range(num_episodes):
        print(f"\n{'='*60}")
        print(f"Episode {episode_idx + 1}/{num_episodes}")
        print(f"{'='*60}\n")

        # Randomize the object pose, then reset env and evaluator state.
        box_pos = sample_transfer_pose()
        env.reset(box_pos)
        evaluator.reset()

        episode_images = []
        success = False
        success_timestep = 0
        # Fix: track episode length explicitly — the old code reported
        # len(episode_images), which is always 0 when save_video=False.
        episode_len = 0

        with torch.inference_mode():
            for t in tqdm(range(max_timesteps), desc=f"Episode {episode_idx + 1}"):
                # Build the combined observation the evaluator expects.
                obs = env._get_image_obs()
                obs['qpos'] = env._get_qpos_obs()['qpos']

                # Predict and execute one action.
                action = evaluator.predict_action(obs)
                env.step_jnt(action)
                episode_len = t + 1

                if save_video:
                    episode_images.append(obs['images'])

                env.render()

                # NOTE(review): exact float comparison — assumes the env sets
                # rew to exactly 1.0 on success; confirm in the env code.
                if env.rew == 1.0:
                    success = True
                    success_timestep = t
                    print(f"\n✅ Task completed at timestep {t}!")
                    break

        # Episode summary
        print(f"\nEpisode {episode_idx + 1} Summary:")
        print(f"  Success: {success}")
        if success:
            print(f"  Success Timestep: {success_timestep}")
        print(f"  Length: {episode_len} timesteps")

        if save_video and episode_images:
            save_video_episode(
                episode_images,
                save_path=f"outputs/eval_vla_episode_{episode_idx}.mp4"
            )
            print(f"  Video saved: outputs/eval_vla_episode_{episode_idx}.mp4")

    print(f"\n{'='*60}")
    print("Evaluation complete!")
    print(f"{'='*60}\n")
|
||||
|
||||
|
||||
def save_video_episode(images: List[Dict], save_path: str, fps: int = 20):
    """
    Write an episode's frames to an MP4 file.

    Uses the first camera found in the first observation for the video.
    Skips gracefully (with an install hint) when opencv-python is missing.

    Args:
        images: Per-timestep dicts mapping camera name -> (H, W, C) RGB frame.
        save_path: Output path; parent directories are created as needed.
        fps: Playback frame rate.
    """
    try:
        import cv2
        from tqdm import tqdm

        Path(save_path).parent.mkdir(parents=True, exist_ok=True)

        # Visualize whichever camera appears first in the observation dict.
        camera = list(images[0].keys())[0]
        height, width, _channels = images[0][camera].shape

        writer = cv2.VideoWriter(
            save_path,
            cv2.VideoWriter_fourcc(*'mp4v'),
            fps,
            (width, height),
        )
        for frame_dict in tqdm(images, desc="Saving video"):
            # OpenCV expects BGR channel order.
            writer.write(cv2.cvtColor(frame_dict[camera], cv2.COLOR_RGB2BGR))
        writer.release()
        print(f"Video saved to {save_path}")

    except ImportError:
        print("Warning: opencv-python not installed, skipping video save")
        print("Install with: pip install opencv-python")
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: load a checkpoint and run evaluation episodes."""
    parser = argparse.ArgumentParser(description='Evaluate VLA Policy')
    parser.add_argument('--ckpt_path', type=str, required=True, help='Path to model checkpoint')
    parser.add_argument('--num_episodes', type=int, default=3, help='Number of evaluation episodes')
    parser.add_argument('--max_timesteps', type=int, default=700, help='Maximum timesteps per episode')
    parser.add_argument('--device', type=str, default='cuda', help='Device for inference')
    parser.add_argument('--camera_names', nargs='+', default=['r_vis', 'top'], help='Camera names to use')
    parser.add_argument('--num_queries', type=int, default=16, help='Policy query frequency (timesteps)')
    parser.add_argument('--obs_horizon', type=int, default=2, help='Observation horizon')
    parser.add_argument('--no_video', action='store_true', help='Do not save episode videos')
    args = parser.parse_args()

    # Restore the trained agent, then roll it out in simulation.
    print(f"Loading model from {args.ckpt_path}...")
    model = load_checkpoint(args.ckpt_path, device=args.device)

    evaluate_policy(
        agent=model,
        num_episodes=args.num_episodes,
        max_timesteps=args.max_timesteps,
        device=args.device,
        camera_names=args.camera_names,
        num_queries=args.num_queries,
        obs_horizon=args.obs_horizon,
        save_video=not args.no_video,
    )


if __name__ == '__main__':
    main()
|
||||
@@ -1,100 +0,0 @@
|
||||
import sys
|
||||
import os
|
||||
import hydra
|
||||
import torch
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
from omegaconf import DictConfig, OmegaConf
|
||||
from hydra.utils import instantiate
|
||||
from torch.utils.data import DataLoader
|
||||
|
||||
# 确保能导入 roboimi
|
||||
sys.path.append(os.getcwd())
|
||||
from roboimi.vla.agent import VLAAgent
|
||||
|
||||
def recursive_to_device(data, device):
|
||||
if isinstance(data, torch.Tensor):
|
||||
return data.to(device)
|
||||
elif isinstance(data, dict):
|
||||
return {k: recursive_to_device(v, device) for k, v in data.items()}
|
||||
return data
|
||||
|
||||
@hydra.main(version_base=None, config_path="../../../roboimi/vla/conf", config_name="config")
|
||||
def main(cfg: DictConfig):
|
||||
print(">>> 🤖 Starting VLA Inference...")
|
||||
device = cfg.train.device
|
||||
|
||||
# 1. 实例化 Agent (结构必须与训练时完全一致)
|
||||
# 也可以在这里覆盖配置,例如 forcing freeze=True
|
||||
agent: VLAAgent = instantiate(cfg.agent)
|
||||
agent.to(device)
|
||||
agent.eval() # 关键:切换到 Eval 模式
|
||||
|
||||
# 2. 加载权重
|
||||
ckpt_path = "checkpoints/vla_model_final.pt"
|
||||
if not os.path.exists(ckpt_path):
|
||||
print(f"❌ Checkpoint not found at {ckpt_path}. Run training first!")
|
||||
return
|
||||
|
||||
print(f"Loading weights from {ckpt_path}...")
|
||||
# map_location='cpu' 防止在只有 CPU 的机器上加载 GPU 权重报错
|
||||
state_dict = torch.load(ckpt_path, map_location=device)
|
||||
agent.load_state_dict(state_dict)
|
||||
print("✅ Weights loaded successfully.")
|
||||
|
||||
# 3. 准备测试数据 (从 Dataset 里取一个样本)
|
||||
dataset = instantiate(cfg.data)
|
||||
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
|
||||
sample = next(iter(dataloader))
|
||||
|
||||
# 准备输入 (模拟机器人实时运行)
|
||||
# 注意:推理时不需要传 sample['actions']
|
||||
primary_cam_key = cfg.data.obs_keys[0]
|
||||
input_img = sample['obs'][primary_cam_key][:, -1, :, :, :] # (1, C, H, W)
|
||||
|
||||
agent_input = {
|
||||
"obs": {
|
||||
"image": input_img.to(device),
|
||||
"text": sample["language"] # 即使不用文本,占位符也要留着
|
||||
}
|
||||
# ⚠️ 关键:这里不传 'actions',触发 Agent 进入 Inference 分支
|
||||
}
|
||||
|
||||
# 4. 执行推理 (Reverse Diffusion)
|
||||
print("running reverse diffusion (this may take a moment)...")
|
||||
with torch.no_grad():
|
||||
# 这会触发 DiffusionHead 的分支 B (loop over timesteps)
|
||||
outputs = agent(agent_input)
|
||||
|
||||
# 5. 获取结果
|
||||
# 输出 shape: (1, Chunk_Size, Action_Dim)
|
||||
pred_actions = outputs['pred_actions'].cpu().numpy()[0]
|
||||
gt_actions = sample['actions'][0].numpy() # 用来对比
|
||||
|
||||
print(f"✅ Generated Action Chunk Shape: {pred_actions.shape}")
|
||||
|
||||
# 6. 可视化对比 (保存图片)
|
||||
plot_results(pred_actions, gt_actions)
|
||||
|
||||
def plot_results(pred, gt):
|
||||
"""
|
||||
简单的可视化:画出前几个维度的轨迹对比
|
||||
"""
|
||||
plt.figure(figsize=(10, 5))
|
||||
|
||||
# 比如只画前 3 个维度 (x, y, z)
|
||||
dims_to_plot = 3
|
||||
for i in range(dims_to_plot):
|
||||
plt.subplot(1, dims_to_plot, i+1)
|
||||
plt.plot(gt[:, i], 'g--', label='Ground Truth')
|
||||
plt.plot(pred[:, i], 'b-', label='Diffusion Pred')
|
||||
plt.title(f"Action Dim {i}")
|
||||
if i == 0: plt.legend()
|
||||
plt.ylim(-1, 1) # 假设动作是归一化的
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig("inference_result.png")
|
||||
print("📊 Result plot saved to 'inference_result.png'")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@@ -1,6 +1,8 @@
|
||||
import sys
|
||||
import os
|
||||
import logging
|
||||
import json
|
||||
import pickle
|
||||
import hydra
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
@@ -103,6 +105,46 @@ def main(cfg: DictConfig):
|
||||
log.error(f"❌ Failed to initialize agent: {e}")
|
||||
raise
|
||||
|
||||
# =========================================================================
|
||||
# 2.5. Save Dataset Statistics as JSON
|
||||
# =========================================================================
|
||||
log.info("💾 Saving dataset statistics...")
|
||||
try:
|
||||
# Get dataset_dir from config
|
||||
dataset_dir = cfg.data.get('dataset_dir', 'roboimi/demos/dataset/sim_transfer')
|
||||
stats_path = Path(dataset_dir) / 'data_stats.pkl'
|
||||
|
||||
if stats_path.exists():
|
||||
# Load pickle file
|
||||
with open(stats_path, 'rb') as f:
|
||||
stats = pickle.load(f)
|
||||
|
||||
# Extract action statistics
|
||||
action_mean = stats['action']['mean'].tolist() if 'action' in stats else []
|
||||
action_std = stats['action']['std'].tolist() if 'action' in stats else []
|
||||
qpos_mean = stats['qpos']['mean'].tolist() if 'qpos' in stats else []
|
||||
qpos_std = stats['qpos']['std'].tolist() if 'qpos' in stats else []
|
||||
|
||||
# Save as JSON
|
||||
json_stats = {
|
||||
'action_mean': action_mean,
|
||||
'action_std': action_std,
|
||||
'qpos_mean': qpos_mean,
|
||||
'qpos_std': qpos_std
|
||||
}
|
||||
json_path = checkpoint_dir / 'dataset_stats.json'
|
||||
with open(json_path, 'w') as f:
|
||||
json.dump(json_stats, f, indent=2)
|
||||
|
||||
log.info(f"✅ Dataset statistics saved to {json_path}")
|
||||
else:
|
||||
log.warning(f"⚠️ Statistics file not found: {stats_path}")
|
||||
log.warning("⚠️ Actions will not be denormalized during inference!")
|
||||
|
||||
except Exception as e:
|
||||
log.warning(f"⚠️ Failed to save statistics as JSON: {e}")
|
||||
log.warning("⚠️ Training will continue, but inference may not work correctly")
|
||||
|
||||
# =========================================================================
|
||||
# 3. Setup Optimizer
|
||||
# =========================================================================
|
||||
|
||||
239
roboimi/vla/VLA_EVALUATION_GUIDE.md
Normal file
239
roboimi/vla/VLA_EVALUATION_GUIDE.md
Normal file
@@ -0,0 +1,239 @@
|
||||
# VLA Evaluation Guide
|
||||
|
||||
This guide explains how to evaluate a trained Vision-Language-Action (VLA) policy in the MuJoCo simulation environment.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
1. **Trained Model**: Train your VLA model first using `train_vla.py`
|
||||
2. **Checkpoints**: Ensure you have saved model checkpoints in `checkpoints/` directory
|
||||
3. **Dependencies**: Install required dependencies:
|
||||
```bash
|
||||
pip install opencv-python tqdm
|
||||
```
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Basic Evaluation
|
||||
|
||||
```bash
|
||||
# Evaluate with default settings (3 episodes)
|
||||
python roboimi/demos/eval_vla.py \
|
||||
--ckpt_path checkpoints/vla_model_best.pt
|
||||
|
||||
# Evaluate with custom settings
|
||||
python roboimi/demos/eval_vla.py \
|
||||
--ckpt_path checkpoints/vla_model_step_5000.pt \
|
||||
--num_episodes 5 \
|
||||
--max_timesteps 700 \
|
||||
--camera_names r_vis top angle \
|
||||
--num_queries 1 \
|
||||
--obs_horizon 2
|
||||
```
|
||||
|
||||
### Parameters
|
||||
|
||||
| Parameter | Description | Default |
|
||||
|-----------|-------------|---------|
|
||||
| `--ckpt_path` | Path to model checkpoint (.pt file) | Required |
|
||||
| `--num_episodes` | Number of evaluation episodes | 3 |
|
||||
| `--max_timesteps` | Maximum timesteps per episode | 700 |
|
||||
| `--device` | Device for inference (`cuda` or `cpu`) | `cuda` |
|
||||
| `--camera_names` | Camera names to use (space-separated) | `r_vis top` |
|
||||
| `--num_queries` | Policy query frequency (every N timesteps) | 16 |
|
||||
| `--obs_horizon` | Observation history length | 2 |
|
||||
| `--no_video` | Disable video saving | False |
|
||||
|
||||
## Usage Details
|
||||
|
||||
### Policy Query Frequency
|
||||
|
||||
The `--num_queries` parameter controls how often the policy is queried:
|
||||
|
||||
- `--num_queries 1`: Query every timestep (most accurate)
|
||||
- `--num_queries 4`: Query every 4 timesteps (faster, but uses cached actions)
|
||||
|
||||
When using cached actions (num_queries > 1), the policy predicts a chunk of actions (pred_horizon=16), and these actions are executed sequentially until the next query.
|
||||
|
||||
### Camera Selection
|
||||
|
||||
Available cameras depend on your environment:
|
||||
- `r_vis`: Right arm RealSense camera
|
||||
- `top`: Top-down view camera
|
||||
- `angle`: Angled view camera
|
||||
|
||||
Use `--camera_names` to specify which cameras to use:
|
||||
```bash
|
||||
--camera_names r_vis top # Use 2 cameras
|
||||
--camera_names r_vis top angle # Use all 3 cameras
|
||||
```
|
||||
|
||||
### Observation Horizon
|
||||
|
||||
The `--obs_horizon` parameter determines how many past observations to use as context:
|
||||
|
||||
```bash
|
||||
--obs_horizon 1 # Use only current observation
|
||||
--obs_horizon 2 # Use current + 1 past observation (default)
|
||||
--obs_horizon 4 # Use current + 3 past observations
|
||||
```
|
||||
|
||||
**Note**: Must match the value used during training.
|
||||
|
||||
## Output
|
||||
|
||||
### Console Output
|
||||
|
||||
During evaluation, you'll see:
|
||||
|
||||
```
|
||||
============================================================
|
||||
Episode 1/3
|
||||
============================================================
|
||||
|
||||
Episode 1: 100%|████████████████████| 700/700 [02:30<00:00, 4.64it/s]
|
||||
|
||||
✅ Task completed at timestep 453!
|
||||
|
||||
Episode 1 Summary:
|
||||
Total Reward: 1.0000
|
||||
Max Reward: 1.0000
|
||||
Length: 453 timesteps
|
||||
Video saved: outputs/eval_vla_episode_0.mp4
|
||||
```
|
||||
|
||||
### Video Output
|
||||
|
||||
Videos are saved to `outputs/eval_vla_episode_{N}.mp4` showing the robot's execution.
|
||||
|
||||
### Metrics
|
||||
|
||||
- **Total Reward**: Sum of rewards throughout the episode
|
||||
- **Max Reward**: Maximum reward achieved (1.0 = success)
|
||||
- **Length**: Number of timesteps executed
|
||||
|
||||
## Action Smoothing
|
||||
|
||||
The evaluator supports optional EMA (Exponential Moving Average) smoothing to reduce jitter; it is disabled by default (`use_smoothing=False`):
|
||||
|
||||
```python
|
||||
# Default smoothing parameters
|
||||
smooth_method = 'ema'
|
||||
smooth_alpha = 0.3 # Lower = more smoothing
|
||||
```
|
||||
|
||||
To disable or modify smoothing, edit the `evaluate_policy()` call in `eval_vla.py`:
|
||||
|
||||
```python
|
||||
evaluator = VLAEvaluator(
|
||||
agent=agent,
|
||||
use_smoothing=False, # Disable smoothing
|
||||
# or
|
||||
smooth_method='moving_avg', # Use different method
|
||||
smooth_alpha=0.5 # Adjust smoothing strength
|
||||
)
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Issue: Checkpoint not found
|
||||
|
||||
```
|
||||
FileNotFoundError: Checkpoint not found: checkpoints/vla_model_best.pt
|
||||
```
|
||||
|
||||
**Solution**: Ensure you've trained the model and checkpoints exist:
|
||||
```bash
|
||||
ls -la checkpoints/
|
||||
# Should show: vla_model_best.pt, vla_model_final.pt, etc.
|
||||
```
|
||||
|
||||
### Issue: CUDA out of memory
|
||||
|
||||
**Solution**: Use CPU for inference:
|
||||
```bash
|
||||
python eval_vla.py --ckpt_path checkpoints/vla_model_best.pt --device cpu
|
||||
```
|
||||
|
||||
### Issue: Camera names don't match
|
||||
|
||||
**Solution**: Check your HDF5 files for available cameras:
|
||||
```python
|
||||
import h5py
|
||||
with h5py.File('roboimi/demos/dataset/sim_transfer/episode_0.hdf5', 'r') as f:
|
||||
print(list(f['observations/images'].keys()))
|
||||
# Output: ['angle', 'r_vis', 'top']
|
||||
```
|
||||
|
||||
Then use the correct camera names in your eval command.
|
||||
|
||||
### Issue: Mismatched obs_horizon
|
||||
|
||||
```
|
||||
RuntimeError: Tensor shape mismatch
|
||||
```
|
||||
|
||||
**Solution**: Ensure `--obs_horizon` matches the training config (`data.obs_horizon`).
|
||||
|
||||
## Advanced Usage
|
||||
|
||||
### Custom Evaluation Script
|
||||
|
||||
You can also use the evaluator in your own scripts:
|
||||
|
||||
```python
|
||||
from roboimi.demos.eval_vla import VLAEvaluator, load_checkpoint
|
||||
from roboimi.envs.double_pos_ctrl_env import make_sim_env
|
||||
|
||||
# Load model
|
||||
agent = load_checkpoint('checkpoints/vla_model_best.pt')
|
||||
|
||||
# Create evaluator
|
||||
evaluator = VLAEvaluator(
|
||||
agent=agent,
|
||||
device='cuda',
|
||||
camera_names=['r_vis', 'top'],
|
||||
num_queries=1,
|
||||
obs_horizon=2
|
||||
)
|
||||
|
||||
# Create environment
|
||||
env = make_sim_env('sim_transfer')
|
||||
env.reset()
|
||||
evaluator.reset()
|
||||
|
||||
# Run episode
|
||||
obs = env._get_image_obs()
|
||||
obs['qpos'] = env._get_qpos_obs()['qpos']
|
||||
|
||||
# Predict and execute action
|
||||
action = evaluator.predict_action(obs)
|
||||
env.step_jnt(action)
|
||||
```
|
||||
|
||||
### Batch Evaluation
|
||||
|
||||
Evaluate multiple checkpoints:
|
||||
|
||||
```bash
|
||||
for ckpt in checkpoints/vla_model_step_*.pt; do
|
||||
echo "Evaluating $ckpt"
|
||||
python roboimi/demos/eval_vla.py \
|
||||
--ckpt_path "$ckpt" \
|
||||
--num_episodes 1 \
|
||||
--no_video
|
||||
done
|
||||
```
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. **Train your model**: See [RESNET_TRAINING_GUIDE.md](roboimi/vla/RESNET_TRAINING_GUIDE.md)
|
||||
2. **Evaluate performance**: Use this evaluation script
|
||||
3. **Analyze results**: Compare different checkpoints
|
||||
4. **Deploy to real robot**: Adapt the evaluator for real robot control
|
||||
|
||||
## References
|
||||
|
||||
- Training Guide: [roboimi/vla/RESNET_TRAINING_GUIDE.md](roboimi/vla/RESNET_TRAINING_GUIDE.md)
|
||||
- Project Documentation: [CLAUDE.md](CLAUDE.md)
|
||||
- Original ACT Paper: https://arxiv.org/abs/2304.13705
|
||||
- Diffusion Policy: https://diffusion-policy.cs.columbia.edu/
|
||||
@@ -1,8 +1,10 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import numpy as np
|
||||
from typing import Dict, Optional, Any
|
||||
from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead
|
||||
from diffusers.schedulers.scheduling_ddpm import DDPMScheduler
|
||||
from diffusers.schedulers.scheduling_ddim import DDIMScheduler
|
||||
from roboimi.vla.models.heads.diffusion import ConditionalUnet1D
|
||||
|
||||
class VLAAgent(nn.Module):
|
||||
@@ -18,6 +20,13 @@ class VLAAgent(nn.Module):
|
||||
num_cams=2, # 视觉输入的摄像头数量
|
||||
):
|
||||
super().__init__()
|
||||
# Store parameters
|
||||
self.action_dim = action_dim
|
||||
self.obs_dim = obs_dim
|
||||
self.pred_horizon = pred_horizon
|
||||
self.obs_horizon = obs_horizon
|
||||
self.num_cams = num_cams
|
||||
|
||||
self.vision_encoder = vision_backbone
|
||||
single_img_feat_dim = self.vision_encoder.output_dim
|
||||
total_vision_dim = single_img_feat_dim * num_cams * obs_horizon
|
||||
@@ -31,6 +40,14 @@ class VLAAgent(nn.Module):
|
||||
prediction_type='epsilon' # 预测噪声
|
||||
)
|
||||
|
||||
# DDIM scheduler for faster inference
|
||||
self.infer_scheduler = DDIMScheduler(
|
||||
num_train_timesteps=diffusion_steps,
|
||||
beta_schedule='squaredcos_cap_v2',
|
||||
clip_sample=True,
|
||||
prediction_type='epsilon'
|
||||
)
|
||||
|
||||
self.noise_pred_net = ConditionalUnet1D(
|
||||
input_dim=action_dim,
|
||||
global_cond_dim=self.global_cond_dim
|
||||
@@ -70,18 +87,12 @@ class VLAAgent(nn.Module):
|
||||
)
|
||||
|
||||
# 6. 网络预测噪声
|
||||
# 注意:U-Net 1D 通常期望 channel 在中间: (B, C, T)
|
||||
# noisy_actions_inp = noisy_actions.permute(0, 2, 1)
|
||||
|
||||
pred_noise = self.noise_pred_net(
|
||||
sample=noisy_actions,
|
||||
timestep=timesteps,
|
||||
global_cond=global_cond
|
||||
)
|
||||
|
||||
# 还原维度 (B, T, C)
|
||||
pred_noise = pred_noise.permute(0, 2, 1)
|
||||
|
||||
# 7. 计算 Loss (MSE)
|
||||
loss = nn.functional.mse_loss(pred_noise, noise)
|
||||
return loss
|
||||
@@ -96,20 +107,27 @@ class VLAAgent(nn.Module):
|
||||
# 1. 提取当前观测特征 (只做一次)
|
||||
visual_features = self.vision_encoder(images).view(B, -1)
|
||||
proprioception = proprioception.view(B, -1)
|
||||
if hasattr(self, 'qpos_mean') and hasattr(self, 'qpos_std') and self.qpos_mean is not None:
|
||||
# Convert to tensor for normalization
|
||||
qpos_mean = torch.from_numpy(self.qpos_mean).float().to(proprioception.device)
|
||||
qpos_std = torch.from_numpy(self.qpos_std).float().to(proprioception.device)
|
||||
qpos_mean = qpos_mean.repeat(2)
|
||||
qpos_std = qpos_std.repeat(2)
|
||||
# Normalize: (qpos - mean) / std
|
||||
proprioception = (proprioception - qpos_mean.unsqueeze(0)) / qpos_std.unsqueeze(0)
|
||||
global_cond = torch.cat([visual_features, proprioception], dim=-1)
|
||||
|
||||
# 2. 初始化纯高斯噪声动作
|
||||
# Shape: (B, Horizon, Action_Dim)
|
||||
# Shape: (B, pred_horizon, action_dim)
|
||||
current_actions = torch.randn(
|
||||
(B, 16, 7), device=global_cond.device
|
||||
(B, self.pred_horizon, self.action_dim), device=global_cond.device
|
||||
)
|
||||
|
||||
# 3. 逐步去噪循环 (Reverse Diffusion)
|
||||
self.noise_scheduler.set_timesteps(10) # 推理时可以用更少步加速 (如 DDIM)
|
||||
self.infer_scheduler.set_timesteps(10) # DDIM 推理步数
|
||||
|
||||
for t in self.noise_scheduler.timesteps:
|
||||
# 调整输入格式适应 1D CNN
|
||||
model_input = current_actions.permute(0, 2, 1)
|
||||
for t in self.infer_scheduler.timesteps:
|
||||
model_input = current_actions
|
||||
|
||||
# 预测噪声
|
||||
noise_pred = self.noise_pred_net(
|
||||
@@ -117,12 +135,19 @@ class VLAAgent(nn.Module):
|
||||
timestep=t,
|
||||
global_cond=global_cond
|
||||
)
|
||||
# noise_pred = noise_pred.permute(0, 2, 1)
|
||||
|
||||
# 移除噪声,更新 current_actions
|
||||
current_actions = self.noise_scheduler.step(
|
||||
current_actions = self.infer_scheduler.step(
|
||||
noise_pred, t, current_actions
|
||||
).prev_sample
|
||||
|
||||
# 4. 输出最终动作序列
|
||||
# 4. 反归一化动作 (Denormalize actions)
|
||||
if hasattr(self, 'action_mean') and hasattr(self, 'action_std') and self.action_mean is not None:
|
||||
# Convert to numpy for denormalization
|
||||
action_mean = torch.from_numpy(self.action_mean).float().to(current_actions.device)
|
||||
action_std = torch.from_numpy(self.action_std).float().to(current_actions.device)
|
||||
# Denormalize: action * std + mean
|
||||
current_actions = current_actions * action_std.unsqueeze(0).unsqueeze(0) + action_mean.unsqueeze(0).unsqueeze(0)
|
||||
|
||||
# 5. 输出最终动作序列
|
||||
return current_actions # 返回去噪后的干净动作
|
||||
@@ -4,10 +4,10 @@ defaults:
|
||||
- data: resnet_dataset
|
||||
|
||||
train:
|
||||
batch_size: 8 # Batch size for training
|
||||
batch_size: 32 # Batch size for training
|
||||
lr: 1e-4 # Learning rate
|
||||
max_steps: 10000 # Maximum training steps
|
||||
max_steps: 20000 # Maximum training steps
|
||||
log_freq: 100 # Log frequency (steps)
|
||||
save_freq: 1000 # Save checkpoint frequency (steps)
|
||||
save_freq: 2000 # Save checkpoint frequency (steps)
|
||||
device: "cuda" # Device: "cuda" or "cpu"
|
||||
num_workers: 8 # DataLoader workers (set to 0 for debugging, 8 for production)
|
||||
@@ -11,7 +11,7 @@ class RobotDiffusionDataset(Dataset):
|
||||
def __init__(self,
|
||||
dataset_dir,
|
||||
pred_horizon=16,
|
||||
obs_horizon=1,
|
||||
obs_horizon=2,
|
||||
action_horizon=8,
|
||||
camera_names=['r_vis', 'top'],
|
||||
normalization_type='gaussian'):
|
||||
|
||||
Reference in New Issue
Block a user