diff --git a/roboimi/demos/config.yaml b/roboimi/demos/config.yaml index efb6f1c..cf754d7 100644 --- a/roboimi/demos/config.yaml +++ b/roboimi/demos/config.yaml @@ -44,7 +44,7 @@ smooth_method: "ema" # Options: "ema", "moving_avg", "lowpass", "none" smooth_alpha: 0.3 # Smoothing factor (0-1), smaller = smoother # transformer settings -batch_size: 15 +batch_size: 10 state_dim: 16 action_dim: 16 lr_backbone: 0.00001 diff --git a/roboimi/demos/vla_scripts/train_vla.py b/roboimi/demos/vla_scripts/train_vla.py index 8206c1d..7faf1a9 100644 --- a/roboimi/demos/vla_scripts/train_vla.py +++ b/roboimi/demos/vla_scripts/train_vla.py @@ -16,7 +16,7 @@ from hydra.utils import instantiate log = logging.getLogger(__name__) -@hydra.main(version_base=None, config_path="../../../roboimi/vla/conf", config_name="config") +@hydra.main(version_base=None, config_path="../../vla/conf", config_name="config") def main(cfg: DictConfig): print(OmegaConf.to_yaml(cfg)) log.info(f"🚀 Starting VLA Training with Real Data (Device: {cfg.train.device})") @@ -64,21 +64,30 @@ def main(cfg: DictConfig): # 我们在这里做一个映射,模拟多模态融合前的处理 # 假设我们只用配置里的第一个 key 作为主视觉 - primary_cam_key = cfg.data.obs_keys[0] + # primary_cam_key = cfg.data.obs_keys[0] # Dataset 返回 shape: (B, Obs_Horizon, C, H, W) # DebugBackbone 期望: (B, C, H, W) 或者 (B, Seq, Dim) # 这里我们取 Obs_Horizon 的最后一帧 (Current Frame) - input_img = batch['obs'][primary_cam_key][:, -1, :, :, :] + # input_img = batch['obs'][primary_cam_key][:, -1, :, :, :] + # agent_input = { + # "obs": { + # "image": input_img, + # "text": batch["language"] # 传递语言指令 + # }, + # "actions": batch["actions"] # (B, Chunk, Dim) + # } agent_input = { - "obs": { - "image": input_img, - "text": batch["language"] # 传递语言指令 - }, - "actions": batch["actions"] # (B, Chunk, Dim) + "action": batch["action"], + "qpos": batch["qpos"], + "images": {} } + for cam_name in cfg.data.camera_names: + key = f"image_{cam_name}" + agent_input["images"][cam_name] = batch[key].squeeze(1) + # --- 5. Forward & Backward --- outputs = agent(agent_input) diff --git a/roboimi/utils/constants.py b/roboimi/utils/constants.py index 22bc3d6..dd1d4ec 100644 --- a/roboimi/utils/constants.py +++ b/roboimi/utils/constants.py @@ -18,7 +18,7 @@ SIM_TASK_CONFIGS = { # }, 'sim_transfer': { 'dataset_dir': DATASET_DIR + '/sim_transfer', - 'num_episodes': 7, + 'num_episodes': 20, 'episode_len': 700, 'camera_names': ['top','r_vis'], 'xml_dir': HOME_PATH + '/assets' diff --git a/roboimi/vla/agent.py b/roboimi/vla/agent.py index e3133ab..c60585f 100644 --- a/roboimi/vla/agent.py +++ b/roboimi/vla/agent.py @@ -4,29 +4,27 @@ from typing import Dict, Optional, Any from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead class VLAAgent(nn.Module): - """ - The main assembly class. - Flow: Obs -> Backbone -> Projector -> Head -> Action/Loss - """ + def __init__( self, backbone: VLABackbone, projector: VLAProjector, - head: VLAHead + head: VLAHead, + state_encoder: nn.Module ): super().__init__() self.backbone = backbone self.projector = projector self.head = head + self.state_encoder = state_encoder def forward(self, batch: Dict[str, Any]) -> Dict[str, torch.Tensor]: - """ - Args: - batch: Dict containing 'obs' (image/text) and 'actions' (ground truth) - """ - # 1. Extract Features - # Shape: (B, Seq, Backbone_Dim) - features = self.backbone(batch['obs']) + + action = batch["action"] + state = batch["qpos"] + images = batch["images"] + + state_emb = self.state_encoder(state) # 2. Project Features # Shape: (B, Seq, Head_Dim) diff --git a/roboimi/vla/conf/data/custom_hdf5.yaml b/roboimi/vla/conf/data/custom_hdf5.yaml deleted file mode 100644 index 6d27a55..0000000 --- a/roboimi/vla/conf/data/custom_hdf5.yaml +++ /dev/null @@ -1,10 +0,0 @@ -_target_: roboimi.vla.data.dataset.VLAChunkedDataset - -data_path: "/home/d51/workspace/work/robo-imi-act/roboimi/demos/dataset/sim_transfer" -pred_horizon: 16 -obs_horizon: 1 -obs_keys: ["top"] - -# 【新增】SigLIP 必须参数 -resize_resolution: 384 -train: true # 开启数据增强 \ No newline at end of file diff --git a/roboimi/vla/conf/data/siglip2.yaml b/roboimi/vla/conf/data/siglip2.yaml new file mode 100644 index 0000000..e37b284 --- /dev/null +++ b/roboimi/vla/conf/data/siglip2.yaml @@ -0,0 +1,8 @@ +_target_: roboimi.vla.data.dataset.RobotDiffusionDataset + +dataset_dir: "/home/d51/workspace/work/robo-imi-act/roboimi/demos/dataset/sim_transfer" +pred_horizon: 16 +obs_horizon: 1 +action_horizon: 8 +camera_names: ['r_vis', 'top'] # ['angle', 'r_vis', 'top'] +normalization_type: 'gaussian' # 'min_max' or 'gaussian' \ No newline at end of file diff --git a/roboimi/vla/core/interfaces.py b/roboimi/vla/core/interfaces.py index 6c22139..ea02094 100644 --- a/roboimi/vla/core/interfaces.py +++ b/roboimi/vla/core/interfaces.py @@ -18,11 +18,6 @@ class VLABackbone(nn.Module, abc.ABC): """ pass - @property - @abc.abstractmethod - def embed_dim(self) -> int: - pass - class VLAProjector(nn.Module, abc.ABC): """ diff --git a/roboimi/vla/data/dataset.py b/roboimi/vla/data/dataset.py index 8dd571e..7e286f9 100644 --- a/roboimi/vla/data/dataset.py +++ b/roboimi/vla/data/dataset.py @@ -1,103 +1,156 @@ -import h5py import torch +import torch.nn as nn +from torch.utils.data import Dataset +import h5py import numpy as np import os import glob -from torch.utils.data import Dataset -from typing import Dict, List, Any +import pickle -# 【新增】导入刚才写好的处理器 -from .image_transform import VLAImageProcessor - -class VLAChunkedDataset(Dataset): - def __init__( - self, - data_path: str, - pred_horizon: int = 16, - obs_horizon: int = 1, - obs_keys: List[str] = ["top"], - resize_resolution: int = 384, # SigLIP 默认 384 - train: bool = True # 【新增】控制是否增强 - ): - self.data_path = data_path +class RobotDiffusionDataset(Dataset): + def __init__(self, + dataset_dir, + pred_horizon=16, + obs_horizon=1, + action_horizon=8, + camera_names=['r_vis', 'top'], + normalization_type='gaussian'): + """ + Args: + dataset_dir: 存放 episode_*.hdf5 的文件夹路径 + pred_horizon: 预测未来动作的长度 (Tp) + obs_horizon: 历史观测长度 (To) + action_horizon: 执行动作长度 (Ta) - 在Dataset中主要影响Evaluation,这里作为参数保留 + """ + self.dataset_dir = dataset_dir self.pred_horizon = pred_horizon self.obs_horizon = obs_horizon - self.obs_keys = obs_keys + self.action_horizon = action_horizon + self.camera_names = camera_names + self.normalization_type = normalization_type + # 1. 扫描所有HDF5文件并建立索引 + # 格式: [(file_path, episode_length), ...] + self.episode_files = sorted(glob.glob(os.path.join(dataset_dir, 'episode_*.hdf5'))) + self.indices = [] - # ... (这里保留之前的扫描文件代码 self.file_paths ...) ... - if os.path.isdir(data_path): - self.file_paths = sorted(glob.glob(os.path.join(data_path, "*.hdf5"))) - else: - self.file_paths = [data_path] - - # ... (这里保留之前的建立索引代码 self.index_map ...) ... - self.index_map = [] - for i, path in enumerate(self.file_paths): - with h5py.File(path, 'r') as f: - total_len = f["action"].shape[0] - for t in range(total_len): - self.index_map.append((i, t)) - - # 【核心修改】实例化处理器 - self.image_processor = VLAImageProcessor( - resolution=resize_resolution, - enable_augmentation=train, # 训练集开启增强 - aug_strength=0.1 - ) - print(f"✅ Image Processor: {self.image_processor}") + print(f"Found {len(self.episode_files)} episodes. Building index...") + + for file_path in self.episode_files: + with h5py.File(file_path, 'r') as f: + # 获取该 episode 的长度 (例如 700) + l = f['action'].shape[0] + # 保存每个有效 step 的索引信息 + # (file_path, episode_length, current_step_index) + for i in range(l): + self.indices.append((file_path, l, i)) + + # 2. 统计数据 + with open(os.path.join(dataset_dir, 'data_stats.pkl'), 'rb') as f: + self.stats = pickle.load(f) def __len__(self): - return len(self.index_map) + return len(self.indices) - def __getitem__(self, idx: int) -> Dict[str, Any]: - file_idx, t_start = self.index_map[idx] - file_path = self.file_paths[file_idx] + def __getitem__(self, idx): + file_path, episode_len, start_ts = self.indices[idx] - with h5py.File(file_path, 'r') as f: - # ... (Action读取代码保持不变) ... - total_len = f["action"].shape[0] - t_end = min(t_start + self.pred_horizon, total_len) - actions_np = f["action"][t_start:t_end] - # ... (Padding 逻辑保持不变) ... - actual_len = actions_np.shape[0] - if actual_len < self.pred_horizon: - pad_len = self.pred_horizon - actual_len - pad_block = np.tile(actions_np[-1], (pad_len, 1)) - actions_np = np.concatenate([actions_np, pad_block], axis=0) + # ----------------------------- + # 1. 打开文件 + # ----------------------------- + # 注意: 在 __getitem__ 中打开文件对多进程 DataLoader 更友好 + # 如果追求极致IO性能,可以考虑使用 h5py 的 swmr 模式或内存缓存 + with h5py.File(file_path, 'r') as root: - # --- 图像处理部分 --- - obs_dict = {} - for key in self.obs_keys: - imgs = [] - for i in range(self.obs_horizon): - # 计算历史帧索引 - query_t = max(0, t_start - (self.obs_horizon - 1) + i) - - # 1. 读取原始数据 (Numpy uint8) - raw_img = f[f"observations/images/{key}"][query_t] - - # 2. 【调用处理器】 Numpy -> Tensor (384, 384) Normalized - processed_img = self.image_processor(raw_img) - - imgs.append(processed_img) + # ----------------------------- + # 2. 处理 Action (Prediction Target) + # ----------------------------- + # 目标: 获取 [t, t + pred_horizon] 的动作 + action_start = start_ts + action_end = min(start_ts + self.pred_horizon, episode_len) + + actions = root['action'][action_start:action_end] # shape: (T_subset, 16) + + # Padding: 如果剩余动作不足 pred_horizon,复制最后一步 + if len(actions) < self.pred_horizon: + pad_len = self.pred_horizon - len(actions) + last_action = actions[-1] + # 重复最后一行 + pad_content = np.repeat(last_action[np.newaxis, :], pad_len, axis=0) + actions = np.concatenate([actions, pad_content], axis=0) + + # 归一化 Action + if self.stats: + actions = self._normalize_data(actions, self.stats['action']) + + # ----------------------------- + # 3. 处理 Observations (History) + # ----------------------------- + # 目标: 获取 [t - obs_horizon + 1, t + 1] 的观测 + # 索引逻辑: + # 如果 obs_horizon=2, current_ts=0 -> indices=[0, 0] (Padding) + # 如果 obs_horizon=2, current_ts=5 -> indices=[4, 5] + + indices = [] + for i in range(self.obs_horizon): + # t - (To - 1) + i + query_ts = start_ts - (self.obs_horizon - 1) + i + # 边界处理 (Padding first frame) + query_ts = max(query_ts, 0) + indices.append(query_ts) - # Stack -> (T, C, H, W) - obs_dict[key] = torch.stack(imgs) + # 读取 qpos (proprioception) + qpos_data = root['observations/qpos'] + qpos = qpos_data[indices] # smart indexing + if self.stats: + qpos = self._normalize_data(qpos, self.stats['qpos']) - # ... (QPos 和 Language 读取保持不变) ... - qpos = f["observations/qpos"][t_start].astype(np.float32) - lang = f.attrs.get("language", "placeholder") - if isinstance(lang, bytes): lang = lang.decode("utf-8") - - # 这里的 action_mask 只是临时补全代码,你原来的逻辑是对的 - action_mask = torch.ones(self.pred_horizon, dtype=torch.float32) - if actual_len < self.pred_horizon: - action_mask[actual_len:] = 0.0 + # 读取 Images + # 你有三个视角: angle, r_vis, top + # 建议将它们分开返回,或者在 Dataset 里 Concat + image_dict = {} + for cam_name in self.camera_names: + # HDF5 dataset + img_dset = root['observations']['images'][cam_name] + + imgs = [] + for t in indices: + img = img_dset[t] # (480, 640, 3) uint8 + img = torch.from_numpy(img).permute(2, 0, 1).float() / 255.0 # (C, H, W) + imgs.append(img) + + # Stack time dimension: (obs_horizon, 3, H, W) + image_dict[cam_name] = torch.stack(imgs) - return { - "obs": obs_dict, - "qpos": torch.from_numpy(qpos), - "actions": torch.from_numpy(actions_np).float(), - "action_mask": action_mask, - "language": lang - } \ No newline at end of file + # ----------------------------- + # 4. 组装 Batch + # ----------------------------- + data_batch = { + 'action': torch.from_numpy(actions).float(), # (Tp, 16) + 'qpos': torch.from_numpy(qpos).float(), # (To, 16) + } + # 将图像放入 batch + for cam_name, img_tensor in image_dict.items(): + data_batch[f'image_{cam_name}'] = img_tensor # (To, 3, H, W) + + # TODO: 添加 Language Instruction + # 如果所有 episode 共享任务,这里可以是固定 embedding + # 如果每个 episode 任务不同,你需要一个额外的 meta json 来映射 file_path -> text + # data_batch['lang_text'] = "pick up the red cube" + + return data_batch + + def _normalize_data(self, data, stats): + if self.normalization_type == 'min_max': + # 之前的逻辑: [-1, 1] + min_val = stats['min'] + max_val = stats['max'] + data = (data - min_val) / (max_val - min_val + 1e-8) + return data * 2 - 1 + + elif self.normalization_type == 'gaussian': + # 新逻辑: Mean/Std + mean = stats['mean'] + std = stats['std'] + # (data - mean) / std + # 这里的 data 是 numpy array + return (data - mean) / (std + 1e-8) \ No newline at end of file diff --git a/roboimi/vla/models/backbones/siglip2.py b/roboimi/vla/models/backbones/siglip2.py new file mode 100644 index 0000000..a44997a --- /dev/null +++ b/roboimi/vla/models/backbones/siglip2.py @@ -0,0 +1,37 @@ +from transformers import SiglipVisionModel +from roboimi.vla.core.interfaces import VLABackbone +from torchvision import transforms + +class SigLIP2(VLABackbone): + def __init__( + self, + model_name = "google/siglip2-base-patch16-384", + freeze: bool = True, + ): + super().__init__() + + self.vision_model = SiglipVisionModel.from_pretrained(model_name) + self.transform = transforms.Compose([ + transforms.Resize((384, 384), antialias=True), + transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]) + ]) + + if freeze: + self._freeze_parameters() + + def _freeze_parameters(self): + print("❄️ Freezing Vision Backbone parameters") + for param in self.vision_model.parameters(): + param.requires_grad = False + self.vision_model.eval() + + def forward( + self, + images + ): + # images: (B, C, H, W), 归一化到 [0, 1] + images = self.transform(images) # 归一化到 [-1, 1] + + outputs = self.vision_model(pixel_values=images) + + return outputs.last_hidden_state \ No newline at end of file diff --git a/roboimi/vla/modules/encoders.py b/roboimi/vla/modules/encoders.py index 8e8c411..2d600d2 100644 --- a/roboimi/vla/modules/encoders.py +++ b/roboimi/vla/modules/encoders.py @@ -30,17 +30,17 @@ class MLP(nn.Module): class SinusoidalPositionalEncoding(nn.Module): def __init__( self, - emb_dim + embed_dim ): super().__init__() - self.emb_dim = emb_dim + self.embed_dim = embed_dim def forward(self, timesteps): timesteps = timesteps.float() B, T = timesteps.shape device = timesteps.device - half_dim = self.emb_dim // 2 + half_dim = self.embed_dim // 2 exponent = -torch.arange(half_dim, dtype=torch.float, device=device) * ( torch.log(torch.tensor(10000.0)) / half_dim @@ -58,14 +58,14 @@ class ActionEncoder(nn.Module): def __init__( self, action_dim, - emb_dim, + embed_dim, ): super().__init__() - self.W1 = nn.Linear(action_dim, emb_dim) + self.W1 = nn.Linear(action_dim, embed_dim) self.W2 = nn.Linear(2 * action_dim, action_dim) - self.W3 = nn.Linear(emb_dim, emb_dim) - self.pos_encoder = SinusoidalPositionalEncoding(emb_dim) + self.W3 = nn.Linear(embed_dim, embed_dim) + self.pos_encoder = SinusoidalPositionalEncoding(embed_dim) def forward( self, @@ -89,13 +89,13 @@ class StateEncoder(nn.Module): self, state_dim, hidden_dim, - emb_dim + embed_dim ): super().__init__() self.mlp = MLP( state_dim, hidden_dim, - emb_dim + embed_dim ) def forward( @@ -103,4 +103,4 @@ class StateEncoder(nn.Module): states ): state_emb = self.mlp(states) - return state_emb # [B, 1, emb_dim] \ No newline at end of file + return state_emb # [B, 1, embed_dim] \ No newline at end of file