feat: 更新框架，新增数据集定义和backbone

2026-02-05 01:37:55 +08:00
parent 92660562fb
commit dd2749cb12
10 changed files with 224 additions and 134 deletions
--- a/roboimi/demos/config.yaml
+++ b/roboimi/demos/config.yaml
@@ -44,7 +44,7 @@ smooth_method: "ema"     # Options: "ema", "moving_avg", "lowpass", "none"
 smooth_alpha: 0.3        # Smoothing factor (0-1), smaller = smoother

 # transformer settings
-batch_size: 15                          
+batch_size: 10                          
 state_dim: 16            
 action_dim: 16      
 lr_backbone: 0.00001        
--- a/roboimi/demos/vla_scripts/train_vla.py
+++ b/roboimi/demos/vla_scripts/train_vla.py
@@ -16,7 +16,7 @@ from hydra.utils import instantiate

 log = logging.getLogger(__name__)

-@hydra.main(version_base=None, config_path="../../../roboimi/vla/conf", config_name="config")
+@hydra.main(version_base=None, config_path="../../vla/conf", config_name="config")
 def main(cfg: DictConfig):
    print(OmegaConf.to_yaml(cfg))
    log.info(f"🚀 Starting VLA Training with Real Data (Device: {cfg.train.device})")
@@ -64,21 +64,30 @@ def main(cfg: DictConfig):
        # 我们在这里做一个映射，模拟多模态融合前的处理
        
        # 假设我们只用配置里的第一个 key 作为主视觉
-        primary_cam_key = cfg.data.obs_keys[0] 
+        # primary_cam_key = cfg.data.obs_keys[0] 
        
        # Dataset 返回 shape: (B, Obs_Horizon, C, H, W)
        # DebugBackbone 期望: (B, C, H, W) 或者 (B, Seq, Dim)
        # 这里我们取 Obs_Horizon 的最后一帧 (Current Frame)
-        input_img = batch['obs'][primary_cam_key][:, -1, :, :, :]
+        # input_img = batch['obs'][primary_cam_key][:, -1, :, :, :]
        
+        # agent_input = {
+        #     "obs": {
+        #         "image": input_img,
+        #         "text": batch["language"] # 传递语言指令
+        #     },
+        #     "actions": batch["actions"] # (B, Chunk, Dim)
+        # }
        agent_input = {
-            "obs": {
-                "image": input_img,
-                "text": batch["language"] # 传递语言指令
-            },
-            "actions": batch["actions"] # (B, Chunk, Dim)
+            "action": batch["action"],
+            "qpos": batch["qpos"],
+            "images": {}
        }

+        for cam_name in cfg.data.camera_names:
+            key = f"image_{cam_name}"
+            agent_input["images"][cam_name] = batch[key].squeeze(1)
+
        # --- 5. Forward & Backward ---
        outputs = agent(agent_input)
        
--- a/roboimi/utils/constants.py
+++ b/roboimi/utils/constants.py
@@ -18,7 +18,7 @@ SIM_TASK_CONFIGS = {
    # },
    'sim_transfer': {
        'dataset_dir': DATASET_DIR + '/sim_transfer',
-        'num_episodes': 7,
+        'num_episodes': 20,
        'episode_len': 700,
        'camera_names': ['top','r_vis'],
        'xml_dir': HOME_PATH + '/assets'
--- a/roboimi/vla/agent.py
+++ b/roboimi/vla/agent.py
@@ -4,29 +4,27 @@ from typing import Dict, Optional, Any
 from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead

 class VLAAgent(nn.Module):
-    """
-    The main assembly class.
-    Flow: Obs -> Backbone -> Projector -> Head -> Action/Loss
-    """
+
    def __init__(
        self,
        backbone: VLABackbone,
        projector: VLAProjector,
-        head: VLAHead
+        head: VLAHead,
+        state_encoder: nn.Module
    ):
        super().__init__()
        self.backbone = backbone
        self.projector = projector
        self.head = head
+        self.state_encoder = state_encoder

    def forward(self, batch: Dict[str, Any]) -> Dict[str, torch.Tensor]:
-        """
-        Args:
-            batch: Dict containing 'obs' (image/text) and 'actions' (ground truth)
-        """
-        # 1. Extract Features
-        # Shape: (B, Seq, Backbone_Dim)
-        features = self.backbone(batch['obs'])
+
+        action = batch["action"]
+        state = batch["qpos"]
+        images = batch["images"]
+
+        state_emb = self.state_encoder(state)

        # 2. Project Features
        # Shape: (B, Seq, Head_Dim)
--- a/roboimi/vla/conf/data/custom_hdf5.yaml
+++ b/roboimi/vla/conf/data/custom_hdf5.yaml
@@ -1,10 +0,0 @@
-_target_: roboimi.vla.data.dataset.VLAChunkedDataset
-
-data_path: "/home/d51/workspace/work/robo-imi-act/roboimi/demos/dataset/sim_transfer"
-pred_horizon: 16
-obs_horizon: 1
-obs_keys: ["top"] 
-
-# 【新增】SigLIP 必须参数
-resize_resolution: 384 
-train: true  # 开启数据增强
--- a/roboimi/vla/conf/data/siglip2.yaml
+++ b/roboimi/vla/conf/data/siglip2.yaml
@@ -0,0 +1,8 @@
+_target_: roboimi.vla.data.dataset.RobotDiffusionDataset
+
+dataset_dir: "/home/d51/workspace/work/robo-imi-act/roboimi/demos/dataset/sim_transfer"
+pred_horizon: 16
+obs_horizon: 1
+action_horizon: 8
+camera_names: ['r_vis', 'top'] # ['angle', 'r_vis', 'top']
+normalization_type: 'gaussian' # 'min_max' or 'gaussian'
--- a/roboimi/vla/core/interfaces.py
+++ b/roboimi/vla/core/interfaces.py
@@ -18,11 +18,6 @@ class VLABackbone(nn.Module, abc.ABC):
        """
        pass

-    @property
-    @abc.abstractmethod
-    def embed_dim(self) -> int:
-        pass
-

 class VLAProjector(nn.Module, abc.ABC):
    """
--- a/roboimi/vla/data/dataset.py
+++ b/roboimi/vla/data/dataset.py
@@ -1,103 +1,156 @@
-import h5py
 import torch
+import torch.nn as nn
+from torch.utils.data import Dataset
+import h5py
 import numpy as np
 import os
 import glob
-from torch.utils.data import Dataset
-from typing import Dict, List, Any
+import pickle

-# 【新增】导入刚才写好的处理器
-from .image_transform import VLAImageProcessor
-
-class VLAChunkedDataset(Dataset):
-    def __init__(
-        self, 
-        data_path: str, 
-        pred_horizon: int = 16,
-        obs_horizon: int = 1,
-        obs_keys: List[str] = ["top"],
-        resize_resolution: int = 384,  # SigLIP 默认 384
-        train: bool = True             # 【新增】控制是否增强
-    ):
-        self.data_path = data_path
+class RobotDiffusionDataset(Dataset):
+    def __init__(self, 
+                 dataset_dir, 
+                 pred_horizon=16, 
+                 obs_horizon=1, 
+                 action_horizon=8,
+                 camera_names=['r_vis', 'top'],
+                 normalization_type='gaussian'): 
+        """
+        Args:
+            dataset_dir: 存放 episode_*.hdf5 的文件夹路径
+            pred_horizon: 预测未来动作的长度 (Tp)
+            obs_horizon: 历史观测长度 (To)
+            action_horizon: 执行动作长度 (Ta) - 在Dataset中主要影响Evaluation，这里作为参数保留
+        """
+        self.dataset_dir = dataset_dir
        self.pred_horizon = pred_horizon
        self.obs_horizon = obs_horizon
-        self.obs_keys = obs_keys
+        self.action_horizon = action_horizon
+        self.camera_names = camera_names
+        self.normalization_type = normalization_type
+        # 1. 扫描所有HDF5文件并建立索引
+        # Format: [(file_path, episode_length, step_index), ...]
+        self.episode_files = sorted(glob.glob(os.path.join(dataset_dir, 'episode_*.hdf5')))
+        self.indices = []
        
-        # ... (这里保留之前的扫描文件代码 self.file_paths ...) ...
-        if os.path.isdir(data_path):
-            self.file_paths = sorted(glob.glob(os.path.join(data_path, "*.hdf5")))
-        else:
-            self.file_paths = [data_path]
-            
-        # ... (这里保留之前的建立索引代码 self.index_map ...) ...
-        self.index_map = []
-        for i, path in enumerate(self.file_paths):
-            with h5py.File(path, 'r') as f:
-                total_len = f["action"].shape[0]
-                for t in range(total_len):
-                    self.index_map.append((i, t))
-
-        # 【核心修改】实例化处理器
-        self.image_processor = VLAImageProcessor(
-            resolution=resize_resolution,
-            enable_augmentation=train, # 训练集开启增强
-            aug_strength=0.1
-        )
-        print(f"✅ Image Processor: {self.image_processor}")
+        print(f"Found {len(self.episode_files)} episodes. Building index...")
+        
+        for file_path in self.episode_files:
+            with h5py.File(file_path, 'r') as f:
+                # 获取该 episode 的长度 (例如 700)
+                l = f['action'].shape[0]
+                # 保存每个有效 step 的索引信息
+                # (file_path, episode_length, current_step_index)
+                for i in range(l):
+                    self.indices.append((file_path, l, i))
+        
+        # 2. 统计数据
+        with open(os.path.join(dataset_dir, 'data_stats.pkl'), 'rb') as f:
+            self.stats = pickle.load(f)

    def __len__(self):
-        return len(self.index_map)
+        return len(self.indices)

-    def __getitem__(self, idx: int) -> Dict[str, Any]:
-        file_idx, t_start = self.index_map[idx]
-        file_path = self.file_paths[file_idx]
+    def __getitem__(self, idx):
+        file_path, episode_len, start_ts = self.indices[idx]
        
-        with h5py.File(file_path, 'r') as f:
-            # ... (Action读取代码保持不变) ...
-            total_len = f["action"].shape[0]
-            t_end = min(t_start + self.pred_horizon, total_len)
-            actions_np = f["action"][t_start:t_end]
-            # ... (Padding 逻辑保持不变) ...
-            actual_len = actions_np.shape[0]
-            if actual_len < self.pred_horizon:
-                pad_len = self.pred_horizon - actual_len
-                pad_block = np.tile(actions_np[-1], (pad_len, 1))
-                actions_np = np.concatenate([actions_np, pad_block], axis=0)
+        # -----------------------------
+        # 1. 打开文件
+        # -----------------------------
+        # 注意: 在 __getitem__ 中打开文件对多进程 DataLoader 更友好
+        # 如果追求极致IO性能，可以考虑使用 h5py 的 swmr 模式或内存缓存
+        with h5py.File(file_path, 'r') as root:
            
-            # --- 图像处理部分 ---
-            obs_dict = {}
-            for key in self.obs_keys:
-                imgs = []
-                for i in range(self.obs_horizon):
-                    # 计算历史帧索引
-                    query_t = max(0, t_start - (self.obs_horizon - 1) + i)
-                    
-                    # 1. 读取原始数据 (Numpy uint8)
-                    raw_img = f[f"observations/images/{key}"][query_t]
-                    
-                    # 2. 【调用处理器】 Numpy -> Tensor (384, 384) Normalized
-                    processed_img = self.image_processor(raw_img)
-                    
-                    imgs.append(processed_img)
+            # -----------------------------
+            # 2. 处理 Action (Prediction Target)
+            # -----------------------------
+            # Goal: take the actions for steps [t, t + pred_horizon), end-exclusive and clipped to the episode length
+            action_start = start_ts
+            action_end = min(start_ts + self.pred_horizon, episode_len)
+            
+            actions = root['action'][action_start:action_end] # shape: (T_subset, 16)
+            
+            # Padding: 如果剩余动作不足 pred_horizon，复制最后一步
+            if len(actions) < self.pred_horizon:
+                pad_len = self.pred_horizon - len(actions)
+                last_action = actions[-1]
+                # 重复最后一行
+                pad_content = np.repeat(last_action[np.newaxis, :], pad_len, axis=0)
+                actions = np.concatenate([actions, pad_content], axis=0)
+            
+            # 归一化 Action
+            if self.stats:
+                actions = self._normalize_data(actions, self.stats['action'])
+
+            # -----------------------------
+            # 3. 处理 Observations (History)
+            # -----------------------------
+            # 目标: 获取 [t - obs_horizon + 1, t + 1] 的观测
+            # 索引逻辑:
+            # 如果 obs_horizon=2, current_ts=0 -> indices=[0, 0] (Padding)
+            # 如果 obs_horizon=2, current_ts=5 -> indices=[4, 5]
+            
+            indices = []
+            for i in range(self.obs_horizon):
+                # t - (To - 1) + i
+                query_ts = start_ts - (self.obs_horizon - 1) + i
+                # 边界处理 (Padding first frame)
+                query_ts = max(query_ts, 0)
+                indices.append(query_ts)
                
-                # Stack -> (T, C, H, W)
-                obs_dict[key] = torch.stack(imgs)
+            # 读取 qpos (proprioception)
+            qpos_data = root['observations/qpos']
+            qpos = qpos_data[indices] # smart indexing
+            if self.stats:
+                qpos = self._normalize_data(qpos, self.stats['qpos'])

-            # ... (QPos 和 Language 读取保持不变) ...
-            qpos = f["observations/qpos"][t_start].astype(np.float32)
-            lang = f.attrs.get("language", "placeholder")
-            if isinstance(lang, bytes): lang = lang.decode("utf-8")
-            
-            # 这里的 action_mask 只是临时补全代码，你原来的逻辑是对的
-            action_mask = torch.ones(self.pred_horizon, dtype=torch.float32)
-            if actual_len < self.pred_horizon:
-                action_mask[actual_len:] = 0.0
+            # 读取 Images
+            # One entry per camera listed in self.camera_names (e.g. r_vis, top)
+            # Each view is kept as its own tensor here rather than concatenated
+            image_dict = {}
+            for cam_name in self.camera_names:
+                # HDF5 dataset
+                img_dset = root['observations']['images'][cam_name]
+                
+                imgs = []
+                for t in indices:
+                    img = img_dset[t] # (480, 640, 3) uint8
+                    img = torch.from_numpy(img).permute(2, 0, 1).float() / 255.0  # (C, H, W)
+                    imgs.append(img)
+                
+                # Stack time dimension: (obs_horizon, 3, H, W)
+                image_dict[cam_name] = torch.stack(imgs)

-        return {
-            "obs": obs_dict,
-            "qpos": torch.from_numpy(qpos),
-            "actions": torch.from_numpy(actions_np).float(),
-            "action_mask": action_mask,
-            "language": lang
-        }
+            # -----------------------------
+            # 4. 组装 Batch
+            # -----------------------------
+            data_batch = {
+                'action': torch.from_numpy(actions).float(), # (Tp, 16)
+                'qpos': torch.from_numpy(qpos).float(),      # (To, 16)
+            }
+            # 将图像放入 batch
+            for cam_name, img_tensor in image_dict.items():
+                data_batch[f'image_{cam_name}'] = img_tensor # (To, 3, H, W)
+
+            # TODO: 添加 Language Instruction
+            # 如果所有 episode 共享任务，这里可以是固定 embedding
+            # 如果每个 episode 任务不同，你需要一个额外的 meta json 来映射 file_path -> text
+            # data_batch['lang_text'] = "pick up the red cube" 
+
+            return data_batch
+
+    def _normalize_data(self, data, stats):
+            if self.normalization_type == 'min_max':
+                # Min-max scaling: map [min, max] to [-1, 1]
+                min_val = stats['min']
+                max_val = stats['max']
+                data = (data - min_val) / (max_val - min_val + 1e-8)
+                return data * 2 - 1
+                
+            elif self.normalization_type == 'gaussian':
+                # Gaussian (z-score) scaling: subtract mean, divide by std
+                mean = stats['mean']
+                std = stats['std']
+                # (data - mean) / std
+                # 这里的 data 是 numpy array
+                return (data - mean) / (std + 1e-8)
--- a/roboimi/vla/models/backbones/siglip2.py
+++ b/roboimi/vla/models/backbones/siglip2.py
@@ -0,0 +1,37 @@
+from transformers import SiglipVisionModel
+from roboimi.vla.core.interfaces import VLABackbone
+from torchvision import transforms
+
+class SigLIP2(VLABackbone):
+    def __init__(
+            self,
+            model_name = "google/siglip2-base-patch16-384",
+            freeze: bool = True,
+    ):
+        super().__init__()
+
+        self.vision_model = SiglipVisionModel.from_pretrained(model_name)
+        self.transform = transforms.Compose([
+            transforms.Resize((384, 384), antialias=True),
+            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+        ])
+
+        if freeze:
+            self._freeze_parameters()
+
+    def _freeze_parameters(self):
+        print("❄️ Freezing Vision Backbone parameters")
+        for param in self.vision_model.parameters():
+            param.requires_grad = False
+        self.vision_model.eval()
+
+    def forward(
+            self,
+            images
+    ):
+        # images: (B, C, H, W), expected to be already scaled to [0, 1]
+        images = self.transform(images)  # 归一化到 [-1, 1]
+
+        outputs = self.vision_model(pixel_values=images)
+
+        return outputs.last_hidden_state
--- a/roboimi/vla/modules/encoders.py
+++ b/roboimi/vla/modules/encoders.py
@@ -30,17 +30,17 @@ class MLP(nn.Module):
 class SinusoidalPositionalEncoding(nn.Module):
    def __init__(
            self,
-            emb_dim
+            embed_dim
    ):
        super().__init__()
-        self.emb_dim = emb_dim
+        self.embed_dim = embed_dim

    def forward(self, timesteps):
        timesteps = timesteps.float()
        B, T = timesteps.shape
        device = timesteps.device

-        half_dim = self.emb_dim // 2
+        half_dim = self.embed_dim // 2

        exponent = -torch.arange(half_dim, dtype=torch.float, device=device) * (
            torch.log(torch.tensor(10000.0)) / half_dim
@@ -58,14 +58,14 @@ class ActionEncoder(nn.Module):
    def __init__(
            self,
            action_dim,
-            emb_dim,
+            embed_dim,
            
    ):
        super().__init__()
-        self.W1 = nn.Linear(action_dim, emb_dim)
+        self.W1 = nn.Linear(action_dim, embed_dim)
        self.W2 = nn.Linear(2 * action_dim, action_dim)
-        self.W3 = nn.Linear(emb_dim, emb_dim)
-        self.pos_encoder = SinusoidalPositionalEncoding(emb_dim)
+        self.W3 = nn.Linear(embed_dim, embed_dim)
+        self.pos_encoder = SinusoidalPositionalEncoding(embed_dim)

    def forward(
            self,
@@ -89,13 +89,13 @@ class StateEncoder(nn.Module):
            self,
            state_dim,
            hidden_dim,
-            emb_dim
+            embed_dim
    ):
        super().__init__()
        self.mlp = MLP(
            state_dim,
            hidden_dim,
-            emb_dim
+            embed_dim
        )

    def forward(
@@ -103,4 +103,4 @@ class StateEncoder(nn.Module):
            states
    ):
        state_emb = self.mlp(states)
-        return state_emb # [B, 1, emb_dim]
+        return state_emb # [B, 1, embed_dim]