debug: core-skeleton pseudo-implementation (核心骨架伪实现)

This commit is contained in:
gouhanke
2026-02-03 16:14:54 +08:00
parent d3863ea1dd
commit bd8bbb0cfc
15 changed files with 348 additions and 85 deletions

2
.gitignore vendored
View File

@@ -124,3 +124,5 @@ GEMINI.md
# Copilot # Copilot
.github/copilot-instructions.md .github/copilot-instructions.md
.hydra/

0
roboimi/__init__.py Normal file
View File

View File

@@ -1,73 +1,114 @@
# roboimi/vla/agent.py
import torch import torch
import torch.nn as nn import torch.nn as nn
from typing import Optional, Dict, Union from typing import Dict, Optional, Any
from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead
class VLAAgent(nn.Module): class VLAAgent(nn.Module):
def __init__(self, """
vlm_backbone: nn.Module, The main assembly class.
img_projector: nn.Module, Flow: Obs -> Backbone -> Projector -> Head -> Action/Loss
action_head: nn.Module, """
state_dim: int, def __init__(
embed_dim: int): self,
backbone: VLABackbone,
projector: VLAProjector,
head: VLAHead
):
super().__init__() super().__init__()
self.vlm_backbone = vlm_backbone self.backbone = backbone
self.img_projector = img_projector self.projector = projector
self.action_head = action_head self.head = head
# 简单的状态编码器 (通常不需要复杂的 config直接写在这里即可) def forward(self, batch: Dict[str, Any]) -> Dict[str, torch.Tensor]:
self.state_encoder = nn.Sequential(
nn.Linear(state_dim, embed_dim),
nn.Mish(),
nn.Linear(embed_dim, embed_dim)
)
def forward(self,
images: torch.Tensor,
state: torch.Tensor,
text: Optional[Union[str, list]] = None,
actions: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Dict]:
""" """
Args: Args:
images: [Batch, Obs_Horizon, C, H, W] 注意: 这里需要处理时间维度 batch: Dict containing 'obs' (image/text) and 'actions' (ground truth)
state: [Batch, Obs_Horizon, State_Dim]
text: Optional text instructions
actions: [Batch, Pred_Horizon, Action_Dim] (Training only)
Returns:
Training: Loss scalar
Inference: Predicted actions
""" """
# 1. Extract Features
# Shape: (B, Seq, Backbone_Dim)
features = self.backbone(batch['obs'])
B, T, C, H, W = images.shape # 2. Project Features
# Shape: (B, Seq, Head_Dim)
embeddings = self.projector(features)
# 1. 图像编码 (Flatten time dimension for efficiency) # 3. Compute Action/Loss
# [B*T, C, H, W] -> [B*T, Vision_Dim] # We pass actions if they exist (training mode)
flat_images = images.view(B * T, C, H, W) actions = batch.get('actions', None)
vision_feats_dict = self.vlm_backbone(flat_images) outputs = self.head(embeddings=embeddings, actions=actions)
raw_img_emb = vision_feats_dict['image_embeds'] # [B*T, Vision_Dim]
# 投影并还原时间维度 -> [B, T, Embed_Dim] return outputs
img_emb = self.img_projector(raw_img_emb)
img_emb = img_emb.view(B, T, -1)
# 2. 状态编码 # # roboimi/vla/agent.py
state_emb = self.state_encoder(state) # [B, T, Embed_Dim]
# 3. 特征融合 (这里做一个简单的 Early Fusion 示例) # import torch
# 将图像特征和状态特征在特征维度拼接,或在时间维度拼接 # import torch.nn as nn
# 假设我们只用最近的一帧图像作为 Context或者将所有历史特征作为 Context # from typing import Optional, Dict, Union
# 这里演示Context = (Image_History + State_History)
# [B, T, Embed] + [B, T, Embed] -> [B, 2*T, Embed] (Concat on time)
context = torch.cat([img_emb, state_emb], dim=1)
# 4. Action Head 分支 # class VLAAgent(nn.Module):
if actions is not None: # def __init__(self,
# --- Training Mode --- # vlm_backbone: nn.Module,
# 必须返回 Loss # img_projector: nn.Module,
return self.action_head.compute_loss(context, actions) # action_head: nn.Module,
else: # state_dim: int,
# --- Inference Mode --- # embed_dim: int):
# 必须返回预测的动作序列 # super().__init__()
return self.action_head.predict_action(context) # self.vlm_backbone = vlm_backbone
# self.img_projector = img_projector
# self.action_head = action_head
# # 简单的状态编码器 (通常不需要复杂的 config直接写在这里即可)
# self.state_encoder = nn.Sequential(
# nn.Linear(state_dim, embed_dim),
# nn.Mish(),
# nn.Linear(embed_dim, embed_dim)
# )
# def forward(self,
# images: torch.Tensor,
# state: torch.Tensor,
# text: Optional[Union[str, list]] = None,
# actions: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Dict]:
# """
# Args:
# images: [Batch, Obs_Horizon, C, H, W] 注意: 这里需要处理时间维度
# state: [Batch, Obs_Horizon, State_Dim]
# text: Optional text instructions
# actions: [Batch, Pred_Horizon, Action_Dim] (Training only)
# Returns:
# Training: Loss scalar
# Inference: Predicted actions
# """
# B, T, C, H, W = images.shape
# # 1. 图像编码 (Flatten time dimension for efficiency)
# # [B*T, C, H, W] -> [B*T, Vision_Dim]
# flat_images = images.view(B * T, C, H, W)
# vision_feats_dict = self.vlm_backbone(flat_images)
# raw_img_emb = vision_feats_dict['image_embeds'] # [B*T, Vision_Dim]
# # 投影并还原时间维度 -> [B, T, Embed_Dim]
# img_emb = self.img_projector(raw_img_emb)
# img_emb = img_emb.view(B, T, -1)
# # 2. 状态编码
# state_emb = self.state_encoder(state) # [B, T, Embed_Dim]
# # 3. 特征融合 (这里做一个简单的 Early Fusion 示例)
# # 将图像特征和状态特征在特征维度拼接,或在时间维度拼接
# # 假设我们只用最近的一帧图像作为 Context或者将所有历史特征作为 Context
# # 这里演示Context = (Image_History + State_History)
# # [B, T, Embed] + [B, T, Embed] -> [B, 2*T, Embed] (Concat on time)
# context = torch.cat([img_emb, state_emb], dim=1)
# # 4. Action Head 分支
# if actions is not None:
# # --- Training Mode ---
# # 必须返回 Loss
# return self.action_head.compute_loss(context, actions)
# else:
# # --- Inference Mode ---
# # 必须返回预测的动作序列
# return self.action_head.predict_action(context)

View File

@@ -0,0 +1,24 @@
# Debug agent assembly for smoke-testing the skeleton:
# wires stub Backbone -> Projector -> Head into a full VLAAgent
# so the forward/backward path can be verified end to end.
_target_: roboimi.vla.agent.VLAAgent
# 1. Backbone Configuration
backbone:
  _target_: roboimi.vla.models.backbones.debug.DebugBackbone
  embed_dim: 768 # Variable A
  seq_len: 10
# 2. Projector Configuration
projector:
  _target_: roboimi.vla.models.projectors.mlp.MLPProjector
  # Dependency Injection via Interpolation:
  # Takes 'embed_dim' from the sibling 'backbone' config above.
  input_dim: ${..backbone.embed_dim}
  output_dim: 512 # Variable B (The bottleneck size)
# 3. Head Configuration
head:
  _target_: roboimi.vla.models.heads.debug.DebugHead
  # Dependency Injection via Interpolation:
  # Takes 'output_dim' from the sibling 'projector' config above.
  input_dim: ${..projector.output_dim}
  action_dim: 7 # (x,y,z, r,p,y, gripper)
  # Number of future action steps predicted per forward pass.
  chunk_size: 16

View File

@@ -1,12 +1,9 @@
defaults: defaults:
- _self_ - _self_
- agent: default # 所有的子模块选择都在 agent/default.yaml 中完成了 - agent: debug_vla # <--- This tells Hydra to look in conf/agent/ and load debug_vla.yaml
- data: default_dataset # Future expansions:
- train: gpu # - data: robomimic_hdf5
# - train: standard
project_name: "vla_frame_refactored" # Global settings (optional for now)
seed: 42 seed: 42
hydra:
run:
dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}

View File

@@ -0,0 +1,51 @@
import abc
import torch
import torch.nn as nn
from typing import Dict, Any, Optional
class VLABackbone(nn.Module, abc.ABC):
    """
    Interface every vision/language backbone must satisfy.

    Implementations consume an observation dict and emit a sequence of
    embeddings shaped (B, Seq, Embed_Dim).
    """

    @abc.abstractmethod
    def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor:
        """
        Args:
            obs: Observation dict containing 'image' and, optionally, 'text'.

        Returns:
            A (B, S, D) feature tensor.
        """
        ...

    @property
    @abc.abstractmethod
    def embed_dim(self) -> int:
        """Width D of the embeddings this backbone produces."""
        ...
class VLAProjector(nn.Module, abc.ABC):
    """
    Interface for the adapter between backbone and policy head.

    Maps backbone features into the embedding space the head expects.
    """

    @abc.abstractmethod
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project feature tensor ``x``; shape contract set by subclasses."""
        ...
class VLAHead(nn.Module, abc.ABC):
    """
    Interface for action-generation heads (policies).

    A single forward covers both modes: training (ground-truth actions are
    supplied, a loss is returned) and inference (actions are predicted).
    """

    @abc.abstractmethod
    def forward(self, embeddings: torch.Tensor, actions: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        """
        Args:
            embeddings: (B, S, Hidden) features coming from the projector.
            actions: Optional (B, Pred_Horizon, Action_Dim) ground truth
                used for loss computation during training.

        Returns:
            Dict containing 'loss' (if actions provided) or 'pred_actions'.
        """
        ...

View File

View File

@@ -1,6 +1,8 @@
# Backbone models # Backbone models
from .siglip import SigLIPBackbone # Uncomment when these are implemented:
from .clip import CLIPBackbone # from .siglip import SigLIPBackbone
from .dinov2 import DinoV2Backbone # from .clip import CLIPBackbone
# from .dinov2 import DinoV2Backbone
from .debug import DebugBackbone
__all__ = ["SigLIPBackbone", "CLIPBackbone", "DinoV2Backbone"] __all__ = ["DebugBackbone"]

View File

@@ -0,0 +1,30 @@
import torch
import torch.nn as nn
from typing import Dict
from roboimi.vla.core.interfaces import VLABackbone
class DebugBackbone(VLABackbone):
    """
    Stub backbone emitting random features for pipeline smoke tests.

    Produces a (B, seq_len, embed_dim) tensor of Gaussian noise routed
    through a single zero-valued parameter so gradients can flow back.
    """

    def __init__(self, embed_dim: int = 768, seq_len: int = 10):
        super().__init__()
        self._embed_dim = embed_dim
        self.seq_len = seq_len
        # Zero-initialized learnable scalar; its only job is to anchor the
        # output in the autograd graph so loss.backward() reaches this module.
        self.dummy_param = nn.Parameter(torch.zeros(1))

    def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor:
        images = obs['image']
        fake_features = torch.randn(
            images.shape[0], self.seq_len, self._embed_dim, device=images.device
        )
        # Adding the (zero) parameter leaves values untouched but wires the
        # output to a trainable leaf, establishing a gradient path.
        return fake_features + self.dummy_param

    @property
    def embed_dim(self) -> int:
        return self._embed_dim

View File

@@ -1,5 +1,9 @@
# Action Head models # # Action Head models
from .diffusion import DiffusionActionHead # from .diffusion import DiffusionActionHead
from .act import ACTHead # from .act import ACTHead
__all__ = ["DiffusionActionHead", "ACTHead"] # __all__ = ["DiffusionActionHead", "ACTHead"]
from .debug import DebugHead
__all__ = ["DebugHead"]

View File

@@ -0,0 +1,33 @@
import torch
import torch.nn as nn
from typing import Dict, Optional
from roboimi.vla.core.interfaces import VLAHead
class DebugHead(VLAHead):
    """
    Minimal MSE-regression head for architecture verification.

    Mean-pools the sequence dimension and linearly maps the pooled
    embedding to a flat action chunk; stands in for Diffusion/ACT heads.
    """

    def __init__(self, input_dim: int, action_dim: int, chunk_size: int = 16):
        super().__init__()
        # Single linear layer regresses embedding -> chunk_size * action_dim.
        self.regressor = nn.Linear(input_dim, chunk_size * action_dim)
        self.action_dim = action_dim
        self.chunk_size = chunk_size
        self.loss_fn = nn.MSELoss()

    def forward(self, embeddings: torch.Tensor, actions: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        # (B, S, H) -> (B, H): average over the sequence axis.
        pooled = embeddings.mean(dim=1)
        # (B, H) -> (B, chunk_size * action_dim) -> (B, chunk_size, action_dim)
        flat_prediction = self.regressor(pooled)
        chunked = flat_prediction.view(-1, self.chunk_size, self.action_dim)
        result = {"pred_actions": chunked}
        if actions is not None:
            # Training mode: report MSE against the ground-truth chunk.
            result["loss"] = self.loss_fn(chunked, actions)
        return result

View File

@@ -1,5 +1,9 @@
# Projector models # Projector models
from .mlp import MLPProjector # from .mlp import MLPProjector
from .perceiver import PerceiverResampler # from .perceiver import PerceiverResampler
__all__ = ["MLPProjector", "PerceiverResampler"] # __all__ = ["MLPProjector", "PerceiverResampler"]
from .mlp import MLPProjector
__all__ = ["MLPProjector"]

View File

@@ -1 +1,19 @@
# MLP Projector 实现 import torch
import torch.nn as nn
from roboimi.vla.core.interfaces import VLAProjector
class MLPProjector(VLAProjector):
    """
    Two-layer MLP adapter from backbone width to head width.

    Pipeline: Linear(input_dim -> output_dim) -> GELU ->
    Linear(output_dim -> output_dim).
    """

    def __init__(self, input_dim: int, output_dim: int):
        super().__init__()
        layers = [
            nn.Linear(input_dim, output_dim),
            nn.GELU(),
            nn.Linear(output_dim, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)

View File

@@ -1 +0,0 @@
# 将图片文件夹转为 HDF5 格式

View File

@@ -0,0 +1,58 @@
import hydra
import torch
from omegaconf import DictConfig, OmegaConf
from roboimi.vla.agent import VLAAgent
@hydra.main(version_base=None, config_path="../conf", config_name="config")
def main(cfg: DictConfig):
    """Smoke-test the VLAAgent skeleton: build it from the Hydra config,
    run one forward/backward pass on mock data, and confirm gradients
    reach the backbone."""
    print(">>> Initializing VLA Agent (Skeleton Phase)...")

    # Imported lazily so Hydra has already configured the runtime.
    from hydra.utils import instantiate

    # Build the full backbone/projector/head graph from cfg.agent
    # (pointed at the debug configuration via the defaults list).
    agent: VLAAgent = instantiate(cfg.agent)
    print(f"✅ Agent assembled: {type(agent).__name__}")
    print(f" - Backbone: {type(agent.backbone).__name__}")
    print(f" - Projector: {type(agent.projector).__name__}")
    print(f" - Head: {type(agent.head).__name__}")

    # Mock Data — shapes assumed by the debug modules:
    # image (B, 3, 224, 224), actions (B, Chunk, Act_Dim).
    n_samples = 2
    fake_obs = {
        'image': torch.randn(n_samples, 3, 224, 224),
        'text': ["pick up apple"] * n_samples
    }
    batch = {
        'obs': fake_obs,
        'actions': torch.randn(n_samples, 16, 7)  # (B, Chunk, Act_Dim)
    }

    # Forward Pass — 'actions' present, so the head returns a loss.
    print("\n>>> Running Forward Pass...")
    outputs = agent(batch)
    loss = outputs['loss']
    print(f"✅ Forward successful. Loss: {loss.item():.4f}")

    # Backward Pass — verifies the autograd chain is unbroken.
    print("\n>>> Running Backward Pass...")
    loss.backward()
    # DebugBackbone carries a dummy parameter precisely so a gradient
    # can show up here; fall back to scanning all backbone parameters.
    grads_reached_backbone = agent.backbone.dummy_param.grad is not None or any(
        p.grad is not None for p in agent.backbone.parameters()
    )
    if grads_reached_backbone:
        print("✅ Backward successful. Gradients reached Backbone.")
    else:
        print("❌ Warning: No gradients found in Backbone.")


if __name__ == "__main__":
    main()