debug: 核心骨架伪实现

This commit is contained in:
gouhanke
2026-02-03 16:14:54 +08:00
parent d3863ea1dd
commit bd8bbb0cfc
15 changed files with 348 additions and 85 deletions

2
.gitignore vendored
View File

@@ -124,3 +124,5 @@ GEMINI.md
# Copilot
.github/copilot-instructions.md
.hydra/

0
roboimi/__init__.py Normal file
View File

View File

@@ -1,73 +1,114 @@
# roboimi/vla/agent.py
import torch
import torch.nn as nn
from typing import Optional, Dict, Union
from typing import Dict, Optional, Any
from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead
class VLAAgent(nn.Module):
def __init__(self,
vlm_backbone: nn.Module,
img_projector: nn.Module,
action_head: nn.Module,
state_dim: int,
embed_dim: int):
"""
The main assembly class.
Flow: Obs -> Backbone -> Projector -> Head -> Action/Loss
"""
def __init__(
self,
backbone: VLABackbone,
projector: VLAProjector,
head: VLAHead
):
super().__init__()
self.vlm_backbone = vlm_backbone
self.img_projector = img_projector
self.action_head = action_head
self.backbone = backbone
self.projector = projector
self.head = head
# 简单的状态编码器 (通常不需要复杂的 config直接写在这里即可)
self.state_encoder = nn.Sequential(
nn.Linear(state_dim, embed_dim),
nn.Mish(),
nn.Linear(embed_dim, embed_dim)
)
def forward(self,
images: torch.Tensor,
state: torch.Tensor,
text: Optional[Union[str, list]] = None,
actions: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Dict]:
def forward(self, batch: Dict[str, Any]) -> Dict[str, torch.Tensor]:
"""
Args:
images: [Batch, Obs_Horizon, C, H, W] 注意: 这里需要处理时间维度
state: [Batch, Obs_Horizon, State_Dim]
text: Optional text instructions
actions: [Batch, Pred_Horizon, Action_Dim] (Training only)
Returns:
Training: Loss scalar
Inference: Predicted actions
batch: Dict containing 'obs' (image/text) and 'actions' (ground truth)
"""
# 1. Extract Features
# Shape: (B, Seq, Backbone_Dim)
features = self.backbone(batch['obs'])
B, T, C, H, W = images.shape
# 2. Project Features
# Shape: (B, Seq, Head_Dim)
embeddings = self.projector(features)
# 1. 图像编码 (Flatten time dimension for efficiency)
# [B*T, C, H, W] -> [B*T, Vision_Dim]
flat_images = images.view(B * T, C, H, W)
vision_feats_dict = self.vlm_backbone(flat_images)
raw_img_emb = vision_feats_dict['image_embeds'] # [B*T, Vision_Dim]
# 3. Compute Action/Loss
# We pass actions if they exist (training mode)
actions = batch.get('actions', None)
outputs = self.head(embeddings=embeddings, actions=actions)
# 投影并还原时间维度 -> [B, T, Embed_Dim]
img_emb = self.img_projector(raw_img_emb)
img_emb = img_emb.view(B, T, -1)
return outputs
# 2. 状态编码
state_emb = self.state_encoder(state) # [B, T, Embed_Dim]
# # roboimi/vla/agent.py
# 3. 特征融合 (这里做一个简单的 Early Fusion 示例)
# 将图像特征和状态特征在特征维度拼接,或在时间维度拼接
# 假设我们只用最近的一帧图像作为 Context或者将所有历史特征作为 Context
# 这里演示Context = (Image_History + State_History)
# [B, T, Embed] + [B, T, Embed] -> [B, 2*T, Embed] (Concat on time)
context = torch.cat([img_emb, state_emb], dim=1)
# import torch
# import torch.nn as nn
# from typing import Optional, Dict, Union
# 4. Action Head 分支
if actions is not None:
# --- Training Mode ---
# 必须返回 Loss
return self.action_head.compute_loss(context, actions)
else:
# --- Inference Mode ---
# 必须返回预测的动作序列
return self.action_head.predict_action(context)
# class VLAAgent(nn.Module):
# def __init__(self,
# vlm_backbone: nn.Module,
# img_projector: nn.Module,
# action_head: nn.Module,
# state_dim: int,
# embed_dim: int):
# super().__init__()
# self.vlm_backbone = vlm_backbone
# self.img_projector = img_projector
# self.action_head = action_head
# # 简单的状态编码器 (通常不需要复杂的 config直接写在这里即可)
# self.state_encoder = nn.Sequential(
# nn.Linear(state_dim, embed_dim),
# nn.Mish(),
# nn.Linear(embed_dim, embed_dim)
# )
# def forward(self,
# images: torch.Tensor,
# state: torch.Tensor,
# text: Optional[Union[str, list]] = None,
# actions: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Dict]:
# """
# Args:
# images: [Batch, Obs_Horizon, C, H, W] 注意: 这里需要处理时间维度
# state: [Batch, Obs_Horizon, State_Dim]
# text: Optional text instructions
# actions: [Batch, Pred_Horizon, Action_Dim] (Training only)
# Returns:
# Training: Loss scalar
# Inference: Predicted actions
# """
# B, T, C, H, W = images.shape
# # 1. 图像编码 (Flatten time dimension for efficiency)
# # [B*T, C, H, W] -> [B*T, Vision_Dim]
# flat_images = images.view(B * T, C, H, W)
# vision_feats_dict = self.vlm_backbone(flat_images)
# raw_img_emb = vision_feats_dict['image_embeds'] # [B*T, Vision_Dim]
# # 投影并还原时间维度 -> [B, T, Embed_Dim]
# img_emb = self.img_projector(raw_img_emb)
# img_emb = img_emb.view(B, T, -1)
# # 2. 状态编码
# state_emb = self.state_encoder(state) # [B, T, Embed_Dim]
# # 3. 特征融合 (这里做一个简单的 Early Fusion 示例)
# # 将图像特征和状态特征在特征维度拼接,或在时间维度拼接
# # 假设我们只用最近的一帧图像作为 Context或者将所有历史特征作为 Context
# # 这里演示Context = (Image_History + State_History)
# # [B, T, Embed] + [B, T, Embed] -> [B, 2*T, Embed] (Concat on time)
# context = torch.cat([img_emb, state_emb], dim=1)
# # 4. Action Head 分支
# if actions is not None:
# # --- Training Mode ---
# # 必须返回 Loss
# return self.action_head.compute_loss(context, actions)
# else:
# # --- Inference Mode ---
# # 必须返回预测的动作序列
# return self.action_head.predict_action(context)

View File

@@ -0,0 +1,24 @@
# Debug agent configuration: assembles the skeleton VLAAgent from three
# swappable parts (backbone -> projector -> head) via Hydra _target_ keys.
_target_: roboimi.vla.agent.VLAAgent

# 1. Backbone Configuration (fake feature extractor for wiring tests)
backbone:
  _target_: roboimi.vla.models.backbones.debug.DebugBackbone
  embed_dim: 768  # Variable A: width of backbone features
  seq_len: 10

# 2. Projector Configuration (adapts backbone width to head width)
projector:
  _target_: roboimi.vla.models.projectors.mlp.MLPProjector
  # Dependency Injection via Interpolation:
  # Takes 'embed_dim' from the sibling 'backbone' config above.
  input_dim: ${..backbone.embed_dim}
  output_dim: 512  # Variable B (The bottleneck size)

# 3. Head Configuration (MSE regression head for debugging)
head:
  _target_: roboimi.vla.models.heads.debug.DebugHead
  # Dependency Injection via Interpolation:
  # Takes 'output_dim' from the sibling 'projector' config above.
  input_dim: ${..projector.output_dim}
  action_dim: 7  # (x,y,z, r,p,y, gripper)
  chunk_size: 16

View File

@@ -1,12 +1,9 @@
defaults:
- _self_
- agent: default # 所有的子模块选择都在 agent/default.yaml 中完成了
- data: default_dataset
- train: gpu
- agent: debug_vla # <--- This tells Hydra to look in conf/agent/ and load debug_vla.yaml
# Future expansions:
# - data: robomimic_hdf5
# - train: standard
project_name: "vla_frame_refactored"
# Global settings (optional for now)
seed: 42
hydra:
run:
dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}

View File

@@ -0,0 +1,51 @@
import abc
import torch
import torch.nn as nn
from typing import Dict, Any, Optional
class VLABackbone(nn.Module, abc.ABC):
    """
    Abstract contract for vision/language backbones.

    Implementations consume an observation dict and emit a sequence of
    embeddings shaped (Batch, Seq, Embed_Dim).
    """

    @abc.abstractmethod
    def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor:
        """
        Args:
            obs: Observation dict with an 'image' entry and, optionally, 'text'.

        Returns:
            A (B, S, D) feature tensor.
        """
        ...

    @property
    @abc.abstractmethod
    def embed_dim(self) -> int:
        """Width D of the embeddings produced by forward()."""
        ...
class VLAProjector(nn.Module, abc.ABC):
    """
    Abstract contract for the adaptation layer that connects backbone
    features to the policy head's input space.
    """

    @abc.abstractmethod
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a (B, S, D_in) feature tensor to the head's width."""
        ...
class VLAHead(nn.Module, abc.ABC):
    """
    Abstract contract for action-generation heads (policies).

    A head serves both phases of the pipeline: it computes a loss when
    ground-truth actions are supplied (training) and produces predicted
    actions otherwise (inference).
    """

    @abc.abstractmethod
    def forward(
        self,
        embeddings: torch.Tensor,
        actions: Optional[torch.Tensor] = None,
    ) -> Dict[str, torch.Tensor]:
        """
        Args:
            embeddings: (B, S, Hidden) features coming from the projector.
            actions: Optional (B, Pred_Horizon, Action_Dim) ground truth,
                supplied only during training.

        Returns:
            Dict containing 'loss' (if actions provided) or 'pred_actions'.
        """
        ...

View File

View File

@@ -1,6 +1,8 @@
# Backbone models
from .siglip import SigLIPBackbone
from .clip import CLIPBackbone
from .dinov2 import DinoV2Backbone
# Uncomment when these are implemented:
# from .siglip import SigLIPBackbone
# from .clip import CLIPBackbone
# from .dinov2 import DinoV2Backbone
from .debug import DebugBackbone
__all__ = ["SigLIPBackbone", "CLIPBackbone", "DinoV2Backbone"]
__all__ = ["DebugBackbone"]

View File

@@ -0,0 +1,30 @@
import torch
import torch.nn as nn
from typing import Dict
from roboimi.vla.core.interfaces import VLABackbone
class DebugBackbone(VLABackbone):
    """
    Stand-in backbone that emits random feature tensors.

    Exists purely to verify the agent wiring end-to-end before a real
    vision/language model is plugged in.
    """

    def __init__(self, embed_dim: int = 768, seq_len: int = 10):
        super().__init__()
        self._embed_dim = embed_dim
        self.seq_len = seq_len
        # Single trainable scalar so this module participates in autograd.
        self.dummy_param = nn.Parameter(torch.zeros(1))

    def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor:
        images = obs['image']
        # Fake (B, S, D) features on the same device as the input images.
        fake_feats = torch.randn(
            images.shape[0], self.seq_len, self._embed_dim, device=images.device
        )
        # Adding the zero-valued parameter leaves the values unchanged but
        # threads dummy_param into the computation graph, establishing a
        # gradient path from downstream losses back into this module.
        return fake_feats + self.dummy_param

    @property
    def embed_dim(self) -> int:
        return self._embed_dim

View File

@@ -1,5 +1,9 @@
# Action Head models
from .diffusion import DiffusionActionHead
from .act import ACTHead
# # Action Head models
# from .diffusion import DiffusionActionHead
# from .act import ACTHead
__all__ = ["DiffusionActionHead", "ACTHead"]
# __all__ = ["DiffusionActionHead", "ACTHead"]
from .debug import DebugHead
__all__ = ["DebugHead"]

View File

@@ -0,0 +1,33 @@
import torch
import torch.nn as nn
from typing import Dict, Optional
from roboimi.vla.core.interfaces import VLAHead
class DebugHead(VLAHead):
    """
    Minimal regression head used to validate the training pipeline.

    Stands in for heavier policies (Diffusion / ACT): one linear layer maps
    a pooled embedding to a fixed-length action chunk, trained with MSE.
    """

    def __init__(self, input_dim: int, action_dim: int, chunk_size: int = 16):
        super().__init__()
        self.action_dim = action_dim
        self.chunk_size = chunk_size
        # Single linear map: pooled embedding -> flattened action chunk.
        self.regressor = nn.Linear(input_dim, chunk_size * action_dim)
        self.loss_fn = nn.MSELoss()

    def forward(self, embeddings: torch.Tensor, actions: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        """
        Args:
            embeddings: (B, S, Hidden) sequence features from the projector.
            actions: Optional (B, chunk_size, action_dim) ground truth.

        Returns:
            Dict with 'pred_actions', plus 'loss' when actions are given.
        """
        # Mean-pool over the sequence axis: (B, S, H) -> (B, H).
        context = embeddings.mean(dim=1)
        # (B, chunk_size * action_dim) -> (B, chunk_size, action_dim).
        flat = self.regressor(context)
        output = {"pred_actions": flat.view(-1, self.chunk_size, self.action_dim)}
        if actions is not None:
            # MSE against the ground-truth chunk.
            output["loss"] = self.loss_fn(output["pred_actions"], actions)
        return output

View File

@@ -1,5 +1,9 @@
# Projector models
from .mlp import MLPProjector
from .perceiver import PerceiverResampler
# from .mlp import MLPProjector
# from .perceiver import PerceiverResampler
__all__ = ["MLPProjector", "PerceiverResampler"]
# __all__ = ["MLPProjector", "PerceiverResampler"]
from .mlp import MLPProjector
__all__ = ["MLPProjector"]

View File

@@ -1 +1,19 @@
# MLP Projector 实现
import torch
import torch.nn as nn
from roboimi.vla.core.interfaces import VLAProjector
class MLPProjector(VLAProjector):
    """
    Two-layer MLP adapter: Linear -> GELU -> Linear.

    Maps backbone features of width input_dim to the head's expected
    width output_dim, applied position-wise over the sequence.
    """

    def __init__(self, input_dim: int, output_dim: int):
        super().__init__()
        layers = [
            nn.Linear(input_dim, output_dim),
            nn.GELU(),
            nn.Linear(output_dim, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)

View File

@@ -1 +0,0 @@
# 将图片文件夹转为 HDF5 格式

View File

@@ -0,0 +1,58 @@
import hydra
import torch
from omegaconf import DictConfig, OmegaConf
from roboimi.vla.agent import VLAAgent
@hydra.main(version_base=None, config_path="../conf", config_name="config")
def main(cfg: DictConfig):
    """Smoke-test the skeleton VLA stack.

    Instantiates the agent from the Hydra config, runs one forward pass on
    mock data, then one backward pass, and checks that gradients reach the
    backbone (i.e. the autograd graph is connected end-to-end).
    """
    print(">>> Initializing VLA Agent (Skeleton Phase)...")
    # For this test, we override the default agent with our debug config
    # In a real run, this would be set via command line or defaults list
    from hydra.utils import instantiate
    # Instantiate the agent using the debug configuration
    # Assuming 'agent' is a key in your root config.yaml that points to debug_vla
    # If testing isolated, we instantiate the structure directly.
    agent: VLAAgent = instantiate(cfg.agent)
    print(f"✅ Agent assembled: {type(agent).__name__}")
    print(f" - Backbone: {type(agent.backbone).__name__}")
    print(f" - Projector: {type(agent.projector).__name__}")
    print(f" - Head: {type(agent.head).__name__}")
    # Mock Data
    # NOTE(review): shapes assume a (3, 224, 224) image and a 16x7 action
    # chunk, matching the debug_vla config defaults — confirm if that changes.
    batch_size = 2
    dummy_obs = {
        'image': torch.randn(batch_size, 3, 224, 224),
        'text': ["pick up apple"] * batch_size
    }
    dummy_actions = torch.randn(batch_size, 16, 7)  # (B, Chunk, Act_Dim)
    batch = {
        'obs': dummy_obs,
        'actions': dummy_actions
    }
    # Forward Pass
    print("\n>>> Running Forward Pass...")
    outputs = agent(batch)
    # 'loss' is present because the batch carries ground-truth actions.
    loss = outputs['loss']
    print(f"✅ Forward successful. Loss: {loss.item():.4f}")
    # Backward Pass (Check Autograd Graph)
    print("\n>>> Running Backward Pass...")
    loss.backward()
    # Verify gradients exist in the backbone (proving the chain is intact)
    # Note: DebugBackbone needs a dummy parameter to show grad
    backbone_has_grad = agent.backbone.dummy_param.grad is not None or \
                        any(p.grad is not None for p in agent.backbone.parameters())
    if backbone_has_grad:
        print("✅ Backward successful. Gradients reached Backbone.")
    else:
        print("❌ Warning: No gradients found in Backbone.")


if __name__ == "__main__":
    main()