From bd8bbb0cfc2d16b0e57f3543703da2d44ce5f240 Mon Sep 17 00:00:00 2001
From: gouhanke <12219217+gouhanke@user.noreply.gitee.com>
Date: Tue, 3 Feb 2026 16:14:54 +0800
Subject: [PATCH] =?UTF-8?q?debug:=20=E6=A0=B8=E5=BF=83=E9=AA=A8=E6=9E=B6?=
 =?UTF-8?q?=E4=BC=AA=E5=AE=9E=E7=8E=B0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .gitignore                                |   4 +-
 roboimi/__init__.py                       |   0
 roboimi/vla/agent.py                      | 165 ++++++++++++++--------
 roboimi/vla/conf/agent/debug_vla.yaml     |  24 ++++
 roboimi/vla/conf/config.yaml              |  15 +-
 roboimi/vla/core/interfaces.py            |  51 +++++++
 roboimi/vla/models/__init__.py            |   0
 roboimi/vla/models/backbones/__init__.py  |  10 +-
 roboimi/vla/models/backbones/debug.py     |  30 ++++
 roboimi/vla/models/heads/__init__.py      |  12 +-
 roboimi/vla/models/heads/debug.py         |  33 +++++
 roboimi/vla/models/projectors/__init__.py |  10 +-
 roboimi/vla/models/projectors/mlp.py      |  20 ++-
 roboimi/vla/scripts/convert_to_hdf5.py    |   1 -
 roboimi/vla/scripts/verify_arch.py        |  58 ++++++++
 15 files changed, 348 insertions(+), 85 deletions(-)
 create mode 100644 roboimi/__init__.py
 create mode 100644 roboimi/vla/conf/agent/debug_vla.yaml
 create mode 100644 roboimi/vla/core/interfaces.py
 create mode 100644 roboimi/vla/models/__init__.py
 create mode 100644 roboimi/vla/models/backbones/debug.py
 create mode 100644 roboimi/vla/models/heads/debug.py
 delete mode 100644 roboimi/vla/scripts/convert_to_hdf5.py
 create mode 100644 roboimi/vla/scripts/verify_arch.py

diff --git a/.gitignore b/.gitignore
index 6e9a55d..cec3a36 100644
--- a/.gitignore
+++ b/.gitignore
@@ -123,4 +123,6 @@ CLAUDE.md
 GEMINI.md
 
 # Copilot
-.github/copilot-instructions.md
\ No newline at end of file
+.github/copilot-instructions.md
+
+.hydra/
\ No newline at end of file
diff --git a/roboimi/__init__.py b/roboimi/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/roboimi/vla/agent.py b/roboimi/vla/agent.py
index 6009b90..e3133ab 100644
--- a/roboimi/vla/agent.py
+++ b/roboimi/vla/agent.py
@@ -1,73 +1,114 @@
-# roboimi/vla/agent.py
-
 import torch
 import torch.nn as nn
-from typing import Optional, Dict, Union
+from typing import Dict, Optional, Any
+from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead
 
 class VLAAgent(nn.Module):
-    def __init__(self, 
-                 vlm_backbone: nn.Module,
-                 img_projector: nn.Module,
-                 action_head: nn.Module,
-                 state_dim: int,
-                 embed_dim: int):
+    """
+    The main assembly class.
+    Flow: Obs -> Backbone -> Projector -> Head -> Action/Loss
+    """
+    def __init__(
+        self,
+        backbone: VLABackbone,
+        projector: VLAProjector,
+        head: VLAHead
+    ):
         super().__init__()
-        self.vlm_backbone = vlm_backbone
-        self.img_projector = img_projector
-        self.action_head = action_head
-        
-        # 简单的状态编码器 (通常不需要复杂的 config，直接写在这里即可)
-        self.state_encoder = nn.Sequential(
-            nn.Linear(state_dim, embed_dim),
-            nn.Mish(),
-            nn.Linear(embed_dim, embed_dim)
-        )
+        self.backbone = backbone
+        self.projector = projector
+        self.head = head
 
-    def forward(self, 
-                images: torch.Tensor, 
-                state: torch.Tensor, 
-                text: Optional[Union[str, list]] = None, 
-                actions: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Dict]:
+    def forward(self, batch: Dict[str, Any]) -> Dict[str, torch.Tensor]:
         """
         Args:
-            images: [Batch, Obs_Horizon, C, H, W]  注意: 这里需要处理时间维度
-            state:  [Batch, Obs_Horizon, State_Dim]
-            text:   Optional text instructions
-            actions: [Batch, Pred_Horizon, Action_Dim] (Training only)
-            
-        Returns:
-            Training: Loss scalar
-            Inference: Predicted actions
+            batch: Dict containing 'obs' (image/text) and 'actions' (ground truth)
         """
-        
-        B, T, C, H, W = images.shape
-        
-        # 1. 图像编码 (Flatten time dimension for efficiency)
-        # [B*T, C, H, W] -> [B*T, Vision_Dim]
-        flat_images = images.view(B * T, C, H, W)
-        vision_feats_dict = self.vlm_backbone(flat_images)
-        raw_img_emb = vision_feats_dict['image_embeds'] # [B*T, Vision_Dim]
-        
-        # 投影并还原时间维度 -> [B, T, Embed_Dim]
-        img_emb = self.img_projector(raw_img_emb)
-        img_emb = img_emb.view(B, T, -1)
-        
-        # 2. 状态编码
-        state_emb = self.state_encoder(state) # [B, T, Embed_Dim]
+        # 1. Extract Features
+        # Shape: (B, Seq, Backbone_Dim)
+        features = self.backbone(batch['obs'])
 
-        # 3. 特征融合 (这里做一个简单的 Early Fusion 示例)
-        # 将图像特征和状态特征在特征维度拼接，或在时间维度拼接
-        # 假设我们只用最近的一帧图像作为 Context，或者将所有历史特征作为 Context
-        # 这里演示：Context = (Image_History + State_History)
-        # [B, T, Embed] + [B, T, Embed] -> [B, 2*T, Embed] (Concat on time)
-        context = torch.cat([img_emb, state_emb], dim=1) 
+        # 2. Project Features
+        # Shape: (B, Seq, Head_Dim)
+        embeddings = self.projector(features)
+
+        # 3. Compute Action/Loss
+        # 'actions' is optional: a batch without that key yields actions=None.
+        actions = batch.get('actions', None)
+        outputs = self.head(embeddings=embeddings, actions=actions)
+
+        return outputs  # the head's output dict, passed through unchanged
+
+# # roboimi/vla/agent.py
+
+# import torch
+# import torch.nn as nn
+# from typing import Optional, Dict, Union
+
+# class VLAAgent(nn.Module):
+#     def __init__(self, 
+#                  vlm_backbone: nn.Module,
+#                  img_projector: nn.Module,
+#                  action_head: nn.Module,
+#                  state_dim: int,
+#                  embed_dim: int):
+#         super().__init__()
+#         self.vlm_backbone = vlm_backbone
+#         self.img_projector = img_projector
+#         self.action_head = action_head
         
-        # 4. Action Head 分支
-        if actions is not None:
-            # --- Training Mode ---
-            # 必须返回 Loss
-            return self.action_head.compute_loss(context, actions)
-        else:
-            # --- Inference Mode ---
-            # 必须返回预测的动作序列
-            return self.action_head.predict_action(context)
\ No newline at end of file
+#         # 简单的状态编码器 (通常不需要复杂的 config，直接写在这里即可)
+#         self.state_encoder = nn.Sequential(
+#             nn.Linear(state_dim, embed_dim),
+#             nn.Mish(),
+#             nn.Linear(embed_dim, embed_dim)
+#         )
+
+#     def forward(self, 
+#                 images: torch.Tensor, 
+#                 state: torch.Tensor, 
+#                 text: Optional[Union[str, list]] = None, 
+#                 actions: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Dict]:
+#         """
+#         Args:
+#             images: [Batch, Obs_Horizon, C, H, W]  注意: 这里需要处理时间维度
+#             state:  [Batch, Obs_Horizon, State_Dim]
+#             text:   Optional text instructions
+#             actions: [Batch, Pred_Horizon, Action_Dim] (Training only)
+            
+#         Returns:
+#             Training: Loss scalar
+#             Inference: Predicted actions
+#         """
+        
+#         B, T, C, H, W = images.shape
+        
+#         # 1. 图像编码 (Flatten time dimension for efficiency)
+#         # [B*T, C, H, W] -> [B*T, Vision_Dim]
+#         flat_images = images.view(B * T, C, H, W)
+#         vision_feats_dict = self.vlm_backbone(flat_images)
+#         raw_img_emb = vision_feats_dict['image_embeds'] # [B*T, Vision_Dim]
+        
+#         # 投影并还原时间维度 -> [B, T, Embed_Dim]
+#         img_emb = self.img_projector(raw_img_emb)
+#         img_emb = img_emb.view(B, T, -1)
+        
+#         # 2. 状态编码
+#         state_emb = self.state_encoder(state) # [B, T, Embed_Dim]
+
+#         # 3. 特征融合 (这里做一个简单的 Early Fusion 示例)
+#         # 将图像特征和状态特征在特征维度拼接，或在时间维度拼接
+#         # 假设我们只用最近的一帧图像作为 Context，或者将所有历史特征作为 Context
+#         # 这里演示：Context = (Image_History + State_History)
+#         # [B, T, Embed] + [B, T, Embed] -> [B, 2*T, Embed] (Concat on time)
+#         context = torch.cat([img_emb, state_emb], dim=1) 
+        
+#         # 4. Action Head 分支
+#         if actions is not None:
+#             # --- Training Mode ---
+#             # 必须返回 Loss
+#             return self.action_head.compute_loss(context, actions)
+#         else:
+#             # --- Inference Mode ---
+#             # 必须返回预测的动作序列
+#             return self.action_head.predict_action(context)
\ No newline at end of file
diff --git a/roboimi/vla/conf/agent/debug_vla.yaml b/roboimi/vla/conf/agent/debug_vla.yaml
new file mode 100644
index 0000000..f8962ab
--- /dev/null
+++ b/roboimi/vla/conf/agent/debug_vla.yaml
@@ -0,0 +1,24 @@
+_target_: roboimi.vla.agent.VLAAgent
+
+# 1. Backbone Configuration
+backbone:
+  _target_: roboimi.vla.models.backbones.debug.DebugBackbone
+  embed_dim: 768  # Variable A
+  seq_len: 10
+
+# 2. Projector Configuration
+projector:
+  _target_: roboimi.vla.models.projectors.mlp.MLPProjector
+  # Dependency injection via OmegaConf relative interpolation: '..' climbs from
+  # this key to the enclosing agent node, so this reads backbone.embed_dim above.
+  input_dim: ${..backbone.embed_dim} 
+  output_dim: 512 # Variable B (The bottleneck size)
+
+# 3. Head Configuration
+head:
+  _target_: roboimi.vla.models.heads.debug.DebugHead
+  # Dependency Injection via Interpolation:
+  # Takes 'output_dim' from the sibling 'projector' config above.
+  input_dim: ${..projector.output_dim}
+  action_dim: 7  # (x,y,z, r,p,y, gripper)
+  chunk_size: 16
\ No newline at end of file
diff --git a/roboimi/vla/conf/config.yaml b/roboimi/vla/conf/config.yaml
index a203c26..4e993e2 100644
--- a/roboimi/vla/conf/config.yaml
+++ b/roboimi/vla/conf/config.yaml
@@ -1,12 +1,9 @@
 defaults:
   - _self_
-  - agent: default      # 所有的子模块选择都在 agent/default.yaml 中完成了
-  - data: default_dataset
-  - train: gpu
+  - agent: debug_vla  # <--- This tells Hydra to look in conf/agent/ and load debug_vla.yaml
+  # Future expansions:
+  # - data: robomimic_hdf5
+  # - train: standard
 
-project_name: "vla_frame_refactored"
-seed: 42
-
-hydra:
-  run:
-    dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
\ No newline at end of file
+# Global settings (optional for now)
+seed: 42
\ No newline at end of file
diff --git a/roboimi/vla/core/interfaces.py b/roboimi/vla/core/interfaces.py
new file mode 100644
index 0000000..6c22139
--- /dev/null
+++ b/roboimi/vla/core/interfaces.py
@@ -0,0 +1,51 @@
+import abc
+import torch
+import torch.nn as nn
+from typing import Dict, Any, Optional
+
+class VLABackbone(nn.Module, abc.ABC):
+    """
+    Abstract interface (an nn.Module) for Vision/Language Backbones.
+    Implementations must return a feature tensor of shape (B, Seq, Embed_Dim).
+    """
+    @abc.abstractmethod
+    def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor:
+        """
+        Args:
+            obs: Dictionary containing 'image' and optionally 'text'.
+        Returns:
+            features: (B, S, D) embedding; D is expected to match `embed_dim`.
+        """
+        pass
+
+    @property
+    @abc.abstractmethod
+    def embed_dim(self) -> int:  # width of the feature dimension of forward()'s output
+        pass
+
+
+class VLAProjector(nn.Module, abc.ABC):
+    """
+    Abstract interface for the adaptation layer (Projector) that sits
+    between the Backbone features and the Policy Head.
+    """
+    @abc.abstractmethod
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        pass  # subclasses map backbone features to the embeddings the head consumes
+
+
+class VLAHead(nn.Module, abc.ABC):
+    """
+    Abstract interface for Action Generation Heads (Policies).
+    A single forward() serves both training (loss) and inference (actions).
+    """
+    @abc.abstractmethod
+    def forward(self, embeddings: torch.Tensor, actions: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
+        """
+        Args:
+            embeddings: (B, S, Hidden) from Projector.
+            actions: (B, Pred_Horizon, Action_Dim) ground truth; None at inference.
+        Returns:
+            Dict with 'loss' when actions are provided and/or 'pred_actions' (may hold both).
+        """
+        pass
\ No newline at end of file
diff --git a/roboimi/vla/models/__init__.py b/roboimi/vla/models/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/roboimi/vla/models/backbones/__init__.py b/roboimi/vla/models/backbones/__init__.py
index b28dec3..89c86b2 100644
--- a/roboimi/vla/models/backbones/__init__.py
+++ b/roboimi/vla/models/backbones/__init__.py
@@ -1,6 +1,8 @@
 # Backbone models
-from .siglip import SigLIPBackbone
-from .clip import CLIPBackbone
-from .dinov2 import DinoV2Backbone
+# Uncomment when these are implemented:
+# from .siglip import SigLIPBackbone
+# from .clip import CLIPBackbone
+# from .dinov2 import DinoV2Backbone
+from .debug import DebugBackbone
 
-__all__ = ["SigLIPBackbone", "CLIPBackbone", "DinoV2Backbone"]
+__all__ = ["DebugBackbone"]
diff --git a/roboimi/vla/models/backbones/debug.py b/roboimi/vla/models/backbones/debug.py
new file mode 100644
index 0000000..4c85b98
--- /dev/null
+++ b/roboimi/vla/models/backbones/debug.py
@@ -0,0 +1,30 @@
+import torch
+import torch.nn as nn
+from typing import Dict
+from roboimi.vla.core.interfaces import VLABackbone
+
+class DebugBackbone(VLABackbone):
+    """
+    Stand-in backbone for wiring tests: emits Gaussian noise, ignores image content.
+    """
+    def __init__(self, embed_dim: int = 768, seq_len: int = 10):
+        super().__init__()
+        self.seq_len = seq_len
+        self._embed_dim = embed_dim
+        # Single zero-valued weight; its only job is to give autograd a leaf to reach.
+        self.dummy_param = nn.Parameter(torch.zeros(1))
+
+    def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor:
+        # Only the batch size and device of the image are used; pixels are never read.
+        image = obs['image']
+        shape = (image.shape[0], self.seq_len, self._embed_dim)
+        # Fresh standard-normal features of shape (B, seq_len, embed_dim) on every call.
+        features = torch.randn(shape, device=image.device)
+
+        # Broadcast-adding the zero parameter leaves the values unchanged but ties the
+        # output to dummy_param, so backward() populates its .grad.
+        return features + self.dummy_param
+
+    @property
+    def embed_dim(self) -> int:
+        return self._embed_dim
\ No newline at end of file
diff --git a/roboimi/vla/models/heads/__init__.py b/roboimi/vla/models/heads/__init__.py
index 9de0395..5fb9af2 100644
--- a/roboimi/vla/models/heads/__init__.py
+++ b/roboimi/vla/models/heads/__init__.py
@@ -1,5 +1,9 @@
-# Action Head models
-from .diffusion import DiffusionActionHead
-from .act import ACTHead
+# # Action Head models
+# from .diffusion import DiffusionActionHead
+# from .act import ACTHead
 
-__all__ = ["DiffusionActionHead", "ACTHead"]
+# __all__ = ["DiffusionActionHead", "ACTHead"]
+
+from .debug import DebugHead
+
+__all__ = ["DebugHead"]
\ No newline at end of file
diff --git a/roboimi/vla/models/heads/debug.py b/roboimi/vla/models/heads/debug.py
new file mode 100644
index 0000000..49f0924
--- /dev/null
+++ b/roboimi/vla/models/heads/debug.py
@@ -0,0 +1,33 @@
+import torch
+import torch.nn as nn
+from typing import Dict, Optional
+from roboimi.vla.core.interfaces import VLAHead
+
+class DebugHead(VLAHead):
+    """
+    Minimal stand-in action head: mean-pool, one linear layer, MSE loss.
+    Lets the pipeline be exercised without a Diffusion/ACT policy.
+    """
+    def __init__(self, input_dim: int, action_dim: int, chunk_size: int = 16):
+        super().__init__()
+        self.action_dim = action_dim
+        self.chunk_size = chunk_size
+        # One linear map from a pooled embedding to a flattened chunk of actions.
+        self.regressor = nn.Linear(input_dim, chunk_size * action_dim)
+        self.loss_fn = nn.MSELoss()
+
+    def forward(self, embeddings: torch.Tensor, actions: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
+        # Average over dim 1 (the sequence axis), then regress:
+        # (B, S, Hidden) -> (B, Hidden) -> (B, chunk_size * action_dim).
+        flat_prediction = self.regressor(embeddings.mean(dim=1))
+
+        # Unflatten into one action vector per step of the chunk.
+        chunk_shape = (-1, self.chunk_size, self.action_dim)
+        result = {"pred_actions": flat_prediction.view(*chunk_shape)}
+
+        # The loss is only reported when ground-truth actions were supplied.
+        if actions is None:
+            return result
+
+        result["loss"] = self.loss_fn(result["pred_actions"], actions)
+        return result
\ No newline at end of file
diff --git a/roboimi/vla/models/projectors/__init__.py b/roboimi/vla/models/projectors/__init__.py
index 14ca3df..1d0ccb1 100644
--- a/roboimi/vla/models/projectors/__init__.py
+++ b/roboimi/vla/models/projectors/__init__.py
@@ -1,5 +1,9 @@
 # Projector models
-from .mlp import MLPProjector
-from .perceiver import PerceiverResampler
+# from .mlp import MLPProjector
+# from .perceiver import PerceiverResampler
 
-__all__ = ["MLPProjector", "PerceiverResampler"]
\ No newline at end of file
+# __all__ = ["MLPProjector", "PerceiverResampler"]
+
+from .mlp import MLPProjector
+
+__all__ = ["MLPProjector"]
\ No newline at end of file
diff --git a/roboimi/vla/models/projectors/mlp.py b/roboimi/vla/models/projectors/mlp.py
index 0e7f7de..03655e0 100644
--- a/roboimi/vla/models/projectors/mlp.py
+++ b/roboimi/vla/models/projectors/mlp.py
@@ -1 +1,19 @@
-# MLP Projector 实现
+import torch
+import torch.nn as nn
+from roboimi.vla.core.interfaces import VLAProjector
+
+class MLPProjector(VLAProjector):
+    """
+    Two-layer MLP (Linear -> GELU -> Linear) applied to the last dimension.
+    Adapts features of width input_dim (Backbone) to width output_dim (Head).
+    """
+    def __init__(self, input_dim: int, output_dim: int):
+        super().__init__()
+        # Built in application order; nn.Sequential indexes them 0, 1, 2.
+        fc_in = nn.Linear(input_dim, output_dim)
+        activation = nn.GELU()
+        fc_out = nn.Linear(output_dim, output_dim)
+        self.net = nn.Sequential(fc_in, activation, fc_out)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return self.net(x)
\ No newline at end of file
diff --git a/roboimi/vla/scripts/convert_to_hdf5.py b/roboimi/vla/scripts/convert_to_hdf5.py
deleted file mode 100644
index 4db4a47..0000000
--- a/roboimi/vla/scripts/convert_to_hdf5.py
+++ /dev/null
@@ -1 +0,0 @@
-# 将图片文件夹转为 HDF5 格式
diff --git a/roboimi/vla/scripts/verify_arch.py b/roboimi/vla/scripts/verify_arch.py
new file mode 100644
index 0000000..84c5984
--- /dev/null
+++ b/roboimi/vla/scripts/verify_arch.py
@@ -0,0 +1,58 @@
+import hydra
+import torch
+from omegaconf import DictConfig, OmegaConf
+from roboimi.vla.agent import VLAAgent
+
+@hydra.main(version_base=None, config_path="../conf", config_name="config")
+def main(cfg: DictConfig):
+    """
+    Smoke-test the VLA wiring: build the agent from `cfg.agent`, run one
+    forward pass on random data, backpropagate, and report whether any
+    backbone parameter received a gradient.
+    """
+    print(">>> Initializing VLA Agent (Skeleton Phase)...")
+    # Imported here because the module's top-level imports do not include it.
+    from hydra.utils import instantiate
+
+    # Which agent gets built is decided by the `agent` entry of the Hydra config.
+    agent: VLAAgent = instantiate(cfg.agent)
+
+    print(f"✅ Agent assembled: {type(agent).__name__}")
+    print(f"   - Backbone: {type(agent.backbone).__name__}")
+    print(f"   - Projector: {type(agent.projector).__name__}")
+    print(f"   - Head: {type(agent.head).__name__}")
+
+    # Mock Data
+    batch_size = 2
+    dummy_obs = {
+        'image': torch.randn(batch_size, 3, 224, 224),
+        'text': ["pick up apple"] * batch_size
+    }
+    # (B, Chunk, Act_Dim); 16 and 7 mirror chunk_size/action_dim in debug_vla.yaml.
+    dummy_actions = torch.randn(batch_size, 16, 7)
+
+    batch = {
+        'obs': dummy_obs,
+        'actions': dummy_actions
+    }
+
+    # Forward Pass
+    print("\n>>> Running Forward Pass...")
+    outputs = agent(batch)
+    loss = outputs['loss']
+    print(f"✅ Forward successful. Loss: {loss.item():.4f}")
+
+    # Backward Pass (Check Autograd Graph)
+    print("\n>>> Running Backward Pass...")
+    loss.backward()
+
+    # A gradient on any backbone parameter proves the chain back to it is intact.
+    # Checking parameters() keeps this valid for backbones without `dummy_param`.
+    backbone_has_grad = any(p.grad is not None for p in agent.backbone.parameters())
+    if backbone_has_grad:
+        print("✅ Backward successful. Gradients reached Backbone.")
+    else:
+        print("❌ Warning: No gradients found in Backbone.")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file