From bd8bbb0cfc2d16b0e57f3543703da2d44ce5f240 Mon Sep 17 00:00:00 2001 From: gouhanke <12219217+gouhanke@user.noreply.gitee.com> Date: Tue, 3 Feb 2026 16:14:54 +0800 Subject: [PATCH] =?UTF-8?q?debug:=20=E6=A0=B8=E5=BF=83=E9=AA=A8=E6=9E=B6?= =?UTF-8?q?=E4=BC=AA=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .gitignore | 4 +- roboimi/__init__.py | 0 roboimi/vla/agent.py | 165 ++++++++++++++-------- roboimi/vla/conf/agent/debug_vla.yaml | 24 ++++ roboimi/vla/conf/config.yaml | 15 +- roboimi/vla/core/interfaces.py | 51 +++++++ roboimi/vla/models/__init__.py | 0 roboimi/vla/models/backbones/__init__.py | 10 +- roboimi/vla/models/backbones/debug.py | 30 ++++ roboimi/vla/models/heads/__init__.py | 12 +- roboimi/vla/models/heads/debug.py | 33 +++++ roboimi/vla/models/projectors/__init__.py | 10 +- roboimi/vla/models/projectors/mlp.py | 20 ++- roboimi/vla/scripts/convert_to_hdf5.py | 1 - roboimi/vla/scripts/verify_arch.py | 58 ++++++++ 15 files changed, 348 insertions(+), 85 deletions(-) create mode 100644 roboimi/__init__.py create mode 100644 roboimi/vla/conf/agent/debug_vla.yaml create mode 100644 roboimi/vla/core/interfaces.py create mode 100644 roboimi/vla/models/__init__.py create mode 100644 roboimi/vla/models/backbones/debug.py create mode 100644 roboimi/vla/models/heads/debug.py delete mode 100644 roboimi/vla/scripts/convert_to_hdf5.py create mode 100644 roboimi/vla/scripts/verify_arch.py diff --git a/.gitignore b/.gitignore index 6e9a55d..cec3a36 100644 --- a/.gitignore +++ b/.gitignore @@ -123,4 +123,6 @@ CLAUDE.md GEMINI.md # Copilot -.github/copilot-instructions.md \ No newline at end of file +.github/copilot-instructions.md + +.hydra/ \ No newline at end of file diff --git a/roboimi/__init__.py b/roboimi/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/roboimi/vla/agent.py b/roboimi/vla/agent.py index 6009b90..e3133ab 100644 --- a/roboimi/vla/agent.py +++ b/roboimi/vla/agent.py @@ -1,73 +1,114 @@ -# roboimi/vla/agent.py - import torch import torch.nn as nn -from typing import Optional, Dict, Union +from typing import Dict, Optional, Any +from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead class VLAAgent(nn.Module): - def __init__(self, - vlm_backbone: nn.Module, - img_projector: nn.Module, - action_head: nn.Module, - state_dim: int, - embed_dim: int): + """ + The main assembly class. + Flow: Obs -> Backbone -> Projector -> Head -> Action/Loss + """ + def __init__( + self, + backbone: VLABackbone, + projector: VLAProjector, + head: VLAHead + ): super().__init__() - self.vlm_backbone = vlm_backbone - self.img_projector = img_projector - self.action_head = action_head - - # 简单的状态编码器 (通常不需要复杂的 config,直接写在这里即可) - self.state_encoder = nn.Sequential( - nn.Linear(state_dim, embed_dim), - nn.Mish(), - nn.Linear(embed_dim, embed_dim) - ) + self.backbone = backbone + self.projector = projector + self.head = head - def forward(self, - images: torch.Tensor, - state: torch.Tensor, - text: Optional[Union[str, list]] = None, - actions: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Dict]: + def forward(self, batch: Dict[str, Any]) -> Dict[str, torch.Tensor]: """ Args: - images: [Batch, Obs_Horizon, C, H, W] 注意: 这里需要处理时间维度 - state: [Batch, Obs_Horizon, State_Dim] - text: Optional text instructions - actions: [Batch, Pred_Horizon, Action_Dim] (Training only) - - Returns: - Training: Loss scalar - Inference: Predicted actions + batch: Dict containing 'obs' (image/text) and 'actions' (ground truth) """ - - B, T, C, H, W = images.shape - - # 1. 图像编码 (Flatten time dimension for efficiency) - # [B*T, C, H, W] -> [B*T, Vision_Dim] - flat_images = images.view(B * T, C, H, W) - vision_feats_dict = self.vlm_backbone(flat_images) - raw_img_emb = vision_feats_dict['image_embeds'] # [B*T, Vision_Dim] - - # 投影并还原时间维度 -> [B, T, Embed_Dim] - img_emb = self.img_projector(raw_img_emb) - img_emb = img_emb.view(B, T, -1) - - # 2. 状态编码 - state_emb = self.state_encoder(state) # [B, T, Embed_Dim] + # 1. Extract Features + # Shape: (B, Seq, Backbone_Dim) + features = self.backbone(batch['obs']) - # 3. 特征融合 (这里做一个简单的 Early Fusion 示例) - # 将图像特征和状态特征在特征维度拼接,或在时间维度拼接 - # 假设我们只用最近的一帧图像作为 Context,或者将所有历史特征作为 Context - # 这里演示:Context = (Image_History + State_History) - # [B, T, Embed] + [B, T, Embed] -> [B, 2*T, Embed] (Concat on time) - context = torch.cat([img_emb, state_emb], dim=1) + # 2. Project Features + # Shape: (B, Seq, Head_Dim) + embeddings = self.projector(features) + + # 3. Compute Action/Loss + # We pass actions if they exist (training mode) + actions = batch.get('actions', None) + outputs = self.head(embeddings=embeddings, actions=actions) + + return outputs + +# # roboimi/vla/agent.py + +# import torch +# import torch.nn as nn +# from typing import Optional, Dict, Union + +# class VLAAgent(nn.Module): +# def __init__(self, +# vlm_backbone: nn.Module, +# img_projector: nn.Module, +# action_head: nn.Module, +# state_dim: int, +# embed_dim: int): +# super().__init__() +# self.vlm_backbone = vlm_backbone +# self.img_projector = img_projector +# self.action_head = action_head - # 4. Action Head 分支 - if actions is not None: - # --- Training Mode --- - # 必须返回 Loss - return self.action_head.compute_loss(context, actions) - else: - # --- Inference Mode --- - # 必须返回预测的动作序列 - return self.action_head.predict_action(context) \ No newline at end of file +# # 简单的状态编码器 (通常不需要复杂的 config,直接写在这里即可) +# self.state_encoder = nn.Sequential( +# nn.Linear(state_dim, embed_dim), +# nn.Mish(), +# nn.Linear(embed_dim, embed_dim) +# ) + +# def forward(self, +# images: torch.Tensor, +# state: torch.Tensor, +# text: Optional[Union[str, list]] = None, +# actions: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Dict]: +# """ +# Args: +# images: [Batch, Obs_Horizon, C, H, W] 注意: 这里需要处理时间维度 +# state: [Batch, Obs_Horizon, State_Dim] +# text: Optional text instructions +# actions: [Batch, Pred_Horizon, Action_Dim] (Training only) + +# Returns: +# Training: Loss scalar +# Inference: Predicted actions +# """ + +# B, T, C, H, W = images.shape + +# # 1. 图像编码 (Flatten time dimension for efficiency) +# # [B*T, C, H, W] -> [B*T, Vision_Dim] +# flat_images = images.view(B * T, C, H, W) +# vision_feats_dict = self.vlm_backbone(flat_images) +# raw_img_emb = vision_feats_dict['image_embeds'] # [B*T, Vision_Dim] + +# # 投影并还原时间维度 -> [B, T, Embed_Dim] +# img_emb = self.img_projector(raw_img_emb) +# img_emb = img_emb.view(B, T, -1) + +# # 2. 状态编码 +# state_emb = self.state_encoder(state) # [B, T, Embed_Dim] + +# # 3. 特征融合 (这里做一个简单的 Early Fusion 示例) +# # 将图像特征和状态特征在特征维度拼接,或在时间维度拼接 +# # 假设我们只用最近的一帧图像作为 Context,或者将所有历史特征作为 Context +# # 这里演示:Context = (Image_History + State_History) +# # [B, T, Embed] + [B, T, Embed] -> [B, 2*T, Embed] (Concat on time) +# context = torch.cat([img_emb, state_emb], dim=1) + +# # 4. Action Head 分支 +# if actions is not None: +# # --- Training Mode --- +# # 必须返回 Loss +# return self.action_head.compute_loss(context, actions) +# else: +# # --- Inference Mode --- +# # 必须返回预测的动作序列 +# return self.action_head.predict_action(context) \ No newline at end of file diff --git a/roboimi/vla/conf/agent/debug_vla.yaml b/roboimi/vla/conf/agent/debug_vla.yaml new file mode 100644 index 0000000..f8962ab --- /dev/null +++ b/roboimi/vla/conf/agent/debug_vla.yaml @@ -0,0 +1,24 @@ +_target_: roboimi.vla.agent.VLAAgent + +# 1. Backbone Configuration +backbone: + _target_: roboimi.vla.models.backbones.debug.DebugBackbone + embed_dim: 768 # Variable A + seq_len: 10 + +# 2. Projector Configuration +projector: + _target_: roboimi.vla.models.projectors.mlp.MLPProjector + # Dependency Injection via Interpolation: + # Takes 'embed_dim' from the sibling 'backbone' config above. + input_dim: ${..backbone.embed_dim} + output_dim: 512 # Variable B (The bottleneck size) + +# 3. Head Configuration +head: + _target_: roboimi.vla.models.heads.debug.DebugHead + # Dependency Injection via Interpolation: + # Takes 'output_dim' from the sibling 'projector' config above. + input_dim: ${..projector.output_dim} + action_dim: 7 # (x,y,z, r,p,y, gripper) + chunk_size: 16 \ No newline at end of file diff --git a/roboimi/vla/conf/config.yaml b/roboimi/vla/conf/config.yaml index a203c26..4e993e2 100644 --- a/roboimi/vla/conf/config.yaml +++ b/roboimi/vla/conf/config.yaml @@ -1,12 +1,9 @@ defaults: - _self_ - - agent: default # 所有的子模块选择都在 agent/default.yaml 中完成了 - - data: default_dataset - - train: gpu + - agent: debug_vla # <--- This tells Hydra to look in conf/agent/ and load debug_vla.yaml + # Future expansions: + # - data: robomimic_hdf5 + # - train: standard -project_name: "vla_frame_refactored" -seed: 42 - -hydra: - run: - dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} \ No newline at end of file +# Global settings (optional for now) +seed: 42 \ No newline at end of file diff --git a/roboimi/vla/core/interfaces.py b/roboimi/vla/core/interfaces.py new file mode 100644 index 0000000..6c22139 --- /dev/null +++ b/roboimi/vla/core/interfaces.py @@ -0,0 +1,51 @@ +import abc +import torch +import torch.nn as nn +from typing import Dict, Any, Optional + +class VLABackbone(nn.Module, abc.ABC): + """ + Contract for Vision/Language Backbones. + Must return a feature tensor of shape (B, Seq, Embed_Dim). + """ + @abc.abstractmethod + def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor: + """ + Args: + obs: Dictionary containing 'image' and optionally 'text'. + Returns: + features: (B, S, D) embedding. + """ + pass + + @property + @abc.abstractmethod + def embed_dim(self) -> int: + pass + + +class VLAProjector(nn.Module, abc.ABC): + """ + Contract for the adaptation layer (Projector). + Connects Backbone features to the Policy Head. + """ + @abc.abstractmethod + def forward(self, x: torch.Tensor) -> torch.Tensor: + pass + + +class VLAHead(nn.Module, abc.ABC): + """ + Contract for Action Generation Heads (Policies). + Handles both training (loss calculation) and inference (action generation). + """ + @abc.abstractmethod + def forward(self, embeddings: torch.Tensor, actions: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]: + """ + Args: + embeddings: (B, S, Hidden) from Projector. + actions: (B, Pred_Horizon, Action_Dim) - Ground truth for training. + Returns: + Dict containing 'loss' (if actions provided) or 'pred_actions'. + """ + pass \ No newline at end of file diff --git a/roboimi/vla/models/__init__.py b/roboimi/vla/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/roboimi/vla/models/backbones/__init__.py b/roboimi/vla/models/backbones/__init__.py index b28dec3..89c86b2 100644 --- a/roboimi/vla/models/backbones/__init__.py +++ b/roboimi/vla/models/backbones/__init__.py @@ -1,6 +1,8 @@ # Backbone models -from .siglip import SigLIPBackbone -from .clip import CLIPBackbone -from .dinov2 import DinoV2Backbone +# Uncomment when these are implemented: +# from .siglip import SigLIPBackbone +# from .clip import CLIPBackbone +# from .dinov2 import DinoV2Backbone +from .debug import DebugBackbone -__all__ = ["SigLIPBackbone", "CLIPBackbone", "DinoV2Backbone"] +__all__ = ["DebugBackbone"] diff --git a/roboimi/vla/models/backbones/debug.py b/roboimi/vla/models/backbones/debug.py new file mode 100644 index 0000000..4c85b98 --- /dev/null +++ b/roboimi/vla/models/backbones/debug.py @@ -0,0 +1,30 @@ +import torch +import torch.nn as nn +from typing import Dict +from roboimi.vla.core.interfaces import VLABackbone + +class DebugBackbone(VLABackbone): + """ + A fake backbone that outputs random tensors. + """ + def __init__(self, embed_dim: int = 768, seq_len: int = 10): + super().__init__() + self._embed_dim = embed_dim + self.seq_len = seq_len + # A dummy trainable parameter + self.dummy_param = nn.Parameter(torch.zeros(1)) + + def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor: + batch_size = obs['image'].shape[0] + + # 1. Generate random noise + noise = torch.randn(batch_size, self.seq_len, self._embed_dim, device=obs['image'].device) + + # 2. CRITICAL FIX: Add the dummy parameter to the noise. + # This connects 'noise' to 'self.dummy_param' in the computation graph. + # The value doesn't change (since param is 0), but the gradient path is established. + return noise + self.dummy_param + + @property + def embed_dim(self) -> int: + return self._embed_dim \ No newline at end of file diff --git a/roboimi/vla/models/heads/__init__.py b/roboimi/vla/models/heads/__init__.py index 9de0395..5fb9af2 100644 --- a/roboimi/vla/models/heads/__init__.py +++ b/roboimi/vla/models/heads/__init__.py @@ -1,5 +1,9 @@ -# Action Head models -from .diffusion import DiffusionActionHead -from .act import ACTHead +# # Action Head models +# from .diffusion import DiffusionActionHead +# from .act import ACTHead -__all__ = ["DiffusionActionHead", "ACTHead"] +# __all__ = ["DiffusionActionHead", "ACTHead"] + +from .debug import DebugHead + +__all__ = ["DebugHead"] \ No newline at end of file diff --git a/roboimi/vla/models/heads/debug.py b/roboimi/vla/models/heads/debug.py new file mode 100644 index 0000000..49f0924 --- /dev/null +++ b/roboimi/vla/models/heads/debug.py @@ -0,0 +1,33 @@ +import torch +import torch.nn as nn +from typing import Dict, Optional +from roboimi.vla.core.interfaces import VLAHead + +class DebugHead(VLAHead): + """ + A fake Action Head using MSE Loss. + Replaces complex Diffusion/ACT policies for architecture verification. + """ + def __init__(self, input_dim: int, action_dim: int, chunk_size: int = 16): + super().__init__() + # Simple regression from embedding -> action chunk + self.regressor = nn.Linear(input_dim, chunk_size * action_dim) + self.action_dim = action_dim + self.chunk_size = chunk_size + self.loss_fn = nn.MSELoss() + + def forward(self, embeddings: torch.Tensor, actions: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]: + # Simple pooling over sequence dimension to get (B, Hidden) + pooled_embed = embeddings.mean(dim=1) + + # Predict actions: (B, Chunk * Act_Dim) -> (B, Chunk, Act_Dim) + pred_flat = self.regressor(pooled_embed) + pred_actions = pred_flat.view(-1, self.chunk_size, self.action_dim) + + output = {"pred_actions": pred_actions} + + if actions is not None: + # Calculate MSE Loss against ground truth + output["loss"] = self.loss_fn(pred_actions, actions) + + return output \ No newline at end of file diff --git a/roboimi/vla/models/projectors/__init__.py b/roboimi/vla/models/projectors/__init__.py index 14ca3df..1d0ccb1 100644 --- a/roboimi/vla/models/projectors/__init__.py +++ b/roboimi/vla/models/projectors/__init__.py @@ -1,5 +1,9 @@ # Projector models -from .mlp import MLPProjector -from .perceiver import PerceiverResampler +# from .mlp import MLPProjector +# from .perceiver import PerceiverResampler -__all__ = ["MLPProjector", "PerceiverResampler"] \ No newline at end of file +# __all__ = ["MLPProjector", "PerceiverResampler"] + +from .mlp import MLPProjector + +__all__ = ["MLPProjector"] \ No newline at end of file diff --git a/roboimi/vla/models/projectors/mlp.py b/roboimi/vla/models/projectors/mlp.py index 0e7f7de..03655e0 100644 --- a/roboimi/vla/models/projectors/mlp.py +++ b/roboimi/vla/models/projectors/mlp.py @@ -1 +1,19 @@ -# MLP Projector 实现 +import torch +import torch.nn as nn +from roboimi.vla.core.interfaces import VLAProjector + +class MLPProjector(VLAProjector): + """ + A simple Linear Projection layer. + First-class citizen: Adapts Backbone dim -> Head dim. + """ + def __init__(self, input_dim: int, output_dim: int): + super().__init__() + self.net = nn.Sequential( + nn.Linear(input_dim, output_dim), + nn.GELU(), + nn.Linear(output_dim, output_dim) + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.net(x) \ No newline at end of file diff --git a/roboimi/vla/scripts/convert_to_hdf5.py b/roboimi/vla/scripts/convert_to_hdf5.py deleted file mode 100644 index 4db4a47..0000000 --- a/roboimi/vla/scripts/convert_to_hdf5.py +++ /dev/null @@ -1 +0,0 @@ -# 将图片文件夹转为 HDF5 格式 diff --git a/roboimi/vla/scripts/verify_arch.py b/roboimi/vla/scripts/verify_arch.py new file mode 100644 index 0000000..84c5984 --- /dev/null +++ b/roboimi/vla/scripts/verify_arch.py @@ -0,0 +1,58 @@ +import hydra +import torch +from omegaconf import DictConfig, OmegaConf +from roboimi.vla.agent import VLAAgent + +@hydra.main(version_base=None, config_path="../conf", config_name="config") +def main(cfg: DictConfig): + print(">>> Initializing VLA Agent (Skeleton Phase)...") + # For this test, we override the default agent with our debug config + # In a real run, this would be set via command line or defaults list + from hydra.utils import instantiate + + # Instantiate the agent using the debug configuration + # Assuming 'agent' is a key in your root config.yaml that points to debug_vla + # If testing isolated, we instantiate the structure directly. + agent: VLAAgent = instantiate(cfg.agent) + + print(f"✅ Agent assembled: {type(agent).__name__}") + print(f" - Backbone: {type(agent.backbone).__name__}") + print(f" - Projector: {type(agent.projector).__name__}") + print(f" - Head: {type(agent.head).__name__}") + + # Mock Data + batch_size = 2 + dummy_obs = { + 'image': torch.randn(batch_size, 3, 224, 224), + 'text': ["pick up apple"] * batch_size + } + dummy_actions = torch.randn(batch_size, 16, 7) # (B, Chunk, Act_Dim) + + batch = { + 'obs': dummy_obs, + 'actions': dummy_actions + } + + # Forward Pass + print("\n>>> Running Forward Pass...") + outputs = agent(batch) + + loss = outputs['loss'] + print(f"✅ Forward successful. Loss: {loss.item():.4f}") + + # Backward Pass (Check Autograd Graph) + print("\n>>> Running Backward Pass...") + loss.backward() + + # Verify gradients exist in the backbone (proving the chain is intact) + # Note: DebugBackbone needs a dummy parameter to show grad + backbone_has_grad = agent.backbone.dummy_param.grad is not None or \ + any(p.grad is not None for p in agent.backbone.parameters()) + + if backbone_has_grad: + print("✅ Backward successful. Gradients reached Backbone.") + else: + print("❌ Warning: No gradients found in Backbone.") + +if __name__ == "__main__": + main() \ No newline at end of file