debug: core-skeleton pseudo-implementation (核心骨架伪实现)

This commit is contained in:
gouhanke
2026-02-03 16:14:54 +08:00
parent d3863ea1dd
commit bd8bbb0cfc
15 changed files with 348 additions and 85 deletions

2
.gitignore vendored
View File

@@ -124,3 +124,5 @@ GEMINI.md
# Copilot # Copilot
.github/copilot-instructions.md .github/copilot-instructions.md
.hydra/

0
roboimi/__init__.py Normal file
View File

View File

@@ -1,73 +1,114 @@
# roboimi/vla/agent.py
import torch import torch
import torch.nn as nn import torch.nn as nn
from typing import Optional, Dict, Union from typing import Dict, Optional, Any
from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead
class VLAAgent(nn.Module): class VLAAgent(nn.Module):
def __init__(self, """
vlm_backbone: nn.Module, The main assembly class.
img_projector: nn.Module, Flow: Obs -> Backbone -> Projector -> Head -> Action/Loss
action_head: nn.Module, """
state_dim: int, def __init__(
embed_dim: int): self,
backbone: VLABackbone,
projector: VLAProjector,
head: VLAHead
):
super().__init__() super().__init__()
self.vlm_backbone = vlm_backbone self.backbone = backbone
self.img_projector = img_projector self.projector = projector
self.action_head = action_head self.head = head
# 简单的状态编码器 (通常不需要复杂的 config直接写在这里即可) def forward(self, batch: Dict[str, Any]) -> Dict[str, torch.Tensor]:
self.state_encoder = nn.Sequential(
nn.Linear(state_dim, embed_dim),
nn.Mish(),
nn.Linear(embed_dim, embed_dim)
)
def forward(self,
images: torch.Tensor,
state: torch.Tensor,
text: Optional[Union[str, list]] = None,
actions: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Dict]:
""" """
Args: Args:
images: [Batch, Obs_Horizon, C, H, W] 注意: 这里需要处理时间维度 batch: Dict containing 'obs' (image/text) and 'actions' (ground truth)
state: [Batch, Obs_Horizon, State_Dim]
text: Optional text instructions
actions: [Batch, Pred_Horizon, Action_Dim] (Training only)
Returns:
Training: Loss scalar
Inference: Predicted actions
""" """
# 1. Extract Features
# Shape: (B, Seq, Backbone_Dim)
features = self.backbone(batch['obs'])
B, T, C, H, W = images.shape # 2. Project Features
# Shape: (B, Seq, Head_Dim)
embeddings = self.projector(features)
# 1. 图像编码 (Flatten time dimension for efficiency) # 3. Compute Action/Loss
# [B*T, C, H, W] -> [B*T, Vision_Dim] # We pass actions if they exist (training mode)
flat_images = images.view(B * T, C, H, W) actions = batch.get('actions', None)
vision_feats_dict = self.vlm_backbone(flat_images) outputs = self.head(embeddings=embeddings, actions=actions)
raw_img_emb = vision_feats_dict['image_embeds'] # [B*T, Vision_Dim]
# 投影并还原时间维度 -> [B, T, Embed_Dim] return outputs
img_emb = self.img_projector(raw_img_emb)
img_emb = img_emb.view(B, T, -1)
# 2. 状态编码 # # roboimi/vla/agent.py
state_emb = self.state_encoder(state) # [B, T, Embed_Dim]
# 3. 特征融合 (这里做一个简单的 Early Fusion 示例) # import torch
# 将图像特征和状态特征在特征维度拼接,或在时间维度拼接 # import torch.nn as nn
# 假设我们只用最近的一帧图像作为 Context或者将所有历史特征作为 Context # from typing import Optional, Dict, Union
# 这里演示Context = (Image_History + State_History)
# [B, T, Embed] + [B, T, Embed] -> [B, 2*T, Embed] (Concat on time)
context = torch.cat([img_emb, state_emb], dim=1)
# 4. Action Head 分支 # class VLAAgent(nn.Module):
if actions is not None: # def __init__(self,
# --- Training Mode --- # vlm_backbone: nn.Module,
# 必须返回 Loss # img_projector: nn.Module,
return self.action_head.compute_loss(context, actions) # action_head: nn.Module,
else: # state_dim: int,
# --- Inference Mode --- # embed_dim: int):
# 必须返回预测的动作序列 # super().__init__()
return self.action_head.predict_action(context) # self.vlm_backbone = vlm_backbone
# self.img_projector = img_projector
# self.action_head = action_head
# # 简单的状态编码器 (通常不需要复杂的 config直接写在这里即可)
# self.state_encoder = nn.Sequential(
# nn.Linear(state_dim, embed_dim),
# nn.Mish(),
# nn.Linear(embed_dim, embed_dim)
# )
# def forward(self,
# images: torch.Tensor,
# state: torch.Tensor,
# text: Optional[Union[str, list]] = None,
# actions: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Dict]:
# """
# Args:
# images: [Batch, Obs_Horizon, C, H, W] 注意: 这里需要处理时间维度
# state: [Batch, Obs_Horizon, State_Dim]
# text: Optional text instructions
# actions: [Batch, Pred_Horizon, Action_Dim] (Training only)
# Returns:
# Training: Loss scalar
# Inference: Predicted actions
# """
# B, T, C, H, W = images.shape
# # 1. 图像编码 (Flatten time dimension for efficiency)
# # [B*T, C, H, W] -> [B*T, Vision_Dim]
# flat_images = images.view(B * T, C, H, W)
# vision_feats_dict = self.vlm_backbone(flat_images)
# raw_img_emb = vision_feats_dict['image_embeds'] # [B*T, Vision_Dim]
# # 投影并还原时间维度 -> [B, T, Embed_Dim]
# img_emb = self.img_projector(raw_img_emb)
# img_emb = img_emb.view(B, T, -1)
# # 2. 状态编码
# state_emb = self.state_encoder(state) # [B, T, Embed_Dim]
# # 3. 特征融合 (这里做一个简单的 Early Fusion 示例)
# # 将图像特征和状态特征在特征维度拼接,或在时间维度拼接
# # 假设我们只用最近的一帧图像作为 Context或者将所有历史特征作为 Context
# # 这里演示Context = (Image_History + State_History)
# # [B, T, Embed] + [B, T, Embed] -> [B, 2*T, Embed] (Concat on time)
# context = torch.cat([img_emb, state_emb], dim=1)
# # 4. Action Head 分支
# if actions is not None:
# # --- Training Mode ---
# # 必须返回 Loss
# return self.action_head.compute_loss(context, actions)
# else:
# # --- Inference Mode ---
# # 必须返回预测的动作序列
# return self.action_head.predict_action(context)

View File

@@ -0,0 +1,24 @@
# Debug agent assembly for smoke-testing the skeleton:
# wires stub Backbone -> Projector -> Head into a full VLAAgent
# so the forward/backward path can be verified end to end.
_target_: roboimi.vla.agent.VLAAgent
# 1. Backbone Configuration
backbone:
  _target_: roboimi.vla.models.backbones.debug.DebugBackbone
  embed_dim: 768 # Variable A
  seq_len: 10
# 2. Projector Configuration
projector:
  _target_: roboimi.vla.models.projectors.mlp.MLPProjector
  # Dependency Injection via Interpolation:
  # Takes 'embed_dim' from the sibling 'backbone' config above.
  input_dim: ${..backbone.embed_dim}
  output_dim: 512 # Variable B (The bottleneck size)
# 3. Head Configuration
head:
  _target_: roboimi.vla.models.heads.debug.DebugHead
  # Dependency Injection via Interpolation:
  # Takes 'output_dim' from the sibling 'projector' config above.
  input_dim: ${..projector.output_dim}
  action_dim: 7 # (x,y,z, r,p,y, gripper)
  # Number of future action steps predicted per forward pass.
  chunk_size: 16

View File

@@ -1,12 +1,9 @@
defaults: defaults:
- _self_ - _self_
- agent: default # 所有的子模块选择都在 agent/default.yaml 中完成了 - agent: debug_vla # <--- This tells Hydra to look in conf/agent/ and load debug_vla.yaml
- data: default_dataset # Future expansions:
- train: gpu # - data: robomimic_hdf5
# - train: standard
project_name: "vla_frame_refactored" # Global settings (optional for now)
seed: 42 seed: 42
hydra:
run:
dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}

View File

@@ -0,0 +1,51 @@
import abc
import torch
import torch.nn as nn
from typing import Dict, Any, Optional
class VLABackbone(nn.Module, abc.ABC):
    """
    Interface every vision/language backbone must satisfy.

    Implementations consume an observation dict and emit a sequence of
    embeddings shaped (B, Seq, Embed_Dim).
    """

    @abc.abstractmethod
    def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor:
        """
        Args:
            obs: Observation dict containing 'image' and, optionally, 'text'.

        Returns:
            A (B, S, D) feature tensor.
        """
        ...

    @property
    @abc.abstractmethod
    def embed_dim(self) -> int:
        """Width D of the embeddings this backbone produces."""
        ...
class VLAProjector(nn.Module, abc.ABC):
    """
    Interface for the adapter between backbone and policy head.

    Maps backbone features into the embedding space the head expects.
    """

    @abc.abstractmethod
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project feature tensor ``x``; shape contract set by subclasses."""
        ...
class VLAHead(nn.Module, abc.ABC):
    """
    Interface for action-generation heads (policies).

    A single forward covers both modes: training (ground-truth actions are
    supplied, a loss is returned) and inference (actions are predicted).
    """

    @abc.abstractmethod
    def forward(self, embeddings: torch.Tensor, actions: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        """
        Args:
            embeddings: (B, S, Hidden) features coming from the projector.
            actions: Optional (B, Pred_Horizon, Action_Dim) ground truth
                used for loss computation during training.

        Returns:
            Dict containing 'loss' (if actions provided) or 'pred_actions'.
        """
        ...

View File

View File

@@ -1,6 +1,8 @@
# Backbone models # Backbone models
from .siglip import SigLIPBackbone # Uncomment when these are implemented:
from .clip import CLIPBackbone # from .siglip import SigLIPBackbone
from .dinov2 import DinoV2Backbone # from .clip import CLIPBackbone
# from .dinov2 import DinoV2Backbone
from .debug import DebugBackbone
__all__ = ["SigLIPBackbone", "CLIPBackbone", "DinoV2Backbone"] __all__ = ["DebugBackbone"]

View File

@@ -0,0 +1,30 @@
import torch
import torch.nn as nn
from typing import Dict
from roboimi.vla.core.interfaces import VLABackbone
class DebugBackbone(VLABackbone):
    """
    Stub backbone emitting random features for pipeline smoke tests.

    Produces a (B, seq_len, embed_dim) tensor of Gaussian noise routed
    through a single zero-valued parameter so gradients can flow back.
    """

    def __init__(self, embed_dim: int = 768, seq_len: int = 10):
        super().__init__()
        self._embed_dim = embed_dim
        self.seq_len = seq_len
        # Zero-initialized learnable scalar; its only job is to anchor the
        # output in the autograd graph so loss.backward() reaches this module.
        self.dummy_param = nn.Parameter(torch.zeros(1))

    def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor:
        images = obs['image']
        fake_features = torch.randn(
            images.shape[0], self.seq_len, self._embed_dim, device=images.device
        )
        # Adding the (zero) parameter leaves values untouched but wires the
        # output to a trainable leaf, establishing a gradient path.
        return fake_features + self.dummy_param

    @property
    def embed_dim(self) -> int:
        return self._embed_dim

View File

@@ -1,5 +1,9 @@
# Action Head models # # Action Head models
from .diffusion import DiffusionActionHead # from .diffusion import DiffusionActionHead
from .act import ACTHead # from .act import ACTHead
__all__ = ["DiffusionActionHead", "ACTHead"] # __all__ = ["DiffusionActionHead", "ACTHead"]
from .debug import DebugHead
__all__ = ["DebugHead"]

View File

@@ -0,0 +1,33 @@
import torch
import torch.nn as nn
from typing import Dict, Optional
from roboimi.vla.core.interfaces import VLAHead
class DebugHead(VLAHead):
    """
    Minimal MSE-regression head for architecture verification.

    Mean-pools the sequence dimension and linearly maps the pooled
    embedding to a flat action chunk; stands in for Diffusion/ACT heads.
    """

    def __init__(self, input_dim: int, action_dim: int, chunk_size: int = 16):
        super().__init__()
        # Single linear layer regresses embedding -> chunk_size * action_dim.
        self.regressor = nn.Linear(input_dim, chunk_size * action_dim)
        self.action_dim = action_dim
        self.chunk_size = chunk_size
        self.loss_fn = nn.MSELoss()

    def forward(self, embeddings: torch.Tensor, actions: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        # (B, S, H) -> (B, H): average over the sequence axis.
        pooled = embeddings.mean(dim=1)
        # (B, H) -> (B, chunk_size * action_dim) -> (B, chunk_size, action_dim)
        flat_prediction = self.regressor(pooled)
        chunked = flat_prediction.view(-1, self.chunk_size, self.action_dim)
        result = {"pred_actions": chunked}
        if actions is not None:
            # Training mode: report MSE against the ground-truth chunk.
            result["loss"] = self.loss_fn(chunked, actions)
        return result

View File

@@ -1,5 +1,9 @@
# Projector models # Projector models
from .mlp import MLPProjector # from .mlp import MLPProjector
from .perceiver import PerceiverResampler # from .perceiver import PerceiverResampler
__all__ = ["MLPProjector", "PerceiverResampler"] # __all__ = ["MLPProjector", "PerceiverResampler"]
from .mlp import MLPProjector
__all__ = ["MLPProjector"]

View File

@@ -1 +1,19 @@
# MLP Projector 实现 import torch
import torch.nn as nn
from roboimi.vla.core.interfaces import VLAProjector
class MLPProjector(VLAProjector):
    """
    Two-layer MLP adapter from backbone width to head width.

    Pipeline: Linear(input_dim -> output_dim) -> GELU ->
    Linear(output_dim -> output_dim).
    """

    def __init__(self, input_dim: int, output_dim: int):
        super().__init__()
        layers = [
            nn.Linear(input_dim, output_dim),
            nn.GELU(),
            nn.Linear(output_dim, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)

View File

@@ -1 +0,0 @@
# 将图片文件夹转为 HDF5 格式

View File

@@ -0,0 +1,58 @@
import hydra
import torch
from omegaconf import DictConfig, OmegaConf
from roboimi.vla.agent import VLAAgent
@hydra.main(version_base=None, config_path="../conf", config_name="config")
def main(cfg: DictConfig):
    """Smoke-test the VLAAgent skeleton: build it from the Hydra config,
    run one forward/backward pass on mock data, and confirm gradients
    reach the backbone."""
    print(">>> Initializing VLA Agent (Skeleton Phase)...")

    # Imported lazily so Hydra has already configured the runtime.
    from hydra.utils import instantiate

    # Build the full backbone/projector/head graph from cfg.agent
    # (pointed at the debug configuration via the defaults list).
    agent: VLAAgent = instantiate(cfg.agent)
    print(f"✅ Agent assembled: {type(agent).__name__}")
    print(f" - Backbone: {type(agent.backbone).__name__}")
    print(f" - Projector: {type(agent.projector).__name__}")
    print(f" - Head: {type(agent.head).__name__}")

    # Mock Data — shapes assumed by the debug modules:
    # image (B, 3, 224, 224), actions (B, Chunk, Act_Dim).
    n_samples = 2
    fake_obs = {
        'image': torch.randn(n_samples, 3, 224, 224),
        'text': ["pick up apple"] * n_samples
    }
    batch = {
        'obs': fake_obs,
        'actions': torch.randn(n_samples, 16, 7)  # (B, Chunk, Act_Dim)
    }

    # Forward Pass — 'actions' present, so the head returns a loss.
    print("\n>>> Running Forward Pass...")
    outputs = agent(batch)
    loss = outputs['loss']
    print(f"✅ Forward successful. Loss: {loss.item():.4f}")

    # Backward Pass — verifies the autograd chain is unbroken.
    print("\n>>> Running Backward Pass...")
    loss.backward()
    # DebugBackbone carries a dummy parameter precisely so a gradient
    # can show up here; fall back to scanning all backbone parameters.
    grads_reached_backbone = agent.backbone.dummy_param.grad is not None or any(
        p.grad is not None for p in agent.backbone.parameters()
    )
    if grads_reached_backbone:
        print("✅ Backward successful. Gradients reached Backbone.")
    else:
        print("❌ Warning: No gradients found in Backbone.")


if __name__ == "__main__":
    main()