debug: 核心骨架伪实现

This commit is contained in:
gouhanke
2026-02-03 16:14:54 +08:00
parent d3863ea1dd
commit bd8bbb0cfc
15 changed files with 348 additions and 85 deletions

2
.gitignore vendored
View File

@@ -124,3 +124,5 @@ GEMINI.md
# Copilot
.github/copilot-instructions.md
.hydra/

0
roboimi/__init__.py Normal file
View File

View File

@@ -1,73 +1,114 @@
# roboimi/vla/agent.py
import torch
import torch.nn as nn
from typing import Optional, Dict, Union
from typing import Dict, Optional, Any
from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead
class VLAAgent(nn.Module):
def __init__(self,
vlm_backbone: nn.Module,
img_projector: nn.Module,
action_head: nn.Module,
state_dim: int,
embed_dim: int):
"""
The main assembly class.
Flow: Obs -> Backbone -> Projector -> Head -> Action/Loss
"""
def __init__(
self,
backbone: VLABackbone,
projector: VLAProjector,
head: VLAHead
):
super().__init__()
self.vlm_backbone = vlm_backbone
self.img_projector = img_projector
self.action_head = action_head
self.backbone = backbone
self.projector = projector
self.head = head
# 简单的状态编码器 (通常不需要复杂的 config直接写在这里即可)
self.state_encoder = nn.Sequential(
nn.Linear(state_dim, embed_dim),
nn.Mish(),
nn.Linear(embed_dim, embed_dim)
)
def forward(self,
images: torch.Tensor,
state: torch.Tensor,
text: Optional[Union[str, list]] = None,
actions: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Dict]:
def forward(self, batch: Dict[str, Any]) -> Dict[str, torch.Tensor]:
"""
Args:
images: [Batch, Obs_Horizon, C, H, W] 注意: 这里需要处理时间维度
state: [Batch, Obs_Horizon, State_Dim]
text: Optional text instructions
actions: [Batch, Pred_Horizon, Action_Dim] (Training only)
Returns:
Training: Loss scalar
Inference: Predicted actions
batch: Dict containing 'obs' (image/text) and 'actions' (ground truth)
"""
# 1. Extract Features
# Shape: (B, Seq, Backbone_Dim)
features = self.backbone(batch['obs'])
B, T, C, H, W = images.shape
# 2. Project Features
# Shape: (B, Seq, Head_Dim)
embeddings = self.projector(features)
# 1. 图像编码 (Flatten time dimension for efficiency)
# [B*T, C, H, W] -> [B*T, Vision_Dim]
flat_images = images.view(B * T, C, H, W)
vision_feats_dict = self.vlm_backbone(flat_images)
raw_img_emb = vision_feats_dict['image_embeds'] # [B*T, Vision_Dim]
# 3. Compute Action/Loss
# We pass actions if they exist (training mode)
actions = batch.get('actions', None)
outputs = self.head(embeddings=embeddings, actions=actions)
# 投影并还原时间维度 -> [B, T, Embed_Dim]
img_emb = self.img_projector(raw_img_emb)
img_emb = img_emb.view(B, T, -1)
return outputs
# 2. 状态编码
state_emb = self.state_encoder(state) # [B, T, Embed_Dim]
# # roboimi/vla/agent.py
# 3. 特征融合 (这里做一个简单的 Early Fusion 示例)
# 将图像特征和状态特征在特征维度拼接,或在时间维度拼接
# 假设我们只用最近的一帧图像作为 Context或者将所有历史特征作为 Context
# 这里演示Context = (Image_History + State_History)
# [B, T, Embed] + [B, T, Embed] -> [B, 2*T, Embed] (Concat on time)
context = torch.cat([img_emb, state_emb], dim=1)
# import torch
# import torch.nn as nn
# from typing import Optional, Dict, Union
# 4. Action Head 分支
if actions is not None:
# --- Training Mode ---
# 必须返回 Loss
return self.action_head.compute_loss(context, actions)
else:
# --- Inference Mode ---
# 必须返回预测的动作序列
return self.action_head.predict_action(context)
# class VLAAgent(nn.Module):
# def __init__(self,
# vlm_backbone: nn.Module,
# img_projector: nn.Module,
# action_head: nn.Module,
# state_dim: int,
# embed_dim: int):
# super().__init__()
# self.vlm_backbone = vlm_backbone
# self.img_projector = img_projector
# self.action_head = action_head
# # 简单的状态编码器 (通常不需要复杂的 config直接写在这里即可)
# self.state_encoder = nn.Sequential(
# nn.Linear(state_dim, embed_dim),
# nn.Mish(),
# nn.Linear(embed_dim, embed_dim)
# )
# def forward(self,
# images: torch.Tensor,
# state: torch.Tensor,
# text: Optional[Union[str, list]] = None,
# actions: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Dict]:
# """
# Args:
# images: [Batch, Obs_Horizon, C, H, W] 注意: 这里需要处理时间维度
# state: [Batch, Obs_Horizon, State_Dim]
# text: Optional text instructions
# actions: [Batch, Pred_Horizon, Action_Dim] (Training only)
# Returns:
# Training: Loss scalar
# Inference: Predicted actions
# """
# B, T, C, H, W = images.shape
# # 1. 图像编码 (Flatten time dimension for efficiency)
# # [B*T, C, H, W] -> [B*T, Vision_Dim]
# flat_images = images.view(B * T, C, H, W)
# vision_feats_dict = self.vlm_backbone(flat_images)
# raw_img_emb = vision_feats_dict['image_embeds'] # [B*T, Vision_Dim]
# # 投影并还原时间维度 -> [B, T, Embed_Dim]
# img_emb = self.img_projector(raw_img_emb)
# img_emb = img_emb.view(B, T, -1)
# # 2. 状态编码
# state_emb = self.state_encoder(state) # [B, T, Embed_Dim]
# # 3. 特征融合 (这里做一个简单的 Early Fusion 示例)
# # 将图像特征和状态特征在特征维度拼接,或在时间维度拼接
# # 假设我们只用最近的一帧图像作为 Context或者将所有历史特征作为 Context
# # 这里演示Context = (Image_History + State_History)
# # [B, T, Embed] + [B, T, Embed] -> [B, 2*T, Embed] (Concat on time)
# context = torch.cat([img_emb, state_emb], dim=1)
# # 4. Action Head 分支
# if actions is not None:
# # --- Training Mode ---
# # 必须返回 Loss
# return self.action_head.compute_loss(context, actions)
# else:
# # --- Inference Mode ---
# # 必须返回预测的动作序列
# return self.action_head.predict_action(context)

View File

@@ -0,0 +1,24 @@
# Debug agent configuration: assembles the skeleton VLAAgent from three
# swappable parts (backbone -> projector -> head) via Hydra _target_ keys.
_target_: roboimi.vla.agent.VLAAgent

# 1. Backbone Configuration (fake feature extractor for wiring tests)
backbone:
  _target_: roboimi.vla.models.backbones.debug.DebugBackbone
  embed_dim: 768  # Variable A: width of backbone features
  seq_len: 10

# 2. Projector Configuration (adapts backbone width to head width)
projector:
  _target_: roboimi.vla.models.projectors.mlp.MLPProjector
  # Dependency Injection via Interpolation:
  # Takes 'embed_dim' from the sibling 'backbone' config above.
  input_dim: ${..backbone.embed_dim}
  output_dim: 512  # Variable B (The bottleneck size)

# 3. Head Configuration (MSE regression head for debugging)
head:
  _target_: roboimi.vla.models.heads.debug.DebugHead
  # Dependency Injection via Interpolation:
  # Takes 'output_dim' from the sibling 'projector' config above.
  input_dim: ${..projector.output_dim}
  action_dim: 7  # (x,y,z, r,p,y, gripper)
  chunk_size: 16

View File

@@ -1,12 +1,9 @@
defaults:
- _self_
- agent: default # 所有的子模块选择都在 agent/default.yaml 中完成了
- data: default_dataset
- train: gpu
- agent: debug_vla # <--- This tells Hydra to look in conf/agent/ and load debug_vla.yaml
# Future expansions:
# - data: robomimic_hdf5
# - train: standard
project_name: "vla_frame_refactored"
# Global settings (optional for now)
seed: 42
hydra:
run:
dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}

View File

@@ -0,0 +1,51 @@
import abc
import torch
import torch.nn as nn
from typing import Dict, Any, Optional
class VLABackbone(nn.Module, abc.ABC):
    """
    Abstract contract for vision/language backbones.

    Implementations consume an observation dict and emit a sequence of
    embeddings shaped (Batch, Seq, Embed_Dim).
    """

    @abc.abstractmethod
    def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor:
        """
        Args:
            obs: Observation dict with an 'image' entry and, optionally, 'text'.

        Returns:
            A (B, S, D) feature tensor.
        """
        ...

    @property
    @abc.abstractmethod
    def embed_dim(self) -> int:
        """Width D of the embeddings produced by forward()."""
        ...
class VLAProjector(nn.Module, abc.ABC):
    """
    Abstract contract for the adaptation layer that connects backbone
    features to the policy head's input space.
    """

    @abc.abstractmethod
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map a (B, S, D_in) feature tensor to the head's width."""
        ...
class VLAHead(nn.Module, abc.ABC):
    """
    Abstract contract for action-generation heads (policies).

    A head serves both phases of the pipeline: it computes a loss when
    ground-truth actions are supplied (training) and produces predicted
    actions otherwise (inference).
    """

    @abc.abstractmethod
    def forward(
        self,
        embeddings: torch.Tensor,
        actions: Optional[torch.Tensor] = None,
    ) -> Dict[str, torch.Tensor]:
        """
        Args:
            embeddings: (B, S, Hidden) features coming from the projector.
            actions: Optional (B, Pred_Horizon, Action_Dim) ground truth,
                supplied only during training.

        Returns:
            Dict containing 'loss' (if actions provided) or 'pred_actions'.
        """
        ...

View File

View File

@@ -1,6 +1,8 @@
# Backbone models
from .siglip import SigLIPBackbone
from .clip import CLIPBackbone
from .dinov2 import DinoV2Backbone
# Uncomment when these are implemented:
# from .siglip import SigLIPBackbone
# from .clip import CLIPBackbone
# from .dinov2 import DinoV2Backbone
from .debug import DebugBackbone
__all__ = ["SigLIPBackbone", "CLIPBackbone", "DinoV2Backbone"]
__all__ = ["DebugBackbone"]

View File

@@ -0,0 +1,30 @@
import torch
import torch.nn as nn
from typing import Dict
from roboimi.vla.core.interfaces import VLABackbone
class DebugBackbone(VLABackbone):
    """
    Stand-in backbone that emits random feature tensors.

    Exists purely to verify the agent wiring end-to-end before a real
    vision/language model is plugged in.
    """

    def __init__(self, embed_dim: int = 768, seq_len: int = 10):
        super().__init__()
        self._embed_dim = embed_dim
        self.seq_len = seq_len
        # Single trainable scalar so this module participates in autograd.
        self.dummy_param = nn.Parameter(torch.zeros(1))

    def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor:
        images = obs['image']
        # Fake (B, S, D) features on the same device as the input images.
        fake_feats = torch.randn(
            images.shape[0], self.seq_len, self._embed_dim, device=images.device
        )
        # Adding the zero-valued parameter leaves the values unchanged but
        # threads dummy_param into the computation graph, establishing a
        # gradient path from downstream losses back into this module.
        return fake_feats + self.dummy_param

    @property
    def embed_dim(self) -> int:
        return self._embed_dim

View File

@@ -1,5 +1,9 @@
# Action Head models
from .diffusion import DiffusionActionHead
from .act import ACTHead
# # Action Head models
# from .diffusion import DiffusionActionHead
# from .act import ACTHead
__all__ = ["DiffusionActionHead", "ACTHead"]
# __all__ = ["DiffusionActionHead", "ACTHead"]
from .debug import DebugHead
__all__ = ["DebugHead"]

View File

@@ -0,0 +1,33 @@
import torch
import torch.nn as nn
from typing import Dict, Optional
from roboimi.vla.core.interfaces import VLAHead
class DebugHead(VLAHead):
    """
    Minimal regression head used to validate the training pipeline.

    Stands in for heavier policies (Diffusion / ACT): one linear layer maps
    a pooled embedding to a fixed-length action chunk, trained with MSE.
    """

    def __init__(self, input_dim: int, action_dim: int, chunk_size: int = 16):
        super().__init__()
        self.action_dim = action_dim
        self.chunk_size = chunk_size
        # Single linear map: pooled embedding -> flattened action chunk.
        self.regressor = nn.Linear(input_dim, chunk_size * action_dim)
        self.loss_fn = nn.MSELoss()

    def forward(self, embeddings: torch.Tensor, actions: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        """
        Args:
            embeddings: (B, S, Hidden) sequence features from the projector.
            actions: Optional (B, chunk_size, action_dim) ground truth.

        Returns:
            Dict with 'pred_actions', plus 'loss' when actions are given.
        """
        # Mean-pool over the sequence axis: (B, S, H) -> (B, H).
        context = embeddings.mean(dim=1)
        # (B, chunk_size * action_dim) -> (B, chunk_size, action_dim).
        flat = self.regressor(context)
        output = {"pred_actions": flat.view(-1, self.chunk_size, self.action_dim)}
        if actions is not None:
            # MSE against the ground-truth chunk.
            output["loss"] = self.loss_fn(output["pred_actions"], actions)
        return output

View File

@@ -1,5 +1,9 @@
# Projector models
from .mlp import MLPProjector
from .perceiver import PerceiverResampler
# from .mlp import MLPProjector
# from .perceiver import PerceiverResampler
__all__ = ["MLPProjector", "PerceiverResampler"]
# __all__ = ["MLPProjector", "PerceiverResampler"]
from .mlp import MLPProjector
__all__ = ["MLPProjector"]

View File

@@ -1 +1,19 @@
# MLP Projector 实现
import torch
import torch.nn as nn
from roboimi.vla.core.interfaces import VLAProjector
class MLPProjector(VLAProjector):
    """
    Two-layer MLP adapter: Linear -> GELU -> Linear.

    Maps backbone features of width input_dim to the head's expected
    width output_dim, applied position-wise over the sequence.
    """

    def __init__(self, input_dim: int, output_dim: int):
        super().__init__()
        layers = [
            nn.Linear(input_dim, output_dim),
            nn.GELU(),
            nn.Linear(output_dim, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)

View File

@@ -1 +0,0 @@
# 将图片文件夹转为 HDF5 格式

View File

@@ -0,0 +1,58 @@
import hydra
import torch
from omegaconf import DictConfig, OmegaConf
from roboimi.vla.agent import VLAAgent
@hydra.main(version_base=None, config_path="../conf", config_name="config")
def main(cfg: DictConfig):
    """Smoke-test the skeleton VLA stack.

    Instantiates the agent from the Hydra config, runs one forward pass on
    mock data, then one backward pass, and checks that gradients reach the
    backbone (i.e. the autograd graph is connected end-to-end).
    """
    print(">>> Initializing VLA Agent (Skeleton Phase)...")
    # For this test, we override the default agent with our debug config
    # In a real run, this would be set via command line or defaults list
    from hydra.utils import instantiate
    # Instantiate the agent using the debug configuration
    # Assuming 'agent' is a key in your root config.yaml that points to debug_vla
    # If testing isolated, we instantiate the structure directly.
    agent: VLAAgent = instantiate(cfg.agent)
    print(f"✅ Agent assembled: {type(agent).__name__}")
    print(f" - Backbone: {type(agent.backbone).__name__}")
    print(f" - Projector: {type(agent.projector).__name__}")
    print(f" - Head: {type(agent.head).__name__}")
    # Mock Data
    # NOTE(review): shapes assume a (3, 224, 224) image and a 16x7 action
    # chunk, matching the debug_vla config defaults — confirm if that changes.
    batch_size = 2
    dummy_obs = {
        'image': torch.randn(batch_size, 3, 224, 224),
        'text': ["pick up apple"] * batch_size
    }
    dummy_actions = torch.randn(batch_size, 16, 7)  # (B, Chunk, Act_Dim)
    batch = {
        'obs': dummy_obs,
        'actions': dummy_actions
    }
    # Forward Pass
    print("\n>>> Running Forward Pass...")
    outputs = agent(batch)
    # 'loss' is present because the batch carries ground-truth actions.
    loss = outputs['loss']
    print(f"✅ Forward successful. Loss: {loss.item():.4f}")
    # Backward Pass (Check Autograd Graph)
    print("\n>>> Running Backward Pass...")
    loss.backward()
    # Verify gradients exist in the backbone (proving the chain is intact)
    # Note: DebugBackbone needs a dummy parameter to show grad
    backbone_has_grad = agent.backbone.dummy_param.grad is not None or \
                        any(p.grad is not None for p in agent.backbone.parameters())
    if backbone_has_grad:
        print("✅ Backward successful. Gradients reached Backbone.")
    else:
        print("❌ Warning: No gradients found in Backbone.")


if __name__ == "__main__":
    main()