debug: 核心骨架伪实现
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -124,3 +124,5 @@ GEMINI.md
|
||||
|
||||
# Copilot
|
||||
.github/copilot-instructions.md
|
||||
|
||||
.hydra/
|
||||
0
roboimi/__init__.py
Normal file
0
roboimi/__init__.py
Normal file
@@ -1,73 +1,114 @@
|
||||
# roboimi/vla/agent.py
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from typing import Optional, Dict, Union
|
||||
from typing import Dict, Optional, Any
|
||||
from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead
|
||||
|
||||
class VLAAgent(nn.Module):
    """
    The main assembly class.

    Flow: Obs -> Backbone -> Projector -> Head -> Action/Loss

    The agent owns no modality-specific logic itself; it only wires the
    three pluggable components together and forwards the batch through them.
    """

    def __init__(
        self,
        backbone: "VLABackbone",
        projector: "VLAProjector",
        head: "VLAHead",
    ):
        """
        Args:
            backbone: Maps raw observations to (B, Seq, Backbone_Dim) features.
            projector: Adapts backbone features to the head's input dimension.
            head: Produces predicted actions and, during training, a loss.
        """
        super().__init__()
        self.backbone = backbone
        self.projector = projector
        self.head = head

    def forward(self, batch: Dict[str, Any]) -> Dict[str, torch.Tensor]:
        """
        Args:
            batch: Dict containing 'obs' (image/text) and optionally
                'actions' (ground truth, present only during training).

        Returns:
            The head's output dict — per the VLAHead contract this holds
            'pred_actions' and, when 'actions' was supplied, 'loss'.
        """
        # 1. Extract Features
        # Shape: (B, Seq, Backbone_Dim)
        features = self.backbone(batch['obs'])

        # 2. Project Features
        # Shape: (B, Seq, Head_Dim)
        embeddings = self.projector(features)

        # 3. Compute Action/Loss
        # We pass actions if they exist (training mode); at inference the
        # head sees actions=None.
        actions = batch.get('actions', None)
        outputs = self.head(embeddings=embeddings, actions=actions)

        return outputs
|
||||
|
||||
# 2. 状态编码
|
||||
state_emb = self.state_encoder(state) # [B, T, Embed_Dim]
|
||||
# # roboimi/vla/agent.py
|
||||
|
||||
# 3. 特征融合 (这里做一个简单的 Early Fusion 示例)
|
||||
# 将图像特征和状态特征在特征维度拼接,或在时间维度拼接
|
||||
# 假设我们只用最近的一帧图像作为 Context,或者将所有历史特征作为 Context
|
||||
# 这里演示:Context = (Image_History + State_History)
|
||||
# [B, T, Embed] + [B, T, Embed] -> [B, 2*T, Embed] (Concat on time)
|
||||
context = torch.cat([img_emb, state_emb], dim=1)
|
||||
# import torch
|
||||
# import torch.nn as nn
|
||||
# from typing import Optional, Dict, Union
|
||||
|
||||
# 4. Action Head 分支
|
||||
if actions is not None:
|
||||
# --- Training Mode ---
|
||||
# 必须返回 Loss
|
||||
return self.action_head.compute_loss(context, actions)
|
||||
else:
|
||||
# --- Inference Mode ---
|
||||
# 必须返回预测的动作序列
|
||||
return self.action_head.predict_action(context)
|
||||
# class VLAAgent(nn.Module):
|
||||
# def __init__(self,
|
||||
# vlm_backbone: nn.Module,
|
||||
# img_projector: nn.Module,
|
||||
# action_head: nn.Module,
|
||||
# state_dim: int,
|
||||
# embed_dim: int):
|
||||
# super().__init__()
|
||||
# self.vlm_backbone = vlm_backbone
|
||||
# self.img_projector = img_projector
|
||||
# self.action_head = action_head
|
||||
|
||||
# # 简单的状态编码器 (通常不需要复杂的 config,直接写在这里即可)
|
||||
# self.state_encoder = nn.Sequential(
|
||||
# nn.Linear(state_dim, embed_dim),
|
||||
# nn.Mish(),
|
||||
# nn.Linear(embed_dim, embed_dim)
|
||||
# )
|
||||
|
||||
# def forward(self,
|
||||
# images: torch.Tensor,
|
||||
# state: torch.Tensor,
|
||||
# text: Optional[Union[str, list]] = None,
|
||||
# actions: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Dict]:
|
||||
# """
|
||||
# Args:
|
||||
# images: [Batch, Obs_Horizon, C, H, W] 注意: 这里需要处理时间维度
|
||||
# state: [Batch, Obs_Horizon, State_Dim]
|
||||
# text: Optional text instructions
|
||||
# actions: [Batch, Pred_Horizon, Action_Dim] (Training only)
|
||||
|
||||
# Returns:
|
||||
# Training: Loss scalar
|
||||
# Inference: Predicted actions
|
||||
# """
|
||||
|
||||
# B, T, C, H, W = images.shape
|
||||
|
||||
# # 1. 图像编码 (Flatten time dimension for efficiency)
|
||||
# # [B*T, C, H, W] -> [B*T, Vision_Dim]
|
||||
# flat_images = images.view(B * T, C, H, W)
|
||||
# vision_feats_dict = self.vlm_backbone(flat_images)
|
||||
# raw_img_emb = vision_feats_dict['image_embeds'] # [B*T, Vision_Dim]
|
||||
|
||||
# # 投影并还原时间维度 -> [B, T, Embed_Dim]
|
||||
# img_emb = self.img_projector(raw_img_emb)
|
||||
# img_emb = img_emb.view(B, T, -1)
|
||||
|
||||
# # 2. 状态编码
|
||||
# state_emb = self.state_encoder(state) # [B, T, Embed_Dim]
|
||||
|
||||
# # 3. 特征融合 (这里做一个简单的 Early Fusion 示例)
|
||||
# # 将图像特征和状态特征在特征维度拼接,或在时间维度拼接
|
||||
# # 假设我们只用最近的一帧图像作为 Context,或者将所有历史特征作为 Context
|
||||
# # 这里演示:Context = (Image_History + State_History)
|
||||
# # [B, T, Embed] + [B, T, Embed] -> [B, 2*T, Embed] (Concat on time)
|
||||
# context = torch.cat([img_emb, state_emb], dim=1)
|
||||
|
||||
# # 4. Action Head 分支
|
||||
# if actions is not None:
|
||||
# # --- Training Mode ---
|
||||
# # 必须返回 Loss
|
||||
# return self.action_head.compute_loss(context, actions)
|
||||
# else:
|
||||
# # --- Inference Mode ---
|
||||
# # 必须返回预测的动作序列
|
||||
# return self.action_head.predict_action(context)
|
||||
24
roboimi/vla/conf/agent/debug_vla.yaml
Normal file
24
roboimi/vla/conf/agent/debug_vla.yaml
Normal file
@@ -0,0 +1,24 @@
|
||||
_target_: roboimi.vla.agent.VLAAgent
|
||||
|
||||
# 1. Backbone Configuration
|
||||
backbone:
|
||||
_target_: roboimi.vla.models.backbones.debug.DebugBackbone
|
||||
embed_dim: 768 # Variable A
|
||||
seq_len: 10
|
||||
|
||||
# 2. Projector Configuration
|
||||
projector:
|
||||
_target_: roboimi.vla.models.projectors.mlp.MLPProjector
|
||||
# Dependency Injection via Interpolation:
|
||||
# Takes 'embed_dim' from the sibling 'backbone' config above.
|
||||
input_dim: ${..backbone.embed_dim}
|
||||
output_dim: 512 # Variable B (The bottleneck size)
|
||||
|
||||
# 3. Head Configuration
|
||||
head:
|
||||
_target_: roboimi.vla.models.heads.debug.DebugHead
|
||||
# Dependency Injection via Interpolation:
|
||||
# Takes 'output_dim' from the sibling 'projector' config above.
|
||||
input_dim: ${..projector.output_dim}
|
||||
action_dim: 7 # (x,y,z, r,p,y, gripper)
|
||||
chunk_size: 16
|
||||
@@ -1,12 +1,9 @@
|
||||
defaults:
|
||||
- _self_
|
||||
- agent: default # 所有的子模块选择都在 agent/default.yaml 中完成了
|
||||
- data: default_dataset
|
||||
- train: gpu
|
||||
- agent: debug_vla # <--- This tells Hydra to look in conf/agent/ and load debug_vla.yaml
|
||||
# Future expansions:
|
||||
# - data: robomimic_hdf5
|
||||
# - train: standard
|
||||
|
||||
project_name: "vla_frame_refactored"
|
||||
# Global settings (optional for now)
|
||||
seed: 42
|
||||
|
||||
hydra:
|
||||
run:
|
||||
dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
||||
51
roboimi/vla/core/interfaces.py
Normal file
51
roboimi/vla/core/interfaces.py
Normal file
@@ -0,0 +1,51 @@
|
||||
import abc
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from typing import Dict, Any, Optional
|
||||
|
||||
class VLABackbone(nn.Module, abc.ABC):
    """Contract for Vision/Language backbones.

    Every implementation turns an observation dict into a single feature
    tensor of shape (B, Seq, Embed_Dim).
    """

    @abc.abstractmethod
    def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor:
        """Encode a batch of observations.

        Args:
            obs: Dictionary containing 'image' and optionally 'text'.

        Returns:
            A (B, S, D) feature embedding.
        """
        ...

    @property
    @abc.abstractmethod
    def embed_dim(self) -> int:
        """Width D of the feature embeddings produced by :meth:`forward`."""
        ...
|
||||
|
||||
|
||||
class VLAProjector(nn.Module, abc.ABC):
    """Contract for the adaptation layer (Projector).

    Bridges Backbone features into the input space of the Policy Head.
    """

    @abc.abstractmethod
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Map backbone features to head-compatible embeddings."""
        ...
|
||||
|
||||
|
||||
class VLAHead(nn.Module, abc.ABC):
    """Contract for Action Generation Heads (Policies).

    A single entry point serves both training (loss calculation) and
    inference (action generation).
    """

    @abc.abstractmethod
    def forward(self, embeddings: torch.Tensor, actions: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        """Generate actions and, when ground truth is given, a loss.

        Args:
            embeddings: (B, S, Hidden) tensor coming from the Projector.
            actions: (B, Pred_Horizon, Action_Dim) ground truth; only
                provided during training.

        Returns:
            Dict containing 'loss' (if actions provided) or 'pred_actions'.
        """
        ...
|
||||
0
roboimi/vla/models/__init__.py
Normal file
0
roboimi/vla/models/__init__.py
Normal file
@@ -1,6 +1,8 @@
|
||||
# Backbone models
|
||||
from .siglip import SigLIPBackbone
|
||||
from .clip import CLIPBackbone
|
||||
from .dinov2 import DinoV2Backbone
|
||||
# Uncomment when these are implemented:
|
||||
# from .siglip import SigLIPBackbone
|
||||
# from .clip import CLIPBackbone
|
||||
# from .dinov2 import DinoV2Backbone
|
||||
from .debug import DebugBackbone
|
||||
|
||||
__all__ = ["SigLIPBackbone", "CLIPBackbone", "DinoV2Backbone"]
|
||||
__all__ = ["DebugBackbone"]
|
||||
|
||||
30
roboimi/vla/models/backbones/debug.py
Normal file
30
roboimi/vla/models/backbones/debug.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from typing import Dict
|
||||
from roboimi.vla.core.interfaces import VLABackbone
|
||||
|
||||
class DebugBackbone(VLABackbone):
    """
    A fake backbone that outputs random tensors.

    Exists purely to verify the module wiring before real encoders land.
    """

    def __init__(self, embed_dim: int = 768, seq_len: int = 10):
        super().__init__()
        self._embed_dim = embed_dim
        self.seq_len = seq_len
        # A dummy trainable parameter so this module participates in autograd.
        self.dummy_param = nn.Parameter(torch.zeros(1))

    def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor:
        images = obs['image']
        batch_size = images.shape[0]

        # Random features on the same device as the input images.
        features = torch.randn(batch_size, self.seq_len, self._embed_dim, device=images.device)

        # Adding the zero-valued parameter leaves the values unchanged but
        # ties the output into this module's computation graph, so gradients
        # flow back to dummy_param during loss.backward().
        return features + self.dummy_param

    @property
    def embed_dim(self) -> int:
        return self._embed_dim
|
||||
@@ -1,5 +1,9 @@
|
||||
# Action Head models
|
||||
from .diffusion import DiffusionActionHead
|
||||
from .act import ACTHead
|
||||
# # Action Head models
|
||||
# from .diffusion import DiffusionActionHead
|
||||
# from .act import ACTHead
|
||||
|
||||
__all__ = ["DiffusionActionHead", "ACTHead"]
|
||||
# __all__ = ["DiffusionActionHead", "ACTHead"]
|
||||
|
||||
from .debug import DebugHead
|
||||
|
||||
__all__ = ["DebugHead"]
|
||||
33
roboimi/vla/models/heads/debug.py
Normal file
33
roboimi/vla/models/heads/debug.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from typing import Dict, Optional
|
||||
from roboimi.vla.core.interfaces import VLAHead
|
||||
|
||||
class DebugHead(VLAHead):
    """
    A fake Action Head using MSE Loss.

    Replaces complex Diffusion/ACT policies for architecture verification.
    """

    def __init__(self, input_dim: int, action_dim: int, chunk_size: int = 16):
        super().__init__()
        # Single linear map: embedding -> flattened action chunk.
        self.regressor = nn.Linear(input_dim, chunk_size * action_dim)
        self.action_dim = action_dim
        self.chunk_size = chunk_size
        self.loss_fn = nn.MSELoss()

    def forward(self, embeddings: torch.Tensor, actions: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        # Mean-pool the sequence dimension: (B, S, H) -> (B, H).
        pooled = embeddings.mean(dim=1)

        # (B, Chunk * Act_Dim) -> (B, Chunk, Act_Dim)
        flat_preds = self.regressor(pooled)
        pred_actions = flat_preds.view(-1, self.chunk_size, self.action_dim)

        result = {"pred_actions": pred_actions}
        if actions is not None:
            # Training mode: report MSE against the ground-truth chunk.
            result["loss"] = self.loss_fn(pred_actions, actions)
        return result
|
||||
@@ -1,5 +1,9 @@
|
||||
# Projector models
|
||||
from .mlp import MLPProjector
|
||||
from .perceiver import PerceiverResampler
|
||||
# from .mlp import MLPProjector
|
||||
# from .perceiver import PerceiverResampler
|
||||
|
||||
__all__ = ["MLPProjector", "PerceiverResampler"]
|
||||
# __all__ = ["MLPProjector", "PerceiverResampler"]
|
||||
|
||||
from .mlp import MLPProjector
|
||||
|
||||
__all__ = ["MLPProjector"]
|
||||
@@ -1 +1,19 @@
|
||||
# MLP Projector 实现
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from roboimi.vla.core.interfaces import VLAProjector
|
||||
|
||||
class MLPProjector(VLAProjector):
    """
    A simple two-layer MLP projection.

    First-class citizen: adapts Backbone dim -> Head dim.
    """

    def __init__(self, input_dim: int, output_dim: int):
        super().__init__()
        layers = [
            nn.Linear(input_dim, output_dim),
            nn.GELU(),
            nn.Linear(output_dim, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.net(x)
|
||||
@@ -1 +0,0 @@
|
||||
# 将图片文件夹转为 HDF5 格式
|
||||
58
roboimi/vla/scripts/verify_arch.py
Normal file
58
roboimi/vla/scripts/verify_arch.py
Normal file
@@ -0,0 +1,58 @@
|
||||
import hydra
|
||||
import torch
|
||||
from omegaconf import DictConfig, OmegaConf
|
||||
from roboimi.vla.agent import VLAAgent
|
||||
|
||||
@hydra.main(version_base=None, config_path="../conf", config_name="config")
def main(cfg: DictConfig) -> None:
    """Architecture smoke test for the skeleton phase.

    Instantiates the VLAAgent from the Hydra config, runs one forward pass
    on mock data and one backward pass, then checks that gradients reached
    the backbone — proving the Backbone -> Projector -> Head chain forms a
    single connected autograd graph.
    """
    print(">>> Initializing VLA Agent (Skeleton Phase)...")

    # Kept as a local import so the module itself stays importable without
    # pulling hydra.utils at import time.
    from hydra.utils import instantiate

    # Instantiate the agent from the 'agent' sub-config
    # (e.g. conf/agent/debug_vla.yaml selected via the defaults list).
    agent: VLAAgent = instantiate(cfg.agent)

    print(f"✅ Agent assembled: {type(agent).__name__}")
    print(f" - Backbone: {type(agent.backbone).__name__}")
    print(f" - Projector: {type(agent.projector).__name__}")
    print(f" - Head: {type(agent.head).__name__}")

    # Mock data — shapes match debug_vla.yaml (chunk_size=16, action_dim=7).
    batch_size = 2
    dummy_obs = {
        'image': torch.randn(batch_size, 3, 224, 224),
        'text': ["pick up apple"] * batch_size
    }
    dummy_actions = torch.randn(batch_size, 16, 7)  # (B, Chunk, Act_Dim)

    batch = {
        'obs': dummy_obs,
        'actions': dummy_actions
    }

    # Forward Pass
    print("\n>>> Running Forward Pass...")
    outputs = agent(batch)

    loss = outputs['loss']
    print(f"✅ Forward successful. Loss: {loss.item():.4f}")

    # Backward Pass (Check Autograd Graph)
    print("\n>>> Running Backward Pass...")
    loss.backward()

    # FIX: the original probed agent.backbone.dummy_param directly, which
    # raises AttributeError for any backbone without that debug-only field.
    # Scanning all backbone parameters covers every backbone implementation.
    backbone_has_grad = any(p.grad is not None for p in agent.backbone.parameters())

    if backbone_has_grad:
        print("✅ Backward successful. Gradients reached Backbone.")
    else:
        print("❌ Warning: No gradients found in Backbone.")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user