debug: 核心骨架伪实现
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -124,3 +124,5 @@ GEMINI.md
|
|||||||
|
|
||||||
# Copilot
|
# Copilot
|
||||||
.github/copilot-instructions.md
|
.github/copilot-instructions.md
|
||||||
|
|
||||||
|
.hydra/
|
||||||
0
roboimi/__init__.py
Normal file
0
roboimi/__init__.py
Normal file
@@ -1,73 +1,114 @@
|
|||||||
# roboimi/vla/agent.py
|
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
import torch.nn as nn
|
import torch.nn as nn
|
||||||
from typing import Optional, Dict, Union
|
from typing import Dict, Optional, Any
|
||||||
|
from roboimi.vla.core.interfaces import VLABackbone, VLAProjector, VLAHead
|
||||||
|
|
||||||
class VLAAgent(nn.Module):
    """
    Top-level assembly of the VLA pipeline.

    Data path: observation -> backbone -> projector -> head -> action/loss.
    """

    def __init__(
        self,
        backbone: VLABackbone,
        projector: VLAProjector,
        head: VLAHead,
    ):
        """
        Store the three pipeline stages as sub-modules.

        Args:
            backbone: Module applied to ``batch['obs']``.
            projector: Module applied to the backbone output.
            head: Module called with ``embeddings=`` and ``actions=`` keywords.
        """
        super().__init__()
        self.backbone = backbone
        self.projector = projector
        self.head = head

    def forward(self, batch: Dict[str, Any]) -> Dict[str, torch.Tensor]:
        """
        Run backbone, projector and head in sequence.

        Args:
            batch: Mapping that must contain 'obs' (handed unchanged to the
                backbone) and may contain 'actions' (ground truth; forwarded
                to the head, or ``None`` when the key is absent).

        Returns:
            The head's output, returned as-is.
        """
        # Stages 1 and 2: encode the observation, then adapt it for the head.
        embeddings = self.projector(self.backbone(batch['obs']))

        # Stage 3: the head receives the ground-truth actions only when the
        # batch carries them (training); otherwise it gets None.
        return self.head(embeddings=embeddings, actions=batch.get('actions'))
|
||||||
img_emb = self.img_projector(raw_img_emb)
|
|
||||||
img_emb = img_emb.view(B, T, -1)
|
|
||||||
|
|
||||||
# 2. 状态编码
|
# # roboimi/vla/agent.py
|
||||||
state_emb = self.state_encoder(state) # [B, T, Embed_Dim]
|
|
||||||
|
|
||||||
# 3. 特征融合 (这里做一个简单的 Early Fusion 示例)
|
# import torch
|
||||||
# 将图像特征和状态特征在特征维度拼接,或在时间维度拼接
|
# import torch.nn as nn
|
||||||
# 假设我们只用最近的一帧图像作为 Context,或者将所有历史特征作为 Context
|
# from typing import Optional, Dict, Union
|
||||||
# 这里演示:Context = (Image_History + State_History)
|
|
||||||
# [B, T, Embed] + [B, T, Embed] -> [B, 2*T, Embed] (Concat on time)
|
|
||||||
context = torch.cat([img_emb, state_emb], dim=1)
|
|
||||||
|
|
||||||
# 4. Action Head 分支
|
# class VLAAgent(nn.Module):
|
||||||
if actions is not None:
|
# def __init__(self,
|
||||||
# --- Training Mode ---
|
# vlm_backbone: nn.Module,
|
||||||
# 必须返回 Loss
|
# img_projector: nn.Module,
|
||||||
return self.action_head.compute_loss(context, actions)
|
# action_head: nn.Module,
|
||||||
else:
|
# state_dim: int,
|
||||||
# --- Inference Mode ---
|
# embed_dim: int):
|
||||||
# 必须返回预测的动作序列
|
# super().__init__()
|
||||||
return self.action_head.predict_action(context)
|
# self.vlm_backbone = vlm_backbone
|
||||||
|
# self.img_projector = img_projector
|
||||||
|
# self.action_head = action_head
|
||||||
|
|
||||||
|
# # 简单的状态编码器 (通常不需要复杂的 config,直接写在这里即可)
|
||||||
|
# self.state_encoder = nn.Sequential(
|
||||||
|
# nn.Linear(state_dim, embed_dim),
|
||||||
|
# nn.Mish(),
|
||||||
|
# nn.Linear(embed_dim, embed_dim)
|
||||||
|
# )
|
||||||
|
|
||||||
|
# def forward(self,
|
||||||
|
# images: torch.Tensor,
|
||||||
|
# state: torch.Tensor,
|
||||||
|
# text: Optional[Union[str, list]] = None,
|
||||||
|
# actions: Optional[torch.Tensor] = None) -> Union[torch.Tensor, Dict]:
|
||||||
|
# """
|
||||||
|
# Args:
|
||||||
|
# images: [Batch, Obs_Horizon, C, H, W] 注意: 这里需要处理时间维度
|
||||||
|
# state: [Batch, Obs_Horizon, State_Dim]
|
||||||
|
# text: Optional text instructions
|
||||||
|
# actions: [Batch, Pred_Horizon, Action_Dim] (Training only)
|
||||||
|
|
||||||
|
# Returns:
|
||||||
|
# Training: Loss scalar
|
||||||
|
# Inference: Predicted actions
|
||||||
|
# """
|
||||||
|
|
||||||
|
# B, T, C, H, W = images.shape
|
||||||
|
|
||||||
|
# # 1. 图像编码 (Flatten time dimension for efficiency)
|
||||||
|
# # [B*T, C, H, W] -> [B*T, Vision_Dim]
|
||||||
|
# flat_images = images.view(B * T, C, H, W)
|
||||||
|
# vision_feats_dict = self.vlm_backbone(flat_images)
|
||||||
|
# raw_img_emb = vision_feats_dict['image_embeds'] # [B*T, Vision_Dim]
|
||||||
|
|
||||||
|
# # 投影并还原时间维度 -> [B, T, Embed_Dim]
|
||||||
|
# img_emb = self.img_projector(raw_img_emb)
|
||||||
|
# img_emb = img_emb.view(B, T, -1)
|
||||||
|
|
||||||
|
# # 2. 状态编码
|
||||||
|
# state_emb = self.state_encoder(state) # [B, T, Embed_Dim]
|
||||||
|
|
||||||
|
# # 3. 特征融合 (这里做一个简单的 Early Fusion 示例)
|
||||||
|
# # 将图像特征和状态特征在特征维度拼接,或在时间维度拼接
|
||||||
|
# # 假设我们只用最近的一帧图像作为 Context,或者将所有历史特征作为 Context
|
||||||
|
# # 这里演示:Context = (Image_History + State_History)
|
||||||
|
# # [B, T, Embed] + [B, T, Embed] -> [B, 2*T, Embed] (Concat on time)
|
||||||
|
# context = torch.cat([img_emb, state_emb], dim=1)
|
||||||
|
|
||||||
|
# # 4. Action Head 分支
|
||||||
|
# if actions is not None:
|
||||||
|
# # --- Training Mode ---
|
||||||
|
# # 必须返回 Loss
|
||||||
|
# return self.action_head.compute_loss(context, actions)
|
||||||
|
# else:
|
||||||
|
# # --- Inference Mode ---
|
||||||
|
# # 必须返回预测的动作序列
|
||||||
|
# return self.action_head.predict_action(context)
|
||||||
24
roboimi/vla/conf/agent/debug_vla.yaml
Normal file
24
roboimi/vla/conf/agent/debug_vla.yaml
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
_target_: roboimi.vla.agent.VLAAgent
|
||||||
|
|
||||||
|
# 1. Backbone Configuration
|
||||||
|
backbone:
|
||||||
|
_target_: roboimi.vla.models.backbones.debug.DebugBackbone
|
||||||
|
embed_dim: 768 # Variable A
|
||||||
|
seq_len: 10
|
||||||
|
|
||||||
|
# 2. Projector Configuration
|
||||||
|
projector:
|
||||||
|
_target_: roboimi.vla.models.projectors.mlp.MLPProjector
|
||||||
|
# Dependency Injection via Interpolation:
|
||||||
|
# Takes 'embed_dim' from the sibling 'backbone' config above.
|
||||||
|
input_dim: ${..backbone.embed_dim}
|
||||||
|
output_dim: 512 # Variable B (The bottleneck size)
|
||||||
|
|
||||||
|
# 3. Head Configuration
|
||||||
|
head:
|
||||||
|
_target_: roboimi.vla.models.heads.debug.DebugHead
|
||||||
|
# Dependency Injection via Interpolation:
|
||||||
|
# Takes 'output_dim' from the sibling 'projector' config above.
|
||||||
|
input_dim: ${..projector.output_dim}
|
||||||
|
action_dim: 7 # (x,y,z, r,p,y, gripper)
|
||||||
|
chunk_size: 16
|
||||||
@@ -1,12 +1,9 @@
|
|||||||
defaults:
|
defaults:
|
||||||
- _self_
|
- _self_
|
||||||
- agent: default # 所有的子模块选择都在 agent/default.yaml 中完成了
|
- agent: debug_vla # <--- This tells Hydra to look in conf/agent/ and load debug_vla.yaml
|
||||||
- data: default_dataset
|
# Future expansions:
|
||||||
- train: gpu
|
# - data: robomimic_hdf5
|
||||||
|
# - train: standard
|
||||||
|
|
||||||
project_name: "vla_frame_refactored"
|
# Global settings (optional for now)
|
||||||
seed: 42
|
seed: 42
|
||||||
|
|
||||||
hydra:
|
|
||||||
run:
|
|
||||||
dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
|
|
||||||
51
roboimi/vla/core/interfaces.py
Normal file
51
roboimi/vla/core/interfaces.py
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
import abc
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from typing import Dict, Any, Optional
|
||||||
|
|
||||||
|
class VLABackbone(nn.Module, abc.ABC):
    """
    Abstract contract for vision/language backbones.

    Implementations must return a feature tensor of shape (B, Seq, Embed_Dim).
    """

    @abc.abstractmethod
    def forward(self, obs: Dict[str, Any]) -> torch.Tensor:
        """
        Encode an observation into a feature sequence.

        Args:
            obs: Dictionary containing 'image' and optionally 'text'. Values
                are typed as ``Any`` because the optional text entry is not
                required to be a tensor.

        Returns:
            features: (B, S, D) embedding.
        """

    @property
    @abc.abstractmethod
    def embed_dim(self) -> int:
        """Embedding width (the Embed_Dim of the returned features)."""
|
||||||
|
|
||||||
|
|
||||||
|
class VLAProjector(nn.Module, abc.ABC):
    """
    Abstract contract for the adaptation layer (projector).

    A projector sits between the backbone and the policy head and maps
    backbone features to what the head consumes.
    """

    @abc.abstractmethod
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Transform the feature tensor ``x``; subclasses supply the mapping."""
|
||||||
|
|
||||||
|
|
||||||
|
class VLAHead(nn.Module, abc.ABC):
    """
    Abstract contract for action-generation heads (policies).

    One ``forward`` covers both training (loss calculation) and inference
    (action generation).
    """

    @abc.abstractmethod
    def forward(
        self,
        embeddings: torch.Tensor,
        actions: Optional[torch.Tensor] = None,
    ) -> Dict[str, torch.Tensor]:
        """
        Produce a loss and/or predicted actions from projected embeddings.

        Args:
            embeddings: (B, S, Hidden) tensor coming from the projector.
            actions: (B, Pred_Horizon, Action_Dim) ground truth, supplied
                during training; ``None`` otherwise.

        Returns:
            Dict containing 'loss' (if actions provided) or 'pred_actions'.
        """
|
||||||
0
roboimi/vla/models/__init__.py
Normal file
0
roboimi/vla/models/__init__.py
Normal file
@@ -1,6 +1,8 @@
|
|||||||
# Backbone models
|
# Backbone models
|
||||||
from .siglip import SigLIPBackbone
|
# Uncomment when these are implemented:
|
||||||
from .clip import CLIPBackbone
|
# from .siglip import SigLIPBackbone
|
||||||
from .dinov2 import DinoV2Backbone
|
# from .clip import CLIPBackbone
|
||||||
|
# from .dinov2 import DinoV2Backbone
|
||||||
|
from .debug import DebugBackbone
|
||||||
|
|
||||||
__all__ = ["SigLIPBackbone", "CLIPBackbone", "DinoV2Backbone"]
|
__all__ = ["DebugBackbone"]
|
||||||
|
|||||||
30
roboimi/vla/models/backbones/debug.py
Normal file
30
roboimi/vla/models/backbones/debug.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from typing import Dict
|
||||||
|
from roboimi.vla.core.interfaces import VLABackbone
|
||||||
|
|
||||||
|
class DebugBackbone(VLABackbone):
    """
    Stand-in backbone that emits random features instead of encoding input.
    """

    def __init__(self, embed_dim: int = 768, seq_len: int = 10):
        """
        Args:
            embed_dim: Size of the last dimension of the generated features.
            seq_len: Size of the middle (sequence) dimension.
        """
        super().__init__()
        self.seq_len = seq_len
        self._embed_dim = embed_dim
        # One zero-valued trainable scalar so the module owns a parameter.
        self.dummy_param = nn.Parameter(torch.zeros(1))

    @property
    def embed_dim(self) -> int:
        """Feature width given at construction time."""
        return self._embed_dim

    def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor:
        """
        Return Gaussian noise of shape (batch, seq_len, embed_dim).

        Only the leading dimension and the device of ``obs['image']`` are
        used; the image content is ignored.
        """
        image = obs['image']
        shape = (image.shape[0], self.seq_len, self._embed_dim)
        noise = torch.randn(shape, device=image.device)

        # Adding the zero parameter leaves the values unchanged but links the
        # output to `dummy_param` in the autograd graph, so a backward pass
        # reaches this module.
        return noise + self.dummy_param
|
||||||
@@ -1,5 +1,9 @@
|
|||||||
# Action Head models
|
# # Action Head models
|
||||||
from .diffusion import DiffusionActionHead
|
# from .diffusion import DiffusionActionHead
|
||||||
from .act import ACTHead
|
# from .act import ACTHead
|
||||||
|
|
||||||
__all__ = ["DiffusionActionHead", "ACTHead"]
|
# __all__ = ["DiffusionActionHead", "ACTHead"]
|
||||||
|
|
||||||
|
from .debug import DebugHead
|
||||||
|
|
||||||
|
__all__ = ["DebugHead"]
|
||||||
33
roboimi/vla/models/heads/debug.py
Normal file
33
roboimi/vla/models/heads/debug.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from typing import Dict, Optional
|
||||||
|
from roboimi.vla.core.interfaces import VLAHead
|
||||||
|
|
||||||
|
class DebugHead(VLAHead):
    """
    Placeholder action head trained with MSE loss.

    Stands in for the Diffusion/ACT policies while the architecture is
    being verified.
    """

    def __init__(self, input_dim: int, action_dim: int, chunk_size: int = 16):
        """
        Args:
            input_dim: Width of the incoming embeddings.
            action_dim: Number of values per action.
            chunk_size: Number of actions predicted per sample.
        """
        super().__init__()
        self.action_dim = action_dim
        self.chunk_size = chunk_size
        # A single linear map from one pooled embedding to a flattened chunk.
        self.regressor = nn.Linear(input_dim, chunk_size * action_dim)
        self.loss_fn = nn.MSELoss()

    def forward(self, embeddings: torch.Tensor, actions: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        """
        Predict an action chunk and, when ground truth is given, its MSE loss.

        Args:
            embeddings: Tensor whose dimension 1 is averaged away before
                regression.
            actions: Optional ground truth compared against the prediction.

        Returns:
            Dict with 'pred_actions' of shape (-1, chunk_size, action_dim),
            plus 'loss' when ``actions`` is not None.
        """
        # Mean-pool over the sequence axis, regress, then unflatten the chunk.
        pred_actions = self.regressor(embeddings.mean(dim=1)).view(
            -1, self.chunk_size, self.action_dim
        )

        if actions is None:
            return {"pred_actions": pred_actions}
        return {
            "pred_actions": pred_actions,
            "loss": self.loss_fn(pred_actions, actions),
        }
|
||||||
@@ -1,5 +1,9 @@
|
|||||||
# Projector models
|
# Projector models
|
||||||
from .mlp import MLPProjector
|
# from .mlp import MLPProjector
|
||||||
from .perceiver import PerceiverResampler
|
# from .perceiver import PerceiverResampler
|
||||||
|
|
||||||
__all__ = ["MLPProjector", "PerceiverResampler"]
|
# __all__ = ["MLPProjector", "PerceiverResampler"]
|
||||||
|
|
||||||
|
from .mlp import MLPProjector
|
||||||
|
|
||||||
|
__all__ = ["MLPProjector"]
|
||||||
@@ -1 +1,19 @@
|
|||||||
# MLP Projector 实现
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
from roboimi.vla.core.interfaces import VLAProjector
|
||||||
|
|
||||||
|
class MLPProjector(VLAProjector):
    """
    Small MLP projector: Linear -> GELU -> Linear.

    Adapts the backbone feature width to the width expected by the head.
    """

    def __init__(self, input_dim: int, output_dim: int):
        """
        Args:
            input_dim: Width of the incoming features.
            output_dim: Width of both the hidden layer and the output.
        """
        super().__init__()
        layers = [
            nn.Linear(input_dim, output_dim),
            nn.GELU(),
            nn.Linear(output_dim, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the MLP to ``x`` and return the result."""
        return self.net(x)
|
||||||
@@ -1 +0,0 @@
|
|||||||
# 将图片文件夹转为 HDF5 格式
|
|
||||||
58
roboimi/vla/scripts/verify_arch.py
Normal file
58
roboimi/vla/scripts/verify_arch.py
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
import hydra
|
||||||
|
import torch
|
||||||
|
from omegaconf import DictConfig, OmegaConf
|
||||||
|
from roboimi.vla.agent import VLAAgent
|
||||||
|
|
||||||
|
@hydra.main(version_base=None, config_path="../conf", config_name="config")
def main(cfg: DictConfig):
    """
    Smoke-test the agent skeleton: build it, run forward, run backward.

    The agent is instantiated from ``cfg.agent``. A mock batch is pushed
    through it, the returned loss is back-propagated, and the script reports
    whether any backbone parameter received a gradient.

    Args:
        cfg: Hydra-composed configuration; only ``cfg.agent`` is read here.
    """
    print(">>> Initializing VLA Agent (Skeleton Phase)...")
    from hydra.utils import instantiate

    # Build whatever agent the composed config selects under the 'agent' key.
    agent: VLAAgent = instantiate(cfg.agent)

    print(f"✅ Agent assembled: {type(agent).__name__}")
    print(f" - Backbone: {type(agent.backbone).__name__}")
    print(f" - Projector: {type(agent.projector).__name__}")
    print(f" - Head: {type(agent.head).__name__}")

    # Mock data
    batch_size = 2
    dummy_obs = {
        'image': torch.randn(batch_size, 3, 224, 224),
        'text': ["pick up apple"] * batch_size
    }
    dummy_actions = torch.randn(batch_size, 16, 7)  # (B, Chunk, Act_Dim)

    batch = {
        'obs': dummy_obs,
        'actions': dummy_actions
    }

    # Forward pass
    print("\n>>> Running Forward Pass...")
    outputs = agent(batch)

    loss = outputs['loss']
    print(f"✅ Forward successful. Loss: {loss.item():.4f}")

    # Backward pass (checks the autograd graph)
    print("\n>>> Running Backward Pass...")
    loss.backward()

    # A gradient on any backbone parameter proves the chain is intact.
    # The check is generic on purpose: it does not assume the backbone has a
    # `dummy_param` attribute, so it also works for non-debug backbones.
    backbone_has_grad = any(
        p.grad is not None for p in agent.backbone.parameters()
    )

    if backbone_has_grad:
        print("✅ Backward successful. Gradients reached Backbone.")
    else:
        print("❌ Warning: No gradients found in Backbone.")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Reference in New Issue
Block a user