debug: 核心骨架伪实现 (pseudo-implementation of the core skeleton)

This commit is contained in:
gouhanke
2026-02-03 16:14:54 +08:00
parent d3863ea1dd
commit bd8bbb0cfc
15 changed files with 348 additions and 85 deletions

View File

View File

@@ -1,6 +1,8 @@
# Backbone models
from .siglip import SigLIPBackbone
from .clip import CLIPBackbone
from .dinov2 import DinoV2Backbone
# Uncomment when these are implemented:
# from .siglip import SigLIPBackbone
# from .clip import CLIPBackbone
# from .dinov2 import DinoV2Backbone
from .debug import DebugBackbone
__all__ = ["SigLIPBackbone", "CLIPBackbone", "DinoV2Backbone"]
__all__ = ["DebugBackbone"]

View File

@@ -0,0 +1,30 @@
import torch
import torch.nn as nn
from typing import Dict
from roboimi.vla.core.interfaces import VLABackbone
class DebugBackbone(VLABackbone):
    """Stub backbone that emits random feature tokens.

    Stands in for a real vision backbone so the surrounding
    training/inference plumbing can be exercised end to end.
    """

    def __init__(self, embed_dim: int = 768, seq_len: int = 10):
        super().__init__()
        self._embed_dim = embed_dim
        self.seq_len = seq_len
        # Zero-valued trainable parameter so the module owns at least one
        # parameter and optimizer/graph wiring can be verified.
        self.dummy_param = nn.Parameter(torch.zeros(1))

    def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor:
        # assumes obs carries an 'image' tensor whose first dim is batch — TODO confirm
        images = obs['image']
        tokens = torch.randn(
            images.shape[0], self.seq_len, self._embed_dim,
            device=images.device,
        )
        # Adding the zero parameter leaves the values unchanged but links the
        # output to self.dummy_param in the autograd graph, so gradients can
        # flow back to it during the debug training loop.
        return tokens + self.dummy_param

    @property
    def embed_dim(self) -> int:
        # Read-only view of the configured embedding width.
        return self._embed_dim

View File

@@ -1,5 +1,9 @@
# Action Head models
from .diffusion import DiffusionActionHead
from .act import ACTHead
# # Action Head models
# from .diffusion import DiffusionActionHead
# from .act import ACTHead
__all__ = ["DiffusionActionHead", "ACTHead"]
# __all__ = ["DiffusionActionHead", "ACTHead"]
from .debug import DebugHead
__all__ = ["DebugHead"]

View File

@@ -0,0 +1,33 @@
import torch
import torch.nn as nn
from typing import Dict, Optional
from roboimi.vla.core.interfaces import VLAHead
class DebugHead(VLAHead):
    """Stub action head: a single linear regressor trained with MSE.

    Replaces heavier Diffusion/ACT policies when only the architecture
    wiring needs to be verified.
    """

    def __init__(self, input_dim: int, action_dim: int, chunk_size: int = 16):
        super().__init__()
        # Maps a pooled embedding straight to a flattened action chunk.
        self.regressor = nn.Linear(input_dim, chunk_size * action_dim)
        self.action_dim = action_dim
        self.chunk_size = chunk_size
        self.loss_fn = nn.MSELoss()

    def forward(self, embeddings: torch.Tensor, actions: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        # Mean-pool the token axis: (B, T, H) -> (B, H).
        pooled = embeddings.mean(dim=1)
        # Regress to a flat chunk, then reshape (B, chunk*act) -> (B, chunk, act).
        flat = self.regressor(pooled)
        preds = flat.view(-1, self.chunk_size, self.action_dim)
        result: Dict[str, torch.Tensor] = {"pred_actions": preds}
        if actions is not None:
            # Supervised path: MSE against the ground-truth action chunk.
            result["loss"] = self.loss_fn(preds, actions)
        return result

View File

@@ -1,5 +1,9 @@
# Projector models
from .mlp import MLPProjector
from .perceiver import PerceiverResampler
# from .mlp import MLPProjector
# from .perceiver import PerceiverResampler
__all__ = ["MLPProjector", "PerceiverResampler"]
# __all__ = ["MLPProjector", "PerceiverResampler"]
from .mlp import MLPProjector
__all__ = ["MLPProjector"]

View File

@@ -1 +1,19 @@
# MLP Projector 实现
import torch
import torch.nn as nn
from roboimi.vla.core.interfaces import VLAProjector
class MLPProjector(VLAProjector):
    """Two-layer MLP projector adapting backbone width to head width.

    Linear(input_dim -> output_dim) -> GELU -> Linear(output_dim -> output_dim).
    """

    def __init__(self, input_dim: int, output_dim: int):
        super().__init__()
        # Sequential with the same layer order/indices as before, so the
        # resulting parameter names ("net.0", "net.2") are unchanged.
        stages = [
            nn.Linear(input_dim, output_dim),
            nn.GELU(),
            nn.Linear(output_dim, output_dim),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Pure element-wise-over-last-dim projection; shape preserved except
        # the final dimension, which becomes output_dim.
        return self.net(x)