debug: 核心骨架伪实现
This commit is contained in:
0
roboimi/vla/models/__init__.py
Normal file
0
roboimi/vla/models/__init__.py
Normal file
@@ -1,6 +1,8 @@
|
||||
# Backbone models
|
||||
from .siglip import SigLIPBackbone
|
||||
from .clip import CLIPBackbone
|
||||
from .dinov2 import DinoV2Backbone
|
||||
# Uncomment when these are implemented:
|
||||
# from .siglip import SigLIPBackbone
|
||||
# from .clip import CLIPBackbone
|
||||
# from .dinov2 import DinoV2Backbone
|
||||
from .debug import DebugBackbone
|
||||
|
||||
__all__ = ["SigLIPBackbone", "CLIPBackbone", "DinoV2Backbone"]
|
||||
__all__ = ["DebugBackbone"]
|
||||
|
||||
30
roboimi/vla/models/backbones/debug.py
Normal file
30
roboimi/vla/models/backbones/debug.py
Normal file
@@ -0,0 +1,30 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from typing import Dict
|
||||
from roboimi.vla.core.interfaces import VLABackbone
|
||||
|
||||
class DebugBackbone(VLABackbone):
    """Stand-in backbone that emits random feature tensors.

    Lets the surrounding training pipeline be exercised without loading
    a real vision model.
    """

    def __init__(self, embed_dim: int = 768, seq_len: int = 10):
        super().__init__()
        self._embed_dim = embed_dim
        self.seq_len = seq_len
        # Single trainable scalar so the module owns at least one parameter.
        self.dummy_param = nn.Parameter(torch.zeros(1))

    def forward(self, obs: Dict[str, torch.Tensor]) -> torch.Tensor:
        images = obs['image']
        out_shape = (images.shape[0], self.seq_len, self._embed_dim)
        features = torch.randn(*out_shape, device=images.device)
        # Adding the zero-valued parameter leaves the values untouched but
        # wires dummy_param into the autograd graph, so a gradient path
        # through this backbone exists for the optimizer.
        return features + self.dummy_param

    @property
    def embed_dim(self) -> int:
        return self._embed_dim
|
||||
@@ -1,5 +1,9 @@
|
||||
# Action Head models
|
||||
from .diffusion import DiffusionActionHead
|
||||
from .act import ACTHead
|
||||
# # Action Head models
|
||||
# from .diffusion import DiffusionActionHead
|
||||
# from .act import ACTHead
|
||||
|
||||
__all__ = ["DiffusionActionHead", "ACTHead"]
|
||||
# __all__ = ["DiffusionActionHead", "ACTHead"]
|
||||
|
||||
from .debug import DebugHead
|
||||
|
||||
__all__ = ["DebugHead"]
|
||||
33
roboimi/vla/models/heads/debug.py
Normal file
33
roboimi/vla/models/heads/debug.py
Normal file
@@ -0,0 +1,33 @@
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from typing import Dict, Optional
|
||||
from roboimi.vla.core.interfaces import VLAHead
|
||||
|
||||
class DebugHead(VLAHead):
    """Stand-in action head trained with plain MSE regression.

    Substitutes for heavyweight Diffusion/ACT policies so the overall
    architecture can be verified end to end.
    """

    def __init__(self, input_dim: int, action_dim: int, chunk_size: int = 16):
        super().__init__()
        self.action_dim = action_dim
        self.chunk_size = chunk_size
        # One linear map: pooled embedding -> flattened action chunk.
        self.regressor = nn.Linear(input_dim, chunk_size * action_dim)
        self.loss_fn = nn.MSELoss()

    def forward(self, embeddings: torch.Tensor, actions: Optional[torch.Tensor] = None) -> Dict[str, torch.Tensor]:
        # Collapse the sequence axis by averaging: (B, S, H) -> (B, H).
        pooled = embeddings.mean(dim=1)

        # (B, chunk_size * action_dim) reshaped to (B, chunk_size, action_dim).
        flat = self.regressor(pooled)
        preds = flat.view(-1, self.chunk_size, self.action_dim)

        result = {"pred_actions": preds}
        if actions is not None:
            # Ground-truth targets supplied: report the regression loss too.
            result["loss"] = self.loss_fn(preds, actions)
        return result
|
||||
@@ -1,5 +1,9 @@
|
||||
# Projector models
|
||||
from .mlp import MLPProjector
|
||||
from .perceiver import PerceiverResampler
|
||||
# from .mlp import MLPProjector
|
||||
# from .perceiver import PerceiverResampler
|
||||
|
||||
__all__ = ["MLPProjector", "PerceiverResampler"]
|
||||
# __all__ = ["MLPProjector", "PerceiverResampler"]
|
||||
|
||||
from .mlp import MLPProjector
|
||||
|
||||
__all__ = ["MLPProjector"]
|
||||
@@ -1 +1,19 @@
|
||||
# MLP Projector implementation
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from roboimi.vla.core.interfaces import VLAProjector
|
||||
|
||||
class MLPProjector(VLAProjector):
    """Two-layer MLP that adapts the backbone width to the head width.

    First-class citizen: Adapts Backbone dim -> Head dim.
    """

    def __init__(self, input_dim: int, output_dim: int):
        super().__init__()
        layers = [
            nn.Linear(input_dim, output_dim),
            nn.GELU(),
            nn.Linear(output_dim, output_dim),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        projected = self.net(x)
        return projected
|
||||
Reference in New Issue
Block a user