diff --git a/roboimi/vla/conf/projector/mlp.yaml b/roboimi/vla/conf/projector/mlp.yaml
deleted file mode 100644
index d59eda2..0000000
--- a/roboimi/vla/conf/projector/mlp.yaml
+++ /dev/null
@@ -1,6 +0,0 @@
-_target_: roboimi.vla.models.projectors.MLPProjector
-
-input_dim: ??? # [fix] awaiting interpolation
-output_dim: ??? # [fix] awaiting interpolation
-hidden_dim: 1024
-dropout: 0.1
\ No newline at end of file
diff --git a/roboimi/vla/conf/projector/perceiver.yaml b/roboimi/vla/conf/projector/perceiver.yaml
deleted file mode 100644
index e69de29..0000000
diff --git a/roboimi/vla/core/base_policy.py b/roboimi/vla/core/base_policy.py
deleted file mode 100644
index b262417..0000000
--- a/roboimi/vla/core/base_policy.py
+++ /dev/null
@@ -1 +0,0 @@
-# define ActionHead(ABC)
diff --git a/roboimi/vla/core/base_vlm.py b/roboimi/vla/core/base_vlm.py
deleted file mode 100644
index e785c85..0000000
--- a/roboimi/vla/core/base_vlm.py
+++ /dev/null
@@ -1 +0,0 @@
-# define VLMBackbone(ABC)
diff --git a/roboimi/vla/models/backbones/siglip2.py b/roboimi/vla/models/backbones/siglip2.py
deleted file mode 100644
index a44997a..0000000
--- a/roboimi/vla/models/backbones/siglip2.py
+++ /dev/null
@@ -1,37 +0,0 @@
-from transformers import SiglipVisionModel
-from roboimi.vla.core.interfaces import VLABackbone
-from torchvision import transforms
-
-class SigLIP2(VLABackbone):
-    def __init__(
-        self,
-        model_name = "google/siglip2-base-patch16-384",
-        freeze: bool = True,
-    ):
-        super().__init__()
-
-        self.vision_model = SiglipVisionModel.from_pretrained(model_name)
-        self.transform = transforms.Compose([
-            transforms.Resize((384, 384), antialias=True),
-            transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
-        ])
-
-        if freeze:
-            self._freeze_parameters()
-
-    def _freeze_parameters(self):
-        print("❄️ Freezing Vision Backbone parameters")
-        for param in self.vision_model.parameters():
-            param.requires_grad = False
-        self.vision_model.eval()
-
-    def forward(
-        self,
-        images
-    ):
-        # images: (B, C, H, W), normalized to [0, 1]
-        images = self.transform(images) # normalize to [-1, 1]
-
-        outputs = self.vision_model(pixel_values=images)
-
-        return outputs.last_hidden_state
\ No newline at end of file
diff --git a/roboimi/vla/models/projectors/__init__.py b/roboimi/vla/models/projectors/__init__.py
deleted file mode 100644
index 1d0ccb1..0000000
--- a/roboimi/vla/models/projectors/__init__.py
+++ /dev/null
@@ -1,9 +0,0 @@
-# Projector models
-# from .mlp import MLPProjector
-# from .perceiver import PerceiverResampler
-
-# __all__ = ["MLPProjector", "PerceiverResampler"]
-
-from .mlp import MLPProjector
-
-__all__ = ["MLPProjector"]
\ No newline at end of file
diff --git a/roboimi/vla/models/projectors/mlp.py b/roboimi/vla/models/projectors/mlp.py
deleted file mode 100644
index 03655e0..0000000
--- a/roboimi/vla/models/projectors/mlp.py
+++ /dev/null
@@ -1,19 +0,0 @@
-import torch
-import torch.nn as nn
-from roboimi.vla.core.interfaces import VLAProjector
-
-class MLPProjector(VLAProjector):
-    """
-    A simple Linear Projection layer.
-    First-class citizen: Adapts Backbone dim -> Head dim.
-    """
-    def __init__(self, input_dim: int, output_dim: int):
-        super().__init__()
-        self.net = nn.Sequential(
-            nn.Linear(input_dim, output_dim),
-            nn.GELU(),
-            nn.Linear(output_dim, output_dim)
-        )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        return self.net(x)
\ No newline at end of file
diff --git a/roboimi/vla/models/projectors/perceiver.py b/roboimi/vla/models/projectors/perceiver.py
deleted file mode 100644
index de29008..0000000
--- a/roboimi/vla/models/projectors/perceiver.py
+++ /dev/null
@@ -1 +0,0 @@
-# Perceiver Resampler implementation
diff --git a/roboimi/vla/modules/__init__.py b/roboimi/vla/modules/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/roboimi/vla/modules/encoders.py b/roboimi/vla/modules/encoders.py
deleted file mode 100644
index 2d600d2..0000000
--- a/roboimi/vla/modules/encoders.py
+++ /dev/null
@@ -1,106 +0,0 @@
-# StateEncoder, ActionEncoder
-import torch
-from torch import nn
-import torch.nn.functional as F
-
-
-class MLP(nn.Module):
-    def __init__(
-        self,
-        input_dim,
-        hidden_dim,
-        output_dim
-    ):
-        super().__init__()
-        self.model = nn.Sequential(
-            nn.Linear(input_dim, hidden_dim),
-            nn.ReLU(),
-            nn.Linear(hidden_dim, output_dim)
-        )
-
-    def forward(
-        self,
-        input
-    ):
-        output = self.model(input)
-        return output
-
-
-
-class SinusoidalPositionalEncoding(nn.Module):
-    def __init__(
-        self,
-        embed_dim
-    ):
-        super().__init__()
-        self.embed_dim = embed_dim
-
-    def forward(self, timesteps):
-        timesteps = timesteps.float()
-        B, T = timesteps.shape
-        device = timesteps.device
-
-        half_dim = self.embed_dim // 2
-
-        exponent = -torch.arange(half_dim, dtype=torch.float, device=device) * (
-            torch.log(torch.tensor(10000.0)) / half_dim
-        )
-
-        freqs = timesteps.unsqueeze(-1) * exponent.exp()
-
-        sin = torch.sin(freqs)
-        cos = torch.cos(freqs)
-        enc = torch.cat([sin, cos], dim=-1) # (B, T, w)
-
-        return enc
-
-class ActionEncoder(nn.Module):
-    def __init__(
-        self,
-        action_dim,
-        embed_dim,
-
-    ):
-        super().__init__()
-        self.W1 = nn.Linear(action_dim, embed_dim)
-        self.W2 = nn.Linear(2 * action_dim, action_dim)
-        self.W3 = nn.Linear(embed_dim, embed_dim)
-        self.pos_encoder = SinusoidalPositionalEncoding(embed_dim)
-
-    def forward(
-        self,
-        actions,
-        timesteps
-    ):
-        B, T, _ = actions.shape
-        timesteps = timesteps.unsqueeze(1).expand(-1, T)
-
-        a_emb = self.W1(actions)
-        tau_emb = self.pos_encoder(timesteps).to(dtype=a_emb.dtype)
-        x = torch.cat([a_emb, tau_emb], dim=-1)
-        x = F.silu(self.W2(x))
-        x = self.W3(x)
-
-        return x
-
-
-class StateEncoder(nn.Module):
-    def __init__(
-        self,
-        state_dim,
-        hidden_dim,
-        embed_dim
-    ):
-        super().__init__()
-        self.mlp = MLP(
-            state_dim,
-            hidden_dim,
-            embed_dim
-        )
-
-    def forward(
-        self,
-        states
-    ):
-        state_emb = self.mlp(states)
-        return state_emb # [B, 1, embed_dim]
\ No newline at end of file
diff --git a/roboimi/vla/modules/fusion.py b/roboimi/vla/modules/fusion.py
deleted file mode 100644
index 7e0bba3..0000000
--- a/roboimi/vla/modules/fusion.py
+++ /dev/null
@@ -1 +0,0 @@
-# TransformerFusion, FiLM