refactor: 重构resnet
This commit is contained in:
@@ -1,6 +1,7 @@
|
|||||||
# @package agent
|
# @package agent
|
||||||
defaults:
|
defaults:
|
||||||
- /backbone@vision_backbone: resnet
|
# - /backbone@vision_backbone: resnet
|
||||||
|
- /backbone@vision_backbone: resnet_diffusion
|
||||||
- /modules@state_encoder: identity_state_encoder
|
- /modules@state_encoder: identity_state_encoder
|
||||||
- /modules@action_encoder: identity_action_encoder
|
- /modules@action_encoder: identity_action_encoder
|
||||||
- /head: conditional_unet1d
|
- /head: conditional_unet1d
|
||||||
@@ -16,8 +17,6 @@ obs_dim: 16
|
|||||||
pred_horizon: 16
|
pred_horizon: 16
|
||||||
obs_horizon: 2
|
obs_horizon: 2
|
||||||
|
|
||||||
# Diffusion Parameters
|
|
||||||
# diffusion_steps: 100 (这些参数应该移到 head 配置中,或者通过变量传递)
|
|
||||||
|
|
||||||
# Camera Configuration
|
# Camera Configuration
|
||||||
num_cams: ${len:${data.camera_names}} # 自动从 data.camera_names 列表长度获取
|
num_cams: ${len:${data.camera_names}} # 自动从 data.camera_names 列表长度获取
|
||||||
9
roboimi/vla/conf/backbone/resnet_diffusion.yaml
Normal file
9
roboimi/vla/conf/backbone/resnet_diffusion.yaml
Normal file
@@ -0,0 +1,9 @@
|
|||||||
|
# ResNet + SpatialSoftmax vision backbone for diffusion-style policies.
_target_: roboimi.vla.models.backbones.resnet_diffusion.ResNetDiffusionBackbone

# torchvision model name used as the visual trunk
vision_backbone: "resnet18"

# torchvision weights identifier; null means train from scratch
pretrained_backbone_weights: null

# expected image shape (C, H, W) before cropping
input_shape: [3, 96, 96]

# (H, W) crop applied before encoding: random while training, center at eval
crop_shape: [84, 84]
crop_is_random: true

# replace every BatchNorm2d in the trunk with GroupNorm
use_group_norm: true

# SpatialSoftmax keypoint count K; per-camera feature dim is 2 * K
spatial_softmax_num_keypoints: 32

# if true, each camera stream gets its own encoder (created lazily on first forward)
use_separate_rgb_encoder_per_camera: true
|
||||||
289
roboimi/vla/models/backbones/resnet_diffusion.py
Normal file
289
roboimi/vla/models/backbones/resnet_diffusion.py
Normal file
@@ -0,0 +1,289 @@
|
|||||||
|
from roboimi.vla.core.interfaces import VLABackbone
|
||||||
|
import torch
|
||||||
|
import torch.nn as nn
|
||||||
|
import torch.nn.functional as F
|
||||||
|
import torchvision
|
||||||
|
import numpy as np
|
||||||
|
from typing import Callable, Optional, Tuple, Union
|
||||||
|
|
||||||
|
def _replace_submodules(
|
||||||
|
root_module: nn.Module, predicate: Callable[[nn.Module], bool], func: Callable[[nn.Module], nn.Module]
|
||||||
|
) -> nn.Module:
|
||||||
|
"""
|
||||||
|
Args:
|
||||||
|
root_module: 需要替换子模块的根模块
|
||||||
|
predicate: 接受一个模块作为参数,如果该模块需要被替换则返回 True。
|
||||||
|
func: 接受一个模块作为参数,并返回一个新的模块来替换它。
|
||||||
|
Returns:
|
||||||
|
子模块已被替换的根模块。
|
||||||
|
"""
|
||||||
|
if predicate(root_module):
|
||||||
|
return func(root_module)
|
||||||
|
|
||||||
|
replace_list = [k.split(".") for k, m in root_module.named_modules(remove_duplicate=True) if predicate(m)]
|
||||||
|
for *parents, k in replace_list:
|
||||||
|
parent_module = root_module
|
||||||
|
if len(parents) > 0:
|
||||||
|
parent_module = root_module.get_submodule(".".join(parents))
|
||||||
|
if isinstance(parent_module, nn.Sequential):
|
||||||
|
src_module = parent_module[int(k)]
|
||||||
|
else:
|
||||||
|
src_module = getattr(parent_module, k)
|
||||||
|
tgt_module = func(src_module)
|
||||||
|
if isinstance(parent_module, nn.Sequential):
|
||||||
|
parent_module[int(k)] = tgt_module
|
||||||
|
else:
|
||||||
|
setattr(parent_module, k, tgt_module)
|
||||||
|
# 验证所有 BN 是否已被替换
|
||||||
|
assert not any(predicate(m) for _, m in root_module.named_modules(remove_duplicate=True))
|
||||||
|
return root_module
|
||||||
|
|
||||||
|
class SpatialSoftmax(nn.Module):
    """Spatial soft-argmax from "Deep Spatial Autoencoders for Visuomotor
    Learning" by Finn et al. (https://huggingface.co/papers/1509.06113).

    Minimal port of the robomimic implementation.
    """

    def __init__(self, input_shape, num_kp=None):
        """
        Args:
            input_shape (list): (C, H, W) shape of the incoming feature map.
            num_kp (int): Number of keypoints to emit. When None, the output
                keeps the input channel count.
        """
        super().__init__()

        assert len(input_shape) == 3
        self._in_c, self._in_h, self._in_w = input_shape

        # Optional 1x1 conv projecting the channel count down to num_kp.
        if num_kp is None:
            self.nets = None
            self._out_c = self._in_c
        else:
            self.nets = torch.nn.Conv2d(self._in_c, num_kp, kernel_size=1)
            self._out_c = num_kp

        # np.linspace is used instead of torch.linspace on purpose: the two
        # differ very slightly and the numpy variant matches pretrained models
        # (switching causes a small pc_success drop).
        pos_x, pos_y = np.meshgrid(np.linspace(-1.0, 1.0, self._in_w), np.linspace(-1.0, 1.0, self._in_h))
        n_pix = self._in_h * self._in_w
        grid = torch.cat(
            [
                torch.from_numpy(pos_x.reshape(n_pix, 1)).float(),
                torch.from_numpy(pos_y.reshape(n_pix, 1)).float(),
            ],
            dim=1,
        )
        # Registered as a buffer so it follows the module across devices.
        self.register_buffer("pos_grid", grid)

    def forward(self, features: torch.Tensor) -> torch.Tensor:
        """
        Args:
            features: (B, C, H, W) input feature map.

        Returns:
            (B, K, 2) expected image-space coordinates of the keypoints.
        """
        if self.nets is not None:
            features = self.nets(features)

        # Flatten spatial dims: [B, K, H, W] -> [B * K, H * W].
        flat = features.reshape(-1, self._in_h * self._in_w)
        # Softmax over pixels gives one attention map per keypoint.
        attention = flat.softmax(dim=-1)
        # Expected (x, y): [B * K, H * W] @ [H * W, 2] -> [B * K, 2].
        expected_xy = attention @ self.pos_grid
        # Reshape back to [B, K, 2].
        return expected_xy.view(-1, self._out_c, 2)
|
||||||
|
|
||||||
|
class ResNetDiffusionBackbone(VLABackbone):
    """ResNet + SpatialSoftmax image encoder for diffusion-style policies.

    Each camera image is (optionally) cropped, passed through a torchvision
    ResNet trunk (with BatchNorm2d optionally converted to GroupNorm), pooled
    with SpatialSoftmax, and projected to a fixed-size feature vector.
    Cameras either share one encoder or each get their own.
    """

    def __init__(
        self,
        vision_backbone: str = "resnet18",
        pretrained_backbone_weights: str | None = None,
        input_shape: Tuple[int, int, int] = (3, 84, 84),  # (C, H, W)
        crop_shape: Optional[Tuple[int, int]] = None,
        crop_is_random: bool = True,
        use_group_norm: bool = True,
        spatial_softmax_num_keypoints: int = 32,
        use_separate_rgb_encoder_per_camera: bool = True,
    ):
        """
        Args:
            vision_backbone: torchvision model name (e.g. "resnet18").
            pretrained_backbone_weights: torchvision weights id; None trains
                from scratch.
            input_shape: (C, H, W) of incoming images.
            crop_shape: (H, W) crop applied before encoding; None disables it.
            crop_is_random: use RandomCrop while training (a CenterCrop is
                always used in eval mode).
            use_group_norm: replace every BatchNorm2d in the trunk with
                GroupNorm.
            spatial_softmax_num_keypoints: keypoint count K; per-camera feature
                dimension is 2 * K.
            use_separate_rgb_encoder_per_camera: give each camera its own
                encoder, created lazily on the first forward pass.
        """
        super().__init__()

        self.vision_backbone = vision_backbone
        self.pretrained_backbone_weights = pretrained_backbone_weights
        self.input_shape = input_shape
        self.crop_shape = crop_shape
        self.crop_is_random = crop_is_random
        self.use_group_norm = use_group_norm
        self.spatial_softmax_num_keypoints = spatial_softmax_num_keypoints
        self.use_separate_rgb_encoder_per_camera = use_separate_rgb_encoder_per_camera

        # Optional pre-processing: random crop while training, center crop at eval.
        if crop_shape is not None:
            self.do_crop = True
            self.center_crop = torchvision.transforms.CenterCrop(crop_shape)
            self.maybe_random_crop = (
                torchvision.transforms.RandomCrop(crop_shape) if crop_is_random else self.center_crop
            )
        else:
            self.do_crop = False
            self.crop_shape = input_shape[1:]

        # Per-camera feature dimension (x and y per keypoint).
        self.feature_dim = spatial_softmax_num_keypoints * 2

        if self.use_separate_rgb_encoder_per_camera:
            # Encoders are created lazily in forward(), once the camera names
            # are known.
            # NOTE(review): parameters created after the optimizer is built
            # will not be trained — when using this mode, build the optimizer
            # after the first forward pass (or pre-build the encoders).
            self.camera_encoders = None
        else:
            # All cameras share one trunk + pooling + projection.
            self.backbone = self._build_backbone()
            self.pool = SpatialSoftmax(
                self._feature_map_shape(self.backbone), num_kp=spatial_softmax_num_keypoints
            )
            self.out = nn.Linear(spatial_softmax_num_keypoints * 2, self.feature_dim)
            self.relu = nn.ReLU()

    def _build_backbone(self) -> nn.Sequential:
        """Build the ResNet trunk: avgpool/fc removed, optional GroupNorm swap."""
        model = getattr(torchvision.models, self.vision_backbone)(
            weights=self.pretrained_backbone_weights
        )
        # Drop the final AvgPool and FC; keep the conv trunk through layer4.
        backbone = nn.Sequential(*list(model.children())[:-2])
        if self.use_group_norm:
            backbone = _replace_submodules(
                root_module=backbone,
                predicate=lambda x: isinstance(x, nn.BatchNorm2d),
                func=lambda x: nn.GroupNorm(num_groups=x.num_features // 16, num_channels=x.num_features),
            )
        return backbone

    def _feature_map_shape(self, backbone: nn.Module) -> torch.Size:
        """Dry-run the trunk on a zero image to discover its output (C, H, W)."""
        with torch.no_grad():
            dummy = torch.zeros(1, self.input_shape[0], *self.crop_shape)
            return backbone(dummy).shape[1:]

    def _create_single_encoder(self) -> nn.ModuleList:
        """Create one full encoder: trunk + SpatialSoftmax + projection + ReLU."""
        backbone = self._build_backbone()
        pool = SpatialSoftmax(
            self._feature_map_shape(backbone), num_kp=self.spatial_softmax_num_keypoints
        )
        out = nn.Linear(self.spatial_softmax_num_keypoints * 2, self.feature_dim)
        return nn.ModuleList([backbone, pool, out, nn.ReLU()])

    def forward_single_image(self, x: torch.Tensor, encoder: Optional[nn.ModuleList] = None) -> torch.Tensor:
        """Encode a (N, C, H, W) image batch into (N, feature_dim) features.

        Args:
            x: image batch, shape (N, C, H, W).
            encoder: per-camera encoder; required when
                ``use_separate_rgb_encoder_per_camera`` is True, ignored
                otherwise.
        """
        if self.do_crop:
            # Random crop only in training mode; deterministic crop at eval.
            x = self.maybe_random_crop(x) if self.training else self.center_crop(x)

        if self.use_separate_rgb_encoder_per_camera:
            backbone, pool, out, relu = encoder
            x = relu(out(torch.flatten(pool(backbone(x)), start_dim=1)))
        else:
            x = self.relu(self.out(torch.flatten(self.pool(self.backbone(x)), start_dim=1)))
        return x

    def forward(self, images):
        """Encode a dict of camera streams.

        Args:
            images: mapping of camera name -> tensor of shape (B, T, C, H, W).

        Returns:
            (B, T, num_cameras * feature_dim) tensor; camera features are
            concatenated in sorted camera-name order.
        """
        any_tensor = next(iter(images.values()))
        B, T = any_tensor.shape[:2]

        # Lazily build per-camera encoders on first use, on the input's device
        # (creating them on CPU would crash when the inputs live on GPU).
        if self.use_separate_rgb_encoder_per_camera and self.camera_encoders is None:
            self.camera_encoders = nn.ModuleDict()
            for cam_name in sorted(images.keys()):
                self.camera_encoders[cam_name] = self._create_single_encoder().to(any_tensor.device)

        features_all = []
        for cam_name in sorted(images.keys()):
            img = images[cam_name]
            flat = img.view(B * T, *img.shape[2:])  # fold time into the batch dim
            encoder = (
                self.camera_encoders[cam_name] if self.use_separate_rgb_encoder_per_camera else None
            )
            features_all.append(self.forward_single_image(flat, encoder))

        return torch.cat(features_all, dim=1).view(B, T, -1)

    @property
    def output_dim(self):
        """Per-camera feature dimension (2 * spatial_softmax_num_keypoints)."""
        return self.feature_dim
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Smoke test: build the backbone and push a dummy two-camera batch through it.
    print("🚀 Testing ResNetDiffusionBackbone...")

    # Configuration
    batch, horizon = 2, 5
    channels, height, width = 3, 96, 96
    crop = (84, 84)
    n_keypoints = 32
    per_cam_dim = n_keypoints * 2

    # Instantiate model (no pretrained weights, to keep the test fast)
    model = ResNetDiffusionBackbone(
        vision_backbone="resnet18",
        pretrained_backbone_weights=None,
        input_shape=(channels, height, width),
        crop_shape=crop,
        crop_is_random=True,
        use_group_norm=True,
        spatial_softmax_num_keypoints=n_keypoints,
    )

    print(f"✅ Model instantiated. Output dim per camera: {model.output_dim}")

    # Dummy input: two camera streams of shape (B, T, C, H, W)
    dummy_images = {
        "cam_high": torch.randn(batch, horizon, channels, height, width),
        "cam_wrist": torch.randn(batch, horizon, channels, height, width),
    }

    print("🔄 Running forward pass...")
    result = model(dummy_images)

    print(f"Input shapes: {[v.shape for v in dummy_images.values()]}")
    print(f"Output shape: {result.shape}")

    # Verify the concatenated per-camera features have the expected width.
    want = len(dummy_images) * per_cam_dim
    assert result.shape == (batch, horizon, want), f"Expected shape {(batch, horizon, want)}, got {result.shape}"

    print("✨ Test passed!")
|
||||||
Reference in New Issue
Block a user