Files
roboimi/roboimi/vla/conf/backbone/resnet_diffusion.yaml

51 lines
2.0 KiB
YAML
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
_target_: roboimi.vla.models.backbones.resnet_diffusion.ResNetDiffusionBackbone

# ====================
# Backbone selection
# ====================
# torchvision model name: resnet18, resnet34, resnet50
vision_backbone: 'resnet18'
# Use ImageNet pretrained weights (torchvision>=0.13)
pretrained_backbone_weights: 'IMAGENET1K_V1'
# Backbone variant: resnet | attnres_resnet
vision_backbone_mode: 'resnet'
# ====================
# Freeze settings
# ====================
# Freeze the ResNet parameters and train only the pool and out layers that
# follow it (recommended: true).
freeze_backbone: true
# ====================
# Input configuration
# ====================
# Input image shape (C, H, W) - ImageNet standard size
input_shape: [3, 224, 224]
# Image shape after cropping (H, W) - set to null to disable cropping
crop_shape: null
# Random crop during training, center crop during evaluation
# (has no effect when crop_shape=null)
crop_is_random: true
# ====================
# Normalization and feature extraction
# ====================
# Use GroupNorm instead of BatchNorm (better suited to small-batch training)
use_group_norm: true
# Number of Spatial Softmax keypoints
spatial_softmax_num_keypoints: 32
# ====================
# Encoder mode
# ====================
# false: shared encoder (all cameras share one ResNet; fewer parameters but
#        limited capacity) - marked as recommended
# true:  separate encoders (each camera has its own ResNet; more parameters
#        but larger capacity)
# NOTE(review): the value below is true although the comment above marks the
# shared encoder (false) as recommended - confirm this is intentional.
use_separate_rgb_encoder_per_camera: true
# false: concatenate all camera features into a single conditioning token
# true:  each camera outputs its own independent token
output_tokens_per_camera: false
# Number of cameras
num_cameras: 3
# ====================
# Full-AttnRes vision trunk
# (takes effect when vision_backbone_mode=attnres_resnet)
# ====================
attnres_stem_dim: 64
# The attnres_stage_* lists below each have four entries - presumably one per
# stage; TODO confirm against the backbone implementation.
attnres_stage_dims: [64, 128, 256, 512]
attnres_stage_depths: [2, 2, 2, 2]
attnres_stage_heads: [4, 4, 8, 8]
attnres_stage_kv_heads: [1, 1, 1, 1]
attnres_stage_window_sizes: [7, 7, 7, 7]
attnres_dropout: 0.0
attnres_ffn_mult: 2.667
attnres_eps: 1.0e-06
attnres_rope_theta: 10000.0