# Configuration for the ResNet-based vision backbone.
# `_target_` names the class this config is meant to construct.
_target_: roboimi.vla.models.backbones.resnet_diffusion.ResNetDiffusionBackbone

# ====================
# Backbone selection
# ====================
# torchvision model name: resnet18, resnet34, resnet50
vision_backbone: "resnet18"
# Use ImageNet-pretrained weights (torchvision>=0.13)
pretrained_backbone_weights: "IMAGENET1K_V1"
vision_backbone_mode: "resnet"  # resnet | attnres_resnet

# ====================
# Freezing
# ====================
# Freeze the ResNet parameters and train only the pool and out layers that
# follow it (recommended: true)
freeze_backbone: true

# ====================
# Input configuration
# ====================
# Input image shape (C, H, W) - standard ImageNet size
input_shape: [3, 224, 224]
# Image shape after cropping (H, W) - set to null to disable cropping
crop_shape: null
# Random crop during training, center crop during evaluation
# (has no effect when crop_shape is null)
crop_is_random: true

# ====================
# Normalization and feature extraction
# ====================
# Use GroupNorm instead of BatchNorm (better suited to small-batch training)
use_group_norm: true
spatial_softmax_num_keypoints: 32  # number of Spatial Softmax keypoints

# ====================
# Encoder mode
# ====================
# false: shared encoder (all cameras share one ResNet; fewer parameters but
#        limited capacity) - recommended
# true:  separate encoders (each camera has its own ResNet; more parameters
#        but more capacity)
# NOTE(review): the comment above recommends false, but this config sets
# true - confirm this is intentional.
use_separate_rgb_encoder_per_camera: true
# false: concatenate all camera features into a single conditioning token
# true:  each camera outputs its own independent token
output_tokens_per_camera: false
num_cameras: 3  # number of cameras

# ====================
# Full-AttnRes vision trunk
# (takes effect when vision_backbone_mode=attnres_resnet)
# ====================
attnres_stem_dim: 64
attnres_stage_dims: [64, 128, 256, 512]
attnres_stage_depths: [2, 2, 2, 2]
attnres_stage_heads: [4, 4, 8, 8]
attnres_stage_kv_heads: [1, 1, 1, 1]
attnres_stage_window_sizes: [7, 7, 7, 7]
attnres_dropout: 0.0
attnres_ffn_mult: 2.667
attnres_eps: 1.0e-06
attnres_rope_theta: 10000.0