# @package agent
# File: roboimi/roboimi/vla/conf/agent/resnet_diffusion.yaml
#
# Agent configuration. The defaults list below composes the agent from a
# vision backbone, a state encoder, an action encoder and a head.
#
# NOTE: keep the `# @package` directive as the very first line. Hydra stops
# reading header directives at the first line that is not one, so neither a
# plain comment nor a `---` marker may precede it.
defaults:
  # Alternative vision backbone, currently disabled:
  # - /backbone@vision_backbone: resnet
  # `/group@key: option` selects `option` from config group `group` and places
  # it under `key` of this config.
  - /backbone@vision_backbone: resnet_diffusion
  - /modules@state_encoder: identity_state_encoder
  - /modules@action_encoder: identity_action_encoder
  - /head: conditional_unet1d
  # `_self_` is listed last so that the values in this file are merged after
  # (and therefore override) the configs selected above.
  - _self_

# Dotted import path of the class built from this config node.
# NOTE(review): the remaining keys in this file are presumably passed to the
# constructor as keyword arguments; verify against VLAAgent.__init__.
_target_: roboimi.vla.agent.VLAAgent

# ====================
# Model dimension configuration
# ====================
action_dim: 16              # Action dimension (number of robot joints)
obs_dim: 16                 # Proprioceptive observation dimension (joint positions)

# ====================
# Normalization configuration
# ====================
normalization_type: "min_max" # "min_max" or "gaussian"

# ====================
# Time-step configuration
# ====================
pred_horizon: 8           # Number of future action steps to predict
obs_horizon: 2              # Number of past observation steps used as input
# With the values above the bound stated below evaluates to 8 - 2 + 1 = 7.
num_action_steps: 4         # Number of actions actually executed per inference (should be <= pred_horizon - obs_horizon + 1)

# ====================
# Camera configuration
# ====================
num_cams: 3                 # Number of cameras (r_vis, top, front)

# ====================
# Diffusion process configuration
# ====================
diffusion_steps: 100       # Number of diffusion training steps (DDPM)
inference_steps: 10         # Number of denoising steps at inference (DDIM, fixed at 10)