Files
roboimi/roboimi/vla/conf/agent/siglip2_imf_attnres.yaml

45 lines
1011 B
YAML

# @package agent
# Keep the "@package" directive above at the very top of the file: Hydra reads
# it from the file header to place this config under the `agent` node.
#
# Defaults list. Each `/<group>@<package>: <option>` entry selects <option>
# from the absolute config group <group> and mounts it at <package>, relative
# to this file's package (so `vision_backbone` ends up at `agent.vision_backbone`).
defaults:
  - /backbone@vision_backbone: siglip2_diffusion
  - /modules@state_encoder: identity_state_encoder
  - /modules@action_encoder: identity_action_encoder
  - /modules@cond_projector: linear_condition_projector
  - /head: imf_transformer1d
  # `_self_` is last, so the values written in this file override the
  # values contributed by the entries above.
  - _self_
# Class instantiated by Hydra for this config node.
_target_: roboimi.vla.agent_imf.IMFVLAAgent

# Action / observation vector sizes and the normalization scheme name.
action_dim: 16
obs_dim: 16
normalization_type: 'min_max'

# Horizon settings. Presumably counted in timesteps: TODO confirm against
# IMFVLAAgent.
pred_horizon: 16
obs_horizon: 2
num_action_steps: 8

# Camera list is inherited from the data config; the camera count is derived
# from it. NOTE(review): `len` is not a built-in OmegaConf resolver, so it must
# be registered by the project before this config is resolved.
camera_names: ${data.camera_names}
num_cams: ${len:${agent.camera_names}}
# Overrides for the backbone config mounted at `agent.vision_backbone`.
# These keys must be nested: at the top level `camera_names` would collide
# with the agent-level `camera_names` key and turn into a self-referencing
# interpolation, and `vision_backbone` itself would be null.
vision_backbone:
  num_cameras: ${agent.num_cams}
  camera_names: ${agent.camera_names}
# Override for the condition projector mounted at `agent.cond_projector`.
# Its output width is tied to the head's `cond_dim` so the two stay in sync.
# `output_dim` must be nested here; left at the top level it would leave
# `cond_projector` null and clash with the head's own `output_dim` key.
cond_projector:
  output_dim: ${agent.head.cond_dim}
# Step counts and head selector.
# NOTE(review): treated as agent-level keys (siblings of `head`), not as part
# of `cond_projector`; confirm against the IMFVLAAgent constructor.
diffusion_steps: 100
inference_steps: 1
head_type: 'transformer'
# Overrides for the head config mounted at `agent.head`.
# All keys below must be nested under `head`: `cond_projector.output_dim`
# reads `${agent.head.cond_dim}`, which only resolves when `cond_dim` lives
# here, and a top-level `output_dim` would duplicate another key.
head:
  # Sizes and horizons are tied to the agent-level values via interpolation.
  input_dim: ${agent.action_dim}
  output_dim: ${agent.action_dim}
  horizon: ${agent.pred_horizon}
  n_obs_steps: ${agent.obs_horizon}

  # Width of the conditioning vector; also referenced by the cond projector.
  cond_dim: 384

  # Conditioning / attention switches.
  causal_attn: false
  time_as_cond: true
  obs_as_cond: true
  n_cond_layers: 0

  # Backbone variant and attention head counts.
  # NOTE(review): valid `backbone_type` values are defined by the head
  # implementation, which is not visible here. Confirm `attnres_full` there.
  backbone_type: attnres_full
  n_head: 1
  n_kv_head: 1