# @package agent
# NOTE: the package directive above must stay on the first line of the file.
# Hydra stops reading header directives at the first non-header line, so do
# not put a document-start marker or an ordinary comment before it.

# Sub-configs composed into this node. `group@key` mounts the chosen option
# under `key`; `_self_` is listed last so the values written in this file
# override the ones coming from the selected group options.
defaults:
  - /backbone@vision_backbone: siglip2_diffusion
  - /modules@state_encoder: identity_state_encoder
  - /modules@action_encoder: identity_action_encoder
  - /modules@cond_projector: linear_condition_projector
  - /head: imf_transformer1d
  - _self_

# Dotted import path of the object this config describes.
_target_: roboimi.vla.agent_imf.IMFVLAAgent

# Scalar agent settings. Several of them are referenced further down through
# `${agent.<name>}` interpolations, so renaming one requires updating those.
action_dim: 16
obs_dim: 16
normalization_type: "min_max"
pred_horizon: 16
obs_horizon: 2
num_action_steps: 8
# Taken from the `data` config node, which is defined outside this file.
camera_names: ${data.camera_names}
# `len` is not one of OmegaConf's built-in resolvers; it has to be registered
# by the application before this config is resolved.
num_cams: ${len:${agent.camera_names}}

# Overrides applied on top of the selected /backbone option.
vision_backbone:
  num_cameras: ${agent.num_cams}
  camera_names: ${agent.camera_names}

# Overrides applied on top of the selected cond_projector module; its output
# width is tied to the conditioning width expected by the head.
cond_projector:
  output_dim: ${agent.head.cond_dim}

# NOTE(review): presumably the number of training diffusion steps versus the
# number of sampling steps at inference -- confirm against IMFVLAAgent.
diffusion_steps: 100
inference_steps: 1
head_type: "transformer"

# Overrides applied on top of the selected /head option. The dimension and
# horizon fields mirror the agent-level values above via interpolation.
head:
  input_dim: ${agent.action_dim}
  output_dim: ${agent.action_dim}
  horizon: ${agent.pred_horizon}
  n_obs_steps: ${agent.obs_horizon}
  # Referenced by cond_projector.output_dim; change it here only.
  cond_dim: 384
  causal_attn: false
  time_as_cond: true
  obs_as_cond: true
  n_cond_layers: 0
  backbone_type: attnres_full
  n_head: 1
  n_kv_head: 1