feat: add vision transfer backbones and IMF variants
This commit is contained in:
@@ -180,6 +180,14 @@ def _extract_camera_markers(cond, feature_dim, num_cams):
|
||||
return camera_block[:, 0]
|
||||
|
||||
|
||||
def _extract_token_camera_markers(tokens):
|
||||
return tokens[0, 0, :, 0]
|
||||
|
||||
|
||||
def _extract_token_markers(token_sequence):
|
||||
return token_sequence[0, 0, :, 0]
|
||||
|
||||
|
||||
class ResNetTransformerAgentWiringTest(unittest.TestCase):
|
||||
def test_hydra_wiring_uses_required_three_camera_transformer_conditioning_in_agent_order_and_ignores_extra_keys(self):
|
||||
cfg = _compose_cfg(
|
||||
@@ -246,6 +254,36 @@ class ResNetTransformerAgentWiringTest(unittest.TestCase):
|
||||
with self.assertRaisesRegex(ValueError, 'missing=.*top'):
|
||||
agent.predict_action(missing_images, proprioception)
|
||||
|
||||
def test_multitoken_resnet_backbone_emits_one_token_per_camera_in_agent_order(self):
|
||||
cfg = _compose_cfg(
|
||||
overrides=[
|
||||
'agent=resnet_imf_attnres_multitoken',
|
||||
'agent.vision_backbone.pretrained_backbone_weights=null',
|
||||
'agent.vision_backbone.input_shape=[3,16,16]',
|
||||
]
|
||||
)
|
||||
|
||||
with _stub_optional_modules():
|
||||
backbone = instantiate(cfg.agent.vision_backbone)
|
||||
_patch_backbone_for_order_tracking(backbone)
|
||||
images = _make_images(
|
||||
batch_size=1,
|
||||
obs_horizon=cfg.agent.obs_horizon,
|
||||
image_shape=tuple(cfg.agent.vision_backbone.input_shape),
|
||||
per_camera_fill={
|
||||
'front': 30.0,
|
||||
'top': 20.0,
|
||||
'r_vis': 10.0,
|
||||
'left_wrist': 99.0,
|
||||
},
|
||||
)
|
||||
tokens = backbone(images)
|
||||
|
||||
self.assertEqual(tokens.shape, (1, cfg.agent.obs_horizon, 3, backbone.output_dim))
|
||||
self.assertEqual(backbone.tokens_per_step, 3)
|
||||
camera_markers = _extract_token_camera_markers(tokens)
|
||||
self.assertTrue(torch.allclose(camera_markers, torch.tensor([10.0, 20.0, 30.0])))
|
||||
|
||||
def test_agent_rejects_conflicting_explicit_backbone_camera_names(self):
|
||||
cfg = _compose_cfg(
|
||||
overrides=[
|
||||
@@ -382,6 +420,36 @@ class ResNetTransformerAgentWiringTest(unittest.TestCase):
|
||||
with self.assertRaisesRegex(InstantiationException, 'num_cams'):
|
||||
instantiate(cfg.agent)
|
||||
|
||||
def test_multitoken_resnet_backbone_emits_one_token_per_camera_in_agent_order(self):
|
||||
cfg = _compose_cfg(
|
||||
overrides=[
|
||||
'agent=resnet_imf_attnres_multitoken',
|
||||
'agent.vision_backbone.pretrained_backbone_weights=null',
|
||||
'agent.vision_backbone.input_shape=[3,16,16]',
|
||||
'agent.head.n_layer=1',
|
||||
'agent.head.n_emb=32',
|
||||
]
|
||||
)
|
||||
|
||||
with _stub_optional_modules():
|
||||
backbone = instantiate(cfg.agent.vision_backbone)
|
||||
_patch_backbone_for_order_tracking(backbone)
|
||||
images = _make_images(
|
||||
batch_size=1,
|
||||
obs_horizon=cfg.agent.obs_horizon,
|
||||
image_shape=tuple(cfg.agent.vision_backbone.input_shape),
|
||||
per_camera_fill={
|
||||
'front': 30.0,
|
||||
'top': 20.0,
|
||||
'r_vis': 10.0,
|
||||
},
|
||||
)
|
||||
output = backbone(images)
|
||||
|
||||
self.assertEqual(output.shape, (1, cfg.agent.obs_horizon, 3, backbone.output_dim))
|
||||
token_markers = _extract_token_markers(output)
|
||||
self.assertTrue(torch.allclose(token_markers, torch.tensor([10.0, 20.0, 30.0])))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
||||
|
||||
Reference in New Issue
Block a user