feat: add held-out validation and dual-decoder lewm imf

This commit is contained in:
Logic
2026-04-17 19:26:56 +08:00
parent 74f4963613
commit 395f5a1645
8 changed files with 693 additions and 86 deletions

View File

@@ -388,6 +388,26 @@ class _StubFutureTokenPredictor(nn.Module):
return summary.repeat(1, self.num_future_tokens, 1)
class _RecordingDirectFutureDecoder(nn.Module):
def __init__(self):
super().__init__()
self.scale = nn.Parameter(torch.tensor(0.5))
self.calls = []
def forward(self, sample, r, t, cond=None):
record = {
'sample': sample.detach().clone(),
'r': r.detach().clone(),
't': t.detach().clone(),
'cond': None if cond is None else cond.detach().clone(),
}
self.calls.append(record)
cond_term = 0.0
if cond is not None:
cond_term = cond.mean(dim=1, keepdim=True)
return self.scale * sample + cond_term
class _RecordingSigReg(nn.Module):
def __init__(self, value=0.5):
super().__init__()
@@ -687,6 +707,148 @@ class IMFVLAAgentTest(unittest.TestCase):
)
torch.testing.assert_close(sigreg.calls[0], expected_lewm_history.transpose(0, 1))
def test_predict_action_with_dual_decoder_keeps_action_condition_history_only(self):
agent_cls, agent_module = _load_imf_agent_class()
head = _RecordingLinearIMFHead()
future_decoder = _RecordingDirectFutureDecoder()
agent = agent_cls(
vision_backbone=_StubVisionBackbone(),
state_encoder=nn.Identity(),
action_encoder=nn.Identity(),
head=head,
future_decoder=future_decoder,
action_dim=2,
obs_dim=1,
pred_horizon=3,
obs_horizon=2,
diffusion_steps=10,
inference_steps=1,
num_cams=len(_CAMERA_NAMES),
camera_names=_CAMERA_NAMES,
num_action_steps=2,
head_type='transformer',
lewm_history_horizon=3,
lewm_query_offsets=[8],
lewm_loss_weight=1.0,
)
agent.infer_scheduler = _ForbiddenScheduler()
with torch.no_grad():
agent.future_query_tokens.copy_(torch.tensor([[[0.1, 0.2, 0.3, 0.4]]], dtype=torch.float32))
images = _make_images(
batch_size=1,
obs_horizon=2,
per_camera_fill={'r_vis': 10.0, 'top': 20.0, 'front': 30.0},
)
qpos = torch.tensor([[[1.0], [2.0]]], dtype=torch.float32)
initial_noise = torch.tensor(
[[[1.0, -1.0], [0.0, 2.0], [3.0, -2.0]]],
dtype=torch.float32,
)
with mock.patch.object(agent_module.torch, 'randn', return_value=initial_noise):
_ = agent.predict_action(images, qpos)
expected_history = torch.tensor(
[[[10.0, 20.0, 30.0, 1.0], [10.0, 20.0, 30.0, 2.0]]],
dtype=torch.float32,
)
self.assertEqual(len(head.calls), 1)
self.assertTrue(torch.allclose(head.calls[0]['cond'], expected_history))
self.assertEqual(len(future_decoder.calls), 0)
def test_compute_loss_with_dual_decoder_tracks_lewm_loss_breakdown(self):
agent_cls, agent_module = _load_imf_agent_class()
head = _RecordingLinearIMFHead()
future_decoder = _RecordingDirectFutureDecoder()
sigreg = _RecordingSigReg(value=0.75)
agent = agent_cls(
vision_backbone=_StubVisionBackbone(),
state_encoder=nn.Identity(),
action_encoder=nn.Identity(),
head=head,
future_decoder=future_decoder,
action_dim=2,
obs_dim=1,
pred_horizon=3,
obs_horizon=2,
diffusion_steps=10,
inference_steps=1,
num_cams=len(_CAMERA_NAMES),
camera_names=_CAMERA_NAMES,
num_action_steps=2,
head_type='transformer',
lewm_history_horizon=3,
lewm_query_offsets=[8],
lewm_sigreg=sigreg,
lewm_sigreg_weight=0.09,
lewm_loss_weight=1.0,
)
with torch.no_grad():
agent.future_query_tokens.copy_(torch.tensor([[[0.2, 0.4, 0.6, 0.8]]], dtype=torch.float32))
images = _make_images(
batch_size=1,
obs_horizon=2,
per_camera_fill={'r_vis': 1.0, 'top': 2.0, 'front': 3.0},
)
qpos = torch.tensor([[[0.25], [0.75]]], dtype=torch.float32)
actions = torch.tensor(
[[[1.0, -1.0], [0.5, 0.25], [-0.5, 1.5]]],
dtype=torch.float32,
)
lewm_images = _make_images(
batch_size=1,
obs_horizon=3,
per_camera_fill={'r_vis': 1.0, 'top': 2.0, 'front': 3.0},
)
lewm_qpos = torch.tensor([[[0.1], [0.2], [0.3]]], dtype=torch.float32)
lewm_future_images = _make_images(
batch_size=1,
obs_horizon=1,
per_camera_fill={'r_vis': 1.0, 'top': 2.0, 'front': 3.0},
)
lewm_future_qpos = torch.tensor([[[0.4]]], dtype=torch.float32)
noise = torch.tensor(
[[[0.2, -0.4], [0.1, 0.3], [0.5, -0.2]]],
dtype=torch.float32,
)
t_sample = torch.tensor([0.8], dtype=torch.float32)
r_sample = torch.tensor([0.25], dtype=torch.float32)
with mock.patch.object(agent_module.torch, 'randn_like', return_value=noise), \
mock.patch.object(agent_module.torch, 'rand', side_effect=[t_sample, r_sample]):
loss = agent.compute_loss(
{
'images': images,
'qpos': qpos,
'action': actions,
'lewm_images': lewm_images,
'lewm_qpos': lewm_qpos,
'lewm_future_images': lewm_future_images,
'lewm_future_qpos': lewm_future_qpos,
}
)
metrics = agent.get_last_loss_breakdown()
self.assertAlmostEqual(loss.item(), metrics['loss'], places=6)
self.assertEqual(len(head.calls), 2)
self.assertEqual(head.calls[0]['cond'].shape, (1, 2, 4))
self.assertEqual(len(future_decoder.calls), 1)
self.assertEqual(future_decoder.calls[0]['cond'].shape, (1, 3, 4))
self.assertAlmostEqual(
metrics['loss'],
metrics['action_loss'] + metrics['lewm_loss'],
places=5,
)
self.assertAlmostEqual(
metrics['lewm_loss'],
metrics['lewm_pred_loss'] + 0.09 * metrics['lewm_sigreg_loss'],
places=5,
)
self.assertGreater(metrics['lewm_pred_loss'], 0.0)
self.assertAlmostEqual(metrics['lewm_sigreg_loss'], 0.75, places=6)
def test_select_action_only_regenerates_when_action_queue_is_empty(self):
agent, _head, _agent_module = self._make_agent(pred_horizon=4, obs_horizon=2, num_action_steps=2)
observation = {
@@ -1077,6 +1239,36 @@ class IMFVLAAgentTest(unittest.TestCase):
)
self.assertIsNotNone(agent.lewm_sigreg)
def test_hydra_config_instantiates_lewm_resnet_dual_decoder_imf_attnres(self):
cfg = _compose_cfg(
overrides=[
'agent=lewm_resnet_dual_decoder_imf_attnres',
'agent.head.n_layer=1',
'agent.head.n_emb=16',
'agent.future_decoder.n_layer=1',
'agent.future_decoder.n_emb=16',
'agent.lewm_query_offsets=[8]',
]
)
self.assertEqual(cfg.agent._target_, 'roboimi.vla.agent_imf.IMFVLAAgent')
self.assertEqual(cfg.agent.extra_condition_tokens, 0)
self.assertEqual(
cfg.agent.future_decoder._target_,
'roboimi.vla.models.heads.imf_transformer1d.IMFTransformer1D',
)
self.assertEqual(cfg.agent.head.cond_dim, 288)
self.assertEqual(cfg.agent.future_decoder.cond_dim, 288)
with _stub_optional_modules(include_imf_head=True):
agent = instantiate(cfg.agent)
self.assertEqual(agent.per_step_cond_dim, 288)
self.assertEqual(agent.condition_sequence_length, agent.obs_horizon)
self.assertEqual(agent.noise_pred_net.constructor_kwargs['n_obs_steps'], agent.obs_horizon)
self.assertEqual(agent.future_decoder.constructor_kwargs['cond_dim'], 288)
self.assertEqual(agent.future_query_tokens.shape, (1, 1, 288))
def test_hydra_config_instantiates_resnet_imf_attnres_multitoken_with_sequence_length_three_times_obs_horizon(self):
cfg = _compose_cfg(