roboimi/tests/test_imf_vla_agent.py

import contextlib
import importlib
import sys
import types
import unittest
from pathlib import Path
from unittest import mock

import torch
from hydra import compose, initialize_config_dir
from hydra.core.global_hydra import GlobalHydra
from hydra.utils import instantiate
from omegaconf import OmegaConf
from torch import nn


_REPO_ROOT = Path(__file__).resolve().parents[1]
_CONFIG_DIR = str((_REPO_ROOT / 'roboimi/vla/conf').resolve())
_MISSING = object()
_CAMERA_NAMES = ('r_vis', 'top', 'front')


class _FakeScheduler:
    def __init__(self, num_train_timesteps=100, **kwargs):
        self.config = types.SimpleNamespace(num_train_timesteps=num_train_timesteps)
        self.timesteps = []

    def add_noise(self, sample, noise, timestep):
        return sample + noise

    def set_timesteps(self, num_inference_steps):
        self.timesteps = list(range(num_inference_steps - 1, -1, -1))

    def step(self, noise_pred, timestep, sample):
        return types.SimpleNamespace(prev_sample=sample)


class _IdentityCrop:
    def __init__(self, size):
        self.size = size

    def __call__(self, x):
        return x


class _FakeResNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 8, kernel_size=3, padding=1)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv2d(8, 16, kernel_size=3, padding=1, stride=2)
        self.relu2 = nn.ReLU()
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(16, 16)

    def forward(self, x):
        x = self.relu1(self.conv1(x))
        x = self.relu2(self.conv2(x))
        x = self.avgpool(x)
        x = torch.flatten(x, start_dim=1)
        return self.fc(x)


class _FakeRearrange(nn.Module):
    def __init__(self, *args, **kwargs):
        super().__init__()

    def forward(self, x):
        return x


class _StubIMFHead(nn.Module):
    def __init__(
        self,
        input_dim,
        output_dim,
        horizon,
        n_obs_steps,
        cond_dim,
        **kwargs,
    ):
        super().__init__()
        self.constructor_kwargs = {
            'input_dim': input_dim,
            'output_dim': output_dim,
            'horizon': horizon,
            'n_obs_steps': n_obs_steps,
            'cond_dim': cond_dim,
            **kwargs,
        }
        self.proj = nn.Linear(input_dim, output_dim)
        self.cond_obs_emb = nn.Linear(cond_dim, max(cond_dim, 1))

    def forward(self, sample, r, t, cond=None):
        return torch.zeros_like(sample)

    def get_optim_groups(self, weight_decay):
        return [
            {'params': [self.proj.weight], 'weight_decay': weight_decay},
            {'params': [self.proj.bias, self.cond_obs_emb.weight, self.cond_obs_emb.bias], 'weight_decay': 0.0},
        ]


@contextlib.contextmanager
def _stub_optional_modules(include_imf_head=False):
    previous_modules = {}

    def inject(name, module):
        if name not in previous_modules:
            previous_modules[name] = sys.modules.get(name, _MISSING)
        sys.modules[name] = module

    diffusers_module = types.ModuleType('diffusers')
    schedulers_module = types.ModuleType('diffusers.schedulers')
    ddpm_module = types.ModuleType('diffusers.schedulers.scheduling_ddpm')
    ddim_module = types.ModuleType('diffusers.schedulers.scheduling_ddim')
    ddpm_module.DDPMScheduler = _FakeScheduler
    ddim_module.DDIMScheduler = _FakeScheduler
    diffusers_module.DDPMScheduler = _FakeScheduler
    diffusers_module.DDIMScheduler = _FakeScheduler
    diffusers_module.schedulers = schedulers_module
    schedulers_module.scheduling_ddpm = ddpm_module
    schedulers_module.scheduling_ddim = ddim_module

    torchvision_module = types.ModuleType('torchvision')
    models_module = types.ModuleType('torchvision.models')
    transforms_module = types.ModuleType('torchvision.transforms')
    models_module.resnet18 = lambda weights=None: _FakeResNet()
    transforms_module.CenterCrop = _IdentityCrop
    transforms_module.RandomCrop = _IdentityCrop
    torchvision_module.models = models_module
    torchvision_module.transforms = transforms_module

    einops_module = types.ModuleType('einops')
    einops_module.rearrange = lambda x, *args, **kwargs: x
    einops_layers_module = types.ModuleType('einops.layers')
    einops_layers_torch_module = types.ModuleType('einops.layers.torch')
    einops_layers_torch_module.Rearrange = _FakeRearrange
    einops_module.layers = einops_layers_module
    einops_layers_module.torch = einops_layers_torch_module

    try:
        inject('diffusers', diffusers_module)
        inject('diffusers.schedulers', schedulers_module)
        inject('diffusers.schedulers.scheduling_ddpm', ddpm_module)
        inject('diffusers.schedulers.scheduling_ddim', ddim_module)
        inject('torchvision', torchvision_module)
        inject('torchvision.models', models_module)
        inject('torchvision.transforms', transforms_module)
        inject('einops', einops_module)
        inject('einops.layers', einops_layers_module)
        inject('einops.layers.torch', einops_layers_torch_module)

        if include_imf_head:
            import roboimi.vla.models.heads as heads_package

            imf_head_module = types.ModuleType('roboimi.vla.models.heads.imf_transformer1d')
            imf_head_module.IMFTransformer1D = _StubIMFHead
            inject('roboimi.vla.models.heads.imf_transformer1d', imf_head_module)
            setattr(heads_package, 'imf_transformer1d', imf_head_module)

        yield
    finally:
        for name, previous in reversed(list(previous_modules.items())):
            if previous is _MISSING:
                sys.modules.pop(name, None)
            else:
                sys.modules[name] = previous


def _compose_cfg(overrides=None):
    if not OmegaConf.has_resolver('len'):
        OmegaConf.register_new_resolver('len', lambda x: len(x))

    GlobalHydra.instance().clear()
    with initialize_config_dir(version_base=None, config_dir=_CONFIG_DIR):
        return compose(config_name='config', overrides=list(overrides or []))


def _load_imf_agent_class():
    with _stub_optional_modules():
        sys.modules.pop('roboimi.vla.agent_imf', None)
        module = importlib.import_module('roboimi.vla.agent_imf')
    return module.IMFVLAAgent, module


class _StubVisionBackbone(nn.Module):
    output_dim = 1

    def __init__(self, camera_names=_CAMERA_NAMES):
        super().__init__()
        self.camera_names = tuple(camera_names)
        self.num_cameras = len(self.camera_names)

    def forward(self, images):
        per_camera_features = []
        for camera_name in self.camera_names:
            image_batch = images[camera_name]
            per_camera_features.append(image_batch.mean(dim=(2, 3, 4), keepdim=False).unsqueeze(-1))
        return torch.cat(per_camera_features, dim=-1)


class _RecordingLinearIMFHead(nn.Module):
    def __init__(self):
        super().__init__()
        self.scale = nn.Parameter(torch.tensor(0.5))
        self.calls = []

    @staticmethod
    def _broadcast_batch_time(value, reference):
        while value.ndim < reference.ndim:
            value = value.unsqueeze(-1)
        return value

    def forward(self, sample, r, t, cond=None):
        record = {
            'sample': sample.detach().clone(),
            'r': r.detach().clone(),
            't': t.detach().clone(),
            'cond': None if cond is None else cond.detach().clone(),
        }
        self.calls.append(record)
        cond_term = 0.0
        if cond is not None:
            cond_term = cond.mean(dim=(1, 2), keepdim=True)
        r_b = self._broadcast_batch_time(r, sample)
        t_b = self._broadcast_batch_time(t, sample)
        return self.scale * sample + r_b + 2.0 * t_b + cond_term


class _ForbiddenScheduler:
    def set_timesteps(self, *args, **kwargs):  # pragma: no cover - only runs on regression
        raise AssertionError('IMF inference should not use DDIM scheduler set_timesteps')

    def step(self, *args, **kwargs):  # pragma: no cover - only runs on regression
        raise AssertionError('IMF inference should not use DDIM scheduler step')


def _make_images(batch_size, obs_horizon, per_camera_fill):
    return {
        name: torch.full((batch_size, obs_horizon, 1, 2, 2), fill_value=value, dtype=torch.float32)
        for name, value in per_camera_fill.items()
    }


class IMFVLAAgentTest(unittest.TestCase):
    def _make_agent(self, pred_horizon=3, obs_horizon=2, num_action_steps=2):
        agent_cls, agent_module = _load_imf_agent_class()
        head = _RecordingLinearIMFHead()
        agent = agent_cls(
            vision_backbone=_StubVisionBackbone(),
            state_encoder=nn.Identity(),
            action_encoder=nn.Identity(),
            head=head,
            action_dim=2,
            obs_dim=1,
            pred_horizon=pred_horizon,
            obs_horizon=obs_horizon,
            diffusion_steps=10,
            inference_steps=1,
            num_cams=len(_CAMERA_NAMES),
            camera_names=_CAMERA_NAMES,
            num_action_steps=num_action_steps,
            head_type='transformer',
        )
        return agent, head, agent_module

    def test_compute_loss_matches_imf_objective_and_masks_padded_actions(self):
        agent, head, agent_module = self._make_agent(pred_horizon=3, obs_horizon=2)
        images = _make_images(
            batch_size=1,
            obs_horizon=2,
            per_camera_fill={'r_vis': 1.0, 'top': 2.0, 'front': 3.0},
        )
        qpos = torch.tensor([[[0.25], [0.75]]], dtype=torch.float32)
        actions = torch.tensor(
            [[[1.0, -1.0], [0.5, 0.25], [-0.5, 1.5]]],
            dtype=torch.float32,
        )
        action_is_pad = torch.tensor([[False, False, True]])
        noise = torch.tensor(
            [[[0.2, -0.4], [0.1, 0.3], [0.5, -0.2]]],
            dtype=torch.float32,
        )
        t_sample = torch.tensor([0.8], dtype=torch.float32)
        r_sample = torch.tensor([0.25], dtype=torch.float32)

        with mock.patch.object(agent_module.torch, 'randn_like', return_value=noise), \
             mock.patch.object(agent_module.torch, 'rand', side_effect=[t_sample, r_sample]):
            loss = agent.compute_loss(
                {
                    'images': images,
                    'qpos': qpos,
                    'action': actions,
                    'action_is_pad': action_is_pad,
                }
            )

        cond = torch.tensor([[[1.0, 2.0, 3.0, 0.25], [1.0, 2.0, 3.0, 0.75]]], dtype=torch.float32)
        cond_term = cond.mean(dim=(1, 2), keepdim=True)
        t = t_sample
        r = r_sample
        z_t = (1 - t.view(1, 1, 1)) * actions + t.view(1, 1, 1) * noise
        scale = head.scale.detach()
        u = scale * z_t + r.view(1, 1, 1) + 2.0 * t.view(1, 1, 1) + cond_term
        v = scale * z_t + 3.0 * t.view(1, 1, 1) + cond_term
        du_dt = scale * v + 2.0
        compound_velocity = u + (t - r).view(1, 1, 1) * du_dt
        target = noise - actions
        elementwise_loss = (compound_velocity - target) ** 2
        mask = (~action_is_pad).unsqueeze(-1).to(elementwise_loss.dtype)
        expected_loss = (elementwise_loss * mask).sum() / (mask.sum() * elementwise_loss.shape[-1])

        self.assertAlmostEqual(loss.item(), expected_loss.item(), places=6)
        self.assertEqual(len(head.calls), 2)
        self.assertTrue(torch.allclose(head.calls[0]['r'], t_sample))
        self.assertTrue(torch.allclose(head.calls[0]['t'], t_sample))
        self.assertTrue(torch.allclose(head.calls[0]['cond'], cond))

    def test_predict_action_uses_one_step_imf_sampling_and_image_conditioning(self):
        agent, head, agent_module = self._make_agent(pred_horizon=3, obs_horizon=2)
        agent.infer_scheduler = _ForbiddenScheduler()

        images = _make_images(
            batch_size=2,
            obs_horizon=2,
            per_camera_fill={'r_vis': 10.0, 'top': 20.0, 'front': 30.0},
        )
        qpos = torch.tensor(
            [
                [[1.0], [2.0]],
                [[3.0], [4.0]],
            ],
            dtype=torch.float32,
        )
        initial_noise = torch.tensor(
            [
                [[1.0, -1.0], [0.0, 2.0], [3.0, -2.0]],
                [[-1.0, 1.0], [2.0, -3.0], [0.5, 0.25]],
            ],
            dtype=torch.float32,
        )

        with mock.patch.object(agent_module.torch, 'randn', return_value=initial_noise):
            predicted_actions = agent.predict_action(images, qpos)

        expected_cond = torch.tensor(
            [
                [[10.0, 20.0, 30.0, 1.0], [10.0, 20.0, 30.0, 2.0]],
                [[10.0, 20.0, 30.0, 3.0], [10.0, 20.0, 30.0, 4.0]],
            ],
            dtype=torch.float32,
        )
        cond_term = expected_cond.mean(dim=(1, 2), keepdim=True)
        expected_actions = 0.5 * initial_noise - 2.0 - cond_term

        self.assertEqual(predicted_actions.shape, (2, 3, 2))
        self.assertTrue(torch.allclose(predicted_actions, expected_actions))
        self.assertEqual(len(head.calls), 1)
        self.assertTrue(torch.allclose(head.calls[0]['r'], torch.zeros(2)))
        self.assertTrue(torch.allclose(head.calls[0]['t'], torch.ones(2)))
        self.assertTrue(torch.allclose(head.calls[0]['cond'], expected_cond))

    def test_select_action_only_regenerates_when_action_queue_is_empty(self):
        agent, _head, _agent_module = self._make_agent(pred_horizon=4, obs_horizon=2, num_action_steps=2)
        observation = {
            'qpos': torch.tensor([0.25], dtype=torch.float32),
            'images': {
                'front': torch.full((1, 2, 2), 3.0, dtype=torch.float32),
                'top': torch.full((1, 2, 2), 2.0, dtype=torch.float32),
                'r_vis': torch.full((1, 2, 2), 1.0, dtype=torch.float32),
            },
        }
        first_chunk = torch.tensor(
            [[[10.0, 11.0], [12.0, 13.0], [14.0, 15.0], [16.0, 17.0]]],
            dtype=torch.float32,
        )
        second_chunk = torch.tensor(
            [[[20.0, 21.0], [22.0, 23.0], [24.0, 25.0], [26.0, 27.0]]],
            dtype=torch.float32,
        )

        with mock.patch.object(agent, 'predict_action_chunk', side_effect=[first_chunk, second_chunk]) as mock_predict_chunk:
            first_action = agent.select_action(observation)
            second_action = agent.select_action(observation)
            third_action = agent.select_action(observation)

        self.assertTrue(torch.equal(first_action, first_chunk[0, 1]))
        self.assertTrue(torch.equal(second_action, first_chunk[0, 2]))
        self.assertTrue(torch.equal(third_action, second_chunk[0, 1]))
        self.assertEqual(mock_predict_chunk.call_count, 2)

    def test_hydra_config_instantiates_resnet_imf_attnres_with_stub_head(self):
        cfg = _compose_cfg(
            overrides=[
                'agent=resnet_imf_attnres',
                'agent.vision_backbone.pretrained_backbone_weights=null',
                'agent.vision_backbone.input_shape=[3,16,16]',
                'agent.vision_backbone.freeze_backbone=false',
                'agent.head.n_layer=1',
                'agent.head.n_emb=16',
            ]
        )

        self.assertEqual(cfg.agent._target_, 'roboimi.vla.agent_imf.IMFVLAAgent')
        self.assertEqual(cfg.agent.head._target_, 'roboimi.vla.models.heads.imf_transformer1d.IMFTransformer1D')
        self.assertEqual(cfg.agent.head.backbone_type, 'attnres_full')
        self.assertEqual(cfg.agent.head.n_head, 1)
        self.assertEqual(cfg.agent.head.n_kv_head, 1)
        self.assertEqual(cfg.agent.head.n_cond_layers, 0)
        self.assertTrue(cfg.agent.head.time_as_cond)
        self.assertFalse(cfg.agent.head.causal_attn)
        self.assertEqual(cfg.agent.inference_steps, 1)
        self.assertEqual(list(cfg.agent.camera_names), list(_CAMERA_NAMES))

        with _stub_optional_modules(include_imf_head=True):
            agent = instantiate(cfg.agent)

        self.assertEqual(agent.head_type, 'transformer')
        self.assertEqual(agent.per_step_cond_dim, agent.vision_encoder.output_dim * agent.num_cams + agent.obs_dim)
        self.assertIsInstance(agent.noise_pred_net, _StubIMFHead)
        self.assertEqual(agent.noise_pred_net.constructor_kwargs['cond_dim'], agent.per_step_cond_dim)
        self.assertEqual(agent.noise_pred_net.constructor_kwargs['backbone_type'], 'attnres_full')

    def test_hydra_config_instantiates_resnet_imf_attnres_with_full_attnres_vision_backbone(self):
        cfg = _compose_cfg(
            overrides=[
                'agent=resnet_imf_attnres',
                'agent.vision_backbone.vision_backbone_mode=attnres_resnet',
                'agent.vision_backbone.pretrained_backbone_weights=null',
                'agent.vision_backbone.input_shape=[3,56,56]',
                'agent.vision_backbone.freeze_backbone=false',
                'agent.vision_backbone.attnres_stem_dim=16',
                'agent.vision_backbone.attnres_stage_dims=[16,32,64,128]',
                'agent.vision_backbone.attnres_stage_depths=[1,1,1,1]',
                'agent.vision_backbone.attnres_stage_heads=[2,4,4,8]',
                'agent.vision_backbone.attnres_stage_kv_heads=[1,1,1,1]',
                'agent.vision_backbone.attnres_stage_window_sizes=[7,7,7,7]',
                'agent.head.n_layer=1',
                'agent.head.n_emb=16',
            ]
        )

        with _stub_optional_modules(include_imf_head=True):
            agent = instantiate(cfg.agent)

        self.assertEqual(agent.vision_encoder.output_dim, 64)
        self.assertEqual(agent.per_step_cond_dim, 64 * agent.num_cams + agent.obs_dim)
        self.assertEqual(agent.noise_pred_net.constructor_kwargs['cond_dim'], agent.per_step_cond_dim)


if __name__ == '__main__':
    unittest.main()