From dffd92f82d1c7b9e7d5976215cf531a04ae94635 Mon Sep 17 00:00:00 2001
From: Logic
Date: Wed, 1 Apr 2026 23:50:57 +0800
Subject: [PATCH] fix: sanitize problematic LD_PRELOAD for cuDNN

An LD_PRELOAD entry pointing at /usr/NX/lib/libnxegl.so (the EGL shim
injected by NX-style remote-desktop sessions) breaks CUDA/cuDNN
initialization in train_vla.py. Because the dynamic loader consumes
LD_PRELOAD before the interpreter starts, the script now strips such
entries and re-executes itself once, with _ROBOIMI_LD_PRELOAD_SANITIZED
guarding against a re-exec loop. Unit tests cover the sanitizer helper.
---
 roboimi/demos/vla_scripts/train_vla.py        | 44 +++++++++++++++++++
 tests/test_train_vla_transformer_optimizer.py | 19 ++++++++
 2 files changed, 63 insertions(+)

diff --git a/roboimi/demos/vla_scripts/train_vla.py b/roboimi/demos/vla_scripts/train_vla.py
index cd5da37..059f4ea 100644
--- a/roboimi/demos/vla_scripts/train_vla.py
+++ b/roboimi/demos/vla_scripts/train_vla.py
@@ -24,6 +24,47 @@ def _ensure_repo_root_on_syspath():
     return repo_root
 
 
+_PROBLEMATIC_LD_PRELOAD_SUBSTRINGS = ('/usr/NX/lib/libnxegl.so', 'libnxegl.so')
+
+
+def _clean_ld_preload_value(value: str | None):
+    if not value:
+        return value, False
+    entries = [entry for entry in value.split() if entry]
+    filtered = [
+        entry for entry in entries
+        if not any(marker in entry for marker in _PROBLEMATIC_LD_PRELOAD_SUBSTRINGS)
+    ]
+    changed = filtered != entries
+    cleaned = ' '.join(filtered) if filtered else None
+    return cleaned, changed
+
+
+def _maybe_reexec_without_problematic_ld_preload():
+    if __name__ != '__main__':
+        return False
+    if os.environ.get('_ROBOIMI_LD_PRELOAD_SANITIZED') == '1':
+        return False
+
+    cleaned, changed = _clean_ld_preload_value(os.environ.get('LD_PRELOAD'))
+    if not changed:
+        return False
+
+    new_env = dict(os.environ)
+    new_env['_ROBOIMI_LD_PRELOAD_SANITIZED'] = '1'
+    if cleaned:
+        new_env['LD_PRELOAD'] = cleaned
+    else:
+        new_env.pop('LD_PRELOAD', None)
+
+    print(
+        'Detected problematic LD_PRELOAD entry for CUDA/cuDNN; re-executing train_vla.py without it.',
+        file=sys.stderr,
+        flush=True,
+    )
+    os.execvpe(sys.executable, [sys.executable, *sys.argv], new_env)
+
+
 _REPO_ROOT = _ensure_repo_root_on_syspath()
 
 from hydra.utils import instantiate
@@ -35,6 +76,9 @@
 if not OmegaConf.has_resolver("len"):
     OmegaConf.register_new_resolver("len", lambda x: len(x))
 
+_maybe_reexec_without_problematic_ld_preload()
+
+
 def _configure_cuda_runtime(cfg):
     """Apply process-level CUDA runtime switches required by this environment."""
     if str(cfg.train.device).startswith('cuda') and bool(cfg.train.get('disable_cudnn', False)):
diff --git a/tests/test_train_vla_transformer_optimizer.py b/tests/test_train_vla_transformer_optimizer.py
index bee12bd..7ac7657 100644
--- a/tests/test_train_vla_transformer_optimizer.py
+++ b/tests/test_train_vla_transformer_optimizer.py
@@ -214,6 +214,25 @@ class TrainVLATransformerOptimizerTest(unittest.TestCase):
             for group in optimizer.param_groups
         ]
 
+    def test_clean_ld_preload_value_removes_problematic_nxegl_entry(self):
+        module = self._load_train_vla_module()
+
+        cleaned, changed = module._clean_ld_preload_value(
+            '/usr/lib/libfoo.so /usr/NX/lib/libnxegl.so /usr/lib/libbar.so'
+        )
+
+        self.assertTrue(changed)
+        self.assertEqual(cleaned, '/usr/lib/libfoo.so /usr/lib/libbar.so')
+
+    def test_clean_ld_preload_value_leaves_safe_entries_unchanged(self):
+        module = self._load_train_vla_module()
+
+        cleaned, changed = module._clean_ld_preload_value('/usr/lib/libfoo.so /usr/lib/libbar.so')
+
+        self.assertFalse(changed)
+        self.assertEqual(cleaned, '/usr/lib/libfoo.so /usr/lib/libbar.so')
+
+
     def test_configure_cuda_runtime_can_disable_cudnn_for_training(self):
         module = self._load_train_vla_module()
         cfg = AttrDict(train=AttrDict(device='cuda', disable_cudnn=True))
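
Note: below is a minimal, self-contained sketch of the sanitize-and-reexec
pattern the patch applies, for readers outside the repo. It mirrors
_clean_ld_preload_value and _maybe_reexec_without_problematic_ld_preload but
is not the patched file; PROBLEMATIC_SUBSTRINGS, GUARD_VAR, and the __main__
demo are names invented for this illustration.

# ld_preload_sanitize_sketch.py -- illustrative only; the names below are
# invented for this note and are not part of the patch.
import os
import sys

# Any LD_PRELOAD entry containing one of these substrings is dropped.
PROBLEMATIC_SUBSTRINGS = ('libnxegl.so',)
# Set in the child environment so the pattern re-execs at most once.
GUARD_VAR = '_DEMO_LD_PRELOAD_SANITIZED'


def clean_ld_preload(value):
    """Return (cleaned_value_or_None, changed) for an LD_PRELOAD string."""
    if not value:
        return value, False
    # Like the patch, split on whitespace only; ld.so also accepts colons.
    entries = value.split()
    filtered = [e for e in entries
                if not any(m in e for m in PROBLEMATIC_SUBSTRINGS)]
    changed = filtered != entries
    return (' '.join(filtered) if filtered else None), changed


def reexec_without_problematic_ld_preload():
    """Replace this process with a copy running under a clean LD_PRELOAD."""
    if os.environ.get(GUARD_VAR) == '1':
        return  # already sanitized once; never loop
    cleaned, changed = clean_ld_preload(os.environ.get('LD_PRELOAD'))
    if not changed:
        return
    env = dict(os.environ)
    env[GUARD_VAR] = '1'
    if cleaned:
        env['LD_PRELOAD'] = cleaned
    else:
        env.pop('LD_PRELOAD', None)
    # execvpe never returns: the new process starts with the sanitized
    # environment in place before the dynamic loader preloads anything.
    os.execvpe(sys.executable, [sys.executable, *sys.argv], env)


if __name__ == '__main__':
    reexec_without_problematic_ld_preload()
    print('LD_PRELOAD =', os.environ.get('LD_PRELOAD'))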
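
Why re-exec at all: the dynamic loader reads LD_PRELOAD once, at process
startup, before any Python code runs, so an already-preloaded library cannot
be removed by editing os.environ inside a live interpreter; restarting the
process with a sanitized environment is the reliable fix, and the guard
variable bounds the pattern at a single restart. One simplification the
sketch carries over from the patch: entries are split on whitespace only,
although ld.so also accepts colon-separated LD_PRELOAD lists. A hypothetical
invocation (the path is made up): running
LD_PRELOAD=/usr/NX/lib/libnxegl.so python ld_preload_sanitize_sketch.py
re-execs once and prints "LD_PRELOAD = None".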