
Commit 0663edf

fix(infra): keep model.to(device) on unsharded post-shard load
Persistent buffers initialized via torch.tensor()/torch.ones() inside init_empty_weights() (e.g. Gemma4's Gemma4ClippableLinear input_min/max, Gemma4TextDecoderLayer layer_scalar) stay on CPU because the context only patches register_parameter, not register_buffer. The post-shard load path then unconditionally skipped model.to(device), leaving those buffers stranded on CPU and tripping torch.clamp with a cuda:0 vs cpu device mismatch.

The skip exists for FSDP's reset_sharded_param issue with tied params under TP>1 (pytorch/pytorch#151085). Narrow it to its actual precondition, the presence of any DTensor in the model, so that single-GPU, DDP, and other unsharded configs still run model.to(device).

Add unit coverage for both the unsharded and DTensor-sharded checkpoint load paths.

Signed-off-by: HuiyingLi <willwin.lee@gmail.com>
1 parent 8222a4f commit 0663edf
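
For context, a minimal standalone sketch of the failure mode described in the message above: a persistent buffer left on CPU while the parameters live on cuda:0 makes torch.clamp raise a device-mismatch error, and only a model.to(device) call moves both. The module and buffer names are illustrative stand-ins, not the actual Gemma4 classes.

import torch
import torch.nn as nn


class ClippableLinearSketch(nn.Module):
    """Illustrative stand-in for a clamp-bounded linear layer; not the real Gemma4 class."""

    def __init__(self, in_features: int, out_features: int):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features)
        # Persistent buffers built from plain CPU tensors, mirroring the
        # torch.tensor()/torch.ones() initialization the commit message describes.
        self.register_buffer("input_min", torch.tensor([-6.0]))
        self.register_buffer("input_max", torch.tensor([6.0]))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Raises a RuntimeError (cuda:0 vs cpu) when the buffers were never
        # moved off CPU but x and the weights live on the GPU.
        x = torch.clamp(x, min=self.input_min, max=self.input_max)
        return self.linear(x)


if torch.cuda.is_available():
    m = ClippableLinearSketch(4, 4)
    # Move only the parameters, leaving buffers behind on CPU; this mimics a
    # post-shard checkpoint load that skipped model.to(device).
    for p in m.parameters():
        p.data = p.data.cuda()
    try:
        m(torch.randn(2, 4, device="cuda"))
    except RuntimeError as err:
        print("device mismatch:", err)
    # model.to(device) moves parameters *and* buffers, which is why the skip
    # must not apply to unsharded models.
    m.to("cuda")
    m(torch.randn(2, 4, device="cuda"))  # now succeeds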

2 files changed

Lines changed: 52 additions & 9 deletions


nemo_automodel/_transformers/infrastructure.py

Lines changed: 13 additions & 4 deletions
@@ -565,11 +565,20 @@ def apply_model_infrastructure(
     if autopipeline is None:
         print_trainable_parameters(model)  # Once model's been sharded
     # Ensure model is on the correct device.
-    # Skip when checkpoint was loaded post-shard (params are already on the
-    # target device) to avoid triggering FSDP's reset_sharded_param which
-    # fails on tied parameters (e.g. lm_head/embed_tokens with TP>1).
+    # Skip only when params are actually sharded (any DTensor in the model)
+    # AND the checkpoint was loaded post-shard. Calling model.to(device) on
+    # sharded params triggers FSDP's reset_sharded_param, which fails on
+    # tied parameters (e.g. lm_head/embed_tokens with TP>1).
     # See: https://github.com/pytorch/pytorch/issues/151085
-    if not should_load_checkpoint:
+    # In unsharded cases (single-GPU, DDP, or any combination of TP/DP/CP/EP
+    # that left params as plain tensors), model.to(device) must still run so
+    # that persistent buffers not present in the checkpoint (e.g. Gemma4's
+    # Gemma4ClippableLinear input_min/max, Gemma4TextDecoderLayer
+    # layer_scalar) reach the GPU.
+    from torch.distributed.tensor import DTensor
+
+    has_sharded_params = any(isinstance(p, DTensor) for p in model.parameters())
+    if not (should_load_checkpoint and has_sharded_params):
         try:
             model.to(device, non_blocking=True)
         except NotImplementedError as e:
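
To make the narrowed precondition concrete, here is a small sketch (separate from the repo) showing that the DTensor check is False for any model whose parameters are plain tensors, so the unsharded post-shard load path still reaches model.to(device). The should_load_checkpoint variable is a placeholder for the real flag inside apply_model_infrastructure.

import torch
import torch.nn as nn
from torch.distributed.tensor import DTensor  # public import path in recent PyTorch 2.x

# Stand-in for an unsharded (single-GPU / DDP) model; no process group needed.
model = nn.Linear(8, 8)

# Same predicate as in the diff above: plain nn.Parameter tensors are not
# DTensors, so the post-shard load path still calls model.to(device).
should_load_checkpoint = True  # placeholder for the real flag
has_sharded_params = any(isinstance(p, DTensor) for p in model.parameters())
print(has_sharded_params)  # False -> params and buffers get moved below

if not (should_load_checkpoint and has_sharded_params):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device, non_blocking=True)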

tests/unit_tests/_transformers/test_infrastructure.py

Lines changed: 39 additions & 5 deletions
@@ -190,8 +190,8 @@ def test_megatron_fsdp_skips_post_shard_init(self):

         mock_ckpt.initialize_model_weights.assert_not_called()

-    def test_skips_model_to_device_when_checkpoint_loaded(self):
-        """model.to(device) should be skipped when should_load_checkpoint is True (tied params + FSDP fix)."""
+    def test_calls_model_to_device_when_checkpoint_loaded_without_dtensor(self):
+        """Unsharded post-shard checkpoint loads should still move buffers with model.to(device)."""
         from nemo_automodel._transformers.infrastructure import apply_model_infrastructure

         model = _DummyModel()
@@ -217,9 +217,43 @@ def test_skips_model_to_device_when_checkpoint_loaded(self):
                 pretrained_model_name_or_path="test/model",
             )

-        # model.to(device) should NOT have been called — checkpoint loading
-        # already placed params on device, and calling to() would trigger
-        # FSDP's reset_sharded_param failure on tied parameters.
+        mock_to.assert_called_once_with(torch.device("cpu"), non_blocking=True)
+
+    def test_skips_model_to_device_when_checkpoint_loaded_with_dtensor(self, monkeypatch):
+        """DTensor-sharded post-shard checkpoint loads should skip model.to(device)."""
+        from nemo_automodel._transformers.infrastructure import apply_model_infrastructure
+        import torch.distributed.tensor as dist_tensor
+
+        class FakeDTensor:
+            pass
+
+        class ModelWithShardedParameter(_DummyModel):
+            def parameters(self, recurse=True):
+                return iter([FakeDTensor()])
+
+        monkeypatch.setattr(dist_tensor, "DTensor", FakeDTensor)
+        model = ModelWithShardedParameter()
+
+        with (
+            patch(f"{_INFRA_MODULE}.get_world_size_safe", return_value=1),
+            patch(f"{_INFRA_MODULE}._supports_logits_to_keep", return_value=True),
+            patch(f"{_INFRA_MODULE}.print_trainable_parameters"),
+            patch(f"{_INFRA_MODULE}._should_load_before_shard", return_value=False),
+            patch(f"{_INFRA_MODULE}.Checkpointer") as MockCheckpointer,
+            patch.object(model, "to", wraps=model.to) as mock_to,
+        ):
+            mock_ckpt = MockCheckpointer.return_value
+            mock_ckpt.config = MagicMock()
+            mock_ckpt.config.dequantize_base_checkpoint = False
+
+            apply_model_infrastructure(
+                model=model,
+                is_meta_device=True,
+                device=torch.device("cpu"),
+                load_base_model=True,
+                pretrained_model_name_or_path="test/model",
+            )
+
         mock_to.assert_not_called()

     def test_calls_model_to_device_when_from_config_meta(self):
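
Two test-side patterns in the diff above are worth noting: patch.object(model, "to", wraps=model.to) spies on the call while preserving the real move, and monkeypatching torch.distributed.tensor.DTensor to a local FakeDTensor exercises the sharded branch without constructing real distributed tensors or a process group. A minimal standalone sketch of the wraps-based spy, independent of the repo's fixtures:

from unittest.mock import patch

import torch
import torch.nn as nn

model = nn.Linear(4, 4)

# wraps= keeps the real behavior (the module really moves) while recording the
# call, which is what the new unsharded-path test asserts on.
with patch.object(model, "to", wraps=model.to) as mock_to:
    model.to(torch.device("cpu"), non_blocking=True)

mock_to.assert_called_once_with(torch.device("cpu"), non_blocking=True)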
