
Commit 5872e02

fix: apply param freeze the right place for moe lora (#1252)
* fix: freeze weights for moe lora finetuning
* comments
* comments
* better
* lint
* fix test

---------

Signed-off-by: Zhiyu Li <zhiyul@NVIDIA.com>
1 parent 7419674 commit 5872e02
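
In short, the bug: the base-parameter freeze ran at the moment LoRA adapters were attached, but MoE parallelization can create new parameters afterwards (e.g. `GroupedExpertsTE` in `init_token_dispatcher`), and those escaped the freeze. The sketch below illustrates that ordering hazard on a toy module; `freeze_non_lora` and the attribute names are hypothetical stand-ins for illustration, not code from this repository.

```python
import torch
import torch.nn as nn

def freeze_non_lora(model: nn.Module) -> None:
    # Same rule the commit applies: anything without "lora_" in its name is frozen.
    for name, param in model.named_parameters():
        if "lora_" not in name and param.requires_grad:
            param.requires_grad_(False)

model = nn.Linear(4, 4)                        # stands in for the base model
model.lora_a = nn.Parameter(torch.zeros(2))    # stands in for an attached adapter

freeze_non_lora(model)                         # freeze too early...
model.expert_w = nn.Parameter(torch.zeros(8))  # ...parallelization then adds a param

assert model.expert_w.requires_grad            # True: the late param escaped the freeze
freeze_non_lora(model)                         # the fix: freeze again after parallelization
assert not model.expert_w.requires_grad and model.lora_a.requires_grad
```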

3 files changed: 23 additions & 4 deletions


nemo_automodel/_transformers/infrastructure.py

Lines changed: 11 additions & 1 deletion
```diff
@@ -80,7 +80,8 @@ def _apply_peft_and_lower_precision(
             logger.info("Enabling PEFT with Pipeline Parallelism")
             logger.info("Disabling Triton with Pipeline Parallelism Enabled.")
             peft_config.use_triton = False
-        apply_lora_to_linear_modules(model, peft_config, quantization_config=quantization_config)
+        # Skip freeze here - will do global freeze after checkpoint loading
+        apply_lora_to_linear_modules(model, peft_config, quantization_config=quantization_config, skip_freeze=True)
 
     # FP8
     if fp8_config is not None:
@@ -446,6 +447,15 @@ def apply_model_infrastructure(
         load_base_model=load_base_model,
     )
 
+    # Freeze parameters after checkpoint loading and parallelization
+    # This catches params created during parallelization (e.g., GroupedExpertsTE in init_token_dispatcher)
+    if peft_config is not None:
+        models_to_freeze = model.parts if hasattr(model, "parts") else [model]
+        for mp in models_to_freeze:
+            for name, param in mp.named_parameters():
+                if "lora_" not in name and param.requires_grad:
+                    param.requires_grad_(False)
+
     if autopipeline is None:
         print_trainable_parameters(model)  # Once model's been sharded
     # Ensure model is on the correct device; AutoPipeline takes care of it internally
```
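
For context, the new freeze loop has to handle both a plain module and a pipeline-split model that exposes its stages via `.parts`. Below is a minimal sketch of that duck-typing pattern; `PipelineParts` is a hypothetical stand-in for the real AutoPipeline object, which this diff does not show.

```python
import torch.nn as nn

class PipelineParts:
    # Hypothetical stand-in for a pipeline-split model exposing `.parts`.
    def __init__(self, *stages: nn.Module):
        self.parts = list(stages)

def freeze_all_but_lora(model) -> None:
    # Mirrors the commit's loop: iterate the stages if present, else the model itself.
    models_to_freeze = model.parts if hasattr(model, "parts") else [model]
    for mp in models_to_freeze:
        for name, param in mp.named_parameters():
            if "lora_" not in name and param.requires_grad:
                param.requires_grad_(False)

split = PipelineParts(nn.Linear(4, 4), nn.Linear(4, 4))
freeze_all_but_lora(split)
assert all(not p.requires_grad for s in split.parts for p in s.parameters())
```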

nemo_automodel/components/_peft/lora.py

Lines changed: 5 additions & 2 deletions
```diff
@@ -464,6 +464,7 @@ def apply_lora_to_linear_modules(
     model: nn.Module,
     peft_config: PeftConfig,
     quantization_config=None,
+    skip_freeze: bool = False,
 ) -> int:
     """
     Replace selected nn.Linear layers with LinearLoRA layers (in-place).
@@ -472,6 +473,7 @@ def apply_lora_to_linear_modules(
         model: The model to apply LoRA to.
         peft_config: PEFT configuration for LoRA parameters.
         quantization_config: Optional separate QLoRA quantization configuration.
+        skip_freeze: If True, skip the global parameter freeze (caller will handle it later).
 
     Returns:
         Number of modules that were modified with LoRA.
@@ -480,8 +482,9 @@ def apply_lora_to_linear_modules(
         target_modules accepts wildcard fragments, e.g. ["q_proj", "k_proj", ".*fc.*"].
     """
     # Freeze base model parameters
-    for w in model.parameters():
-        w.requires_grad_(False)
+    if not skip_freeze:
+        for w in model.parameters():
+            w.requires_grad_(False)
 
     is_causal_lm = False
     try:
```
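
To make the `skip_freeze` semantics concrete, here is a self-contained toy analogue of the freeze-then-wrap flow. `attach_adapters` is a hypothetical, heavily simplified stand-in for `apply_lora_to_linear_modules` (which actually replaces layers with `LinearLoRA` modules); only the flag's behavior is meant to match this diff.

```python
import torch
import torch.nn as nn

def attach_adapters(model: nn.Module, skip_freeze: bool = False) -> int:
    """Toy analogue: optionally freeze base weights, then add one adapter per Linear."""
    if not skip_freeze:
        for w in model.parameters():
            w.requires_grad_(False)
    modified = 0
    for module in model.modules():
        if isinstance(module, nn.Linear):
            # Attach a trainable adapter parameter; real LoRA wraps the whole layer.
            module.lora_a = nn.Parameter(torch.zeros(module.in_features))
            modified += 1
    return modified

m = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 4))
attach_adapters(m, skip_freeze=True)  # base weights stay trainable for now
assert m[0].weight.requires_grad      # caller is expected to freeze them later
```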

tests/unit_tests/recipes/test_train_ft.py

Lines changed: 7 additions & 1 deletion
```diff
@@ -230,7 +230,13 @@ def test_peft_without_pipeline_parallelism(caplog):
     with patch('nemo_automodel._transformers.infrastructure._supports_logits_to_keep', return_value=True):
         with patch('nemo_automodel._transformers.auto_model._verify_sdpa_support'):
             with patch('nemo_automodel._transformers.infrastructure._shard_ep_fsdp') as mock_shard:
-                mock_shard.return_value = DummyModel()
+                # Return a DummyModel with lora_dummy_param so freeze doesn't remove all trainable params
+                sharded_model = DummyModel()
+                sharded_model.register_parameter(
+                    "lora_dummy_param",
+                    nn.Parameter(torch.tensor(1.0, device=torch.device("cuda")), requires_grad=True)
+                )
+                mock_shard.return_value = sharded_model
                 with caplog.at_level(logging.INFO):
                     # This should work fine without PP
                     model = build_model(
```
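
The same fixture pattern on CPU, for readers without a GPU. This is a hedged sketch: the `DummyModel` body is an assumption (the test's real class is not shown in this diff), and the real test presumably pins the parameter to CUDA because the mocked sharding path runs on-device.

```python
import torch
import torch.nn as nn

class DummyModel(nn.Module):
    # Minimal stand-in for the test's DummyModel (assumed shape).
    def __init__(self):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(2))

sharded_model = DummyModel()
sharded_model.register_parameter(
    "lora_dummy_param",
    nn.Parameter(torch.tensor(1.0), requires_grad=True),  # CPU variant of the fixture
)

# After the recipe's global freeze, only the lora_-named parameter should train.
for name, param in sharded_model.named_parameters():
    if "lora_" not in name:
        param.requires_grad_(False)
assert [n for n, p in sharded_model.named_parameters() if p.requires_grad] == ["lora_dummy_param"]
```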
