
Commit 7c5a5b3

Add DeepSeek V4 MTP pipeline support
- Keep DSV4 MTP, HC head, and rotary-compress dependencies on the right PP stages
- Propagate shifted MTP embeddings through PP stages to the final stage
- Allow DSV4 MTP to consume precomputed embeddings
- Wire PP loss handling for MTP auxiliary CE with configurable scaling
- Enable MTP in the DSV4 flash finetuning recipe with scaling factor 0.1
- Add unit coverage for DSV4 MTP PP behavior and stage metadata

Signed-off-by: HuiyingLi <willwin.lee@gmail.com>
1 parent: c985123

10 files changed: 490 additions & 307 deletions


examples/llm_finetune/deepseek_v4/deepseek_v4_flash_hellaswag.yaml

Lines changed: 4 additions & 1 deletion
@@ -61,9 +61,12 @@ model:
 _target_: nemo_automodel.components.models.deepseek_v4.config.DeepseekV4Config.from_pretrained
 pretrained_model_name_or_path: deepseek-ai/DeepSeek-V4-Flash
 name_or_path: deepseek-ai/DeepSeek-V4-Flash
-num_nextn_predict_layers: 0
+num_nextn_predict_layers: 1
 trust_remote_code: false
 load_base_model: true
+# DeepSeek-V4 uses 0.3 for most pretraining, then 0.1 during LR decay.
+# Keep finetuning/RL conservative unless explicitly reproducing pretraining.
+mtp_loss_scaling_factor: 0.1
 backend:
 _target_: nemo_automodel.components.models.common.BackendConfig
 attn: sdpa

nemo_automodel/components/models/common/mtp/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -27,13 +27,15 @@
 from nemo_automodel.components.models.common.mtp.mtp import (
     MTPConfig,
     MTPModule,
+    get_mtp_loss_scaling_factor,
     parse_mtp_layer_pattern,
     roll_tensor,
 )

 __all__ = [
     "MTPConfig",
     "MTPModule",
+    "get_mtp_loss_scaling_factor",
     "parse_mtp_layer_pattern",
     "roll_tensor",
 ]

nemo_automodel/components/models/common/mtp/mtp.py

Lines changed: 8 additions & 0 deletions
@@ -86,6 +86,14 @@ def roll_tensor(t: torch.Tensor, shifts: int = -1, dim: int = -1) -> torch.Tensor:
     return rolled


+def get_mtp_loss_scaling_factor(model: nn.Module, default: float = 0.1) -> float:
+    """Return the model's configured MTP auxiliary-loss scaling factor."""
+    mtp_config = getattr(model, "mtp_config", None)
+    if mtp_config is not None:
+        return float(getattr(mtp_config, "loss_scaling_factor", default))
+    return default
+
+
 @dataclass
 class MTPConfig:
     """Runtime configuration for the MTP block.

nemo_automodel/components/models/deepseek_v4/model.py

Lines changed: 119 additions & 10 deletions
@@ -599,15 +599,104 @@ def get_output_embeddings(self):
     def set_output_embeddings(self, new_embeddings):
         self.lm_head = new_embeddings

+    def customize_pipeline_stage_modules(
+        self,
+        module_names_per_stage: list[list[str]],
+        *,
+        layers_prefix: str,
+        text_model: nn.Module | None = None,
+    ) -> list[list[str]]:
+        """Keep DSV4 non-layer PP dependencies with the stages that need them."""
+
+        text_model = text_model or self.model
+        stage_modules = [list(modules) for modules in module_names_per_stage]
+
+        def append_once(modules: list[str], fqn: str) -> None:
+            if fqn not in modules:
+                modules.append(fqn)
+
+        if getattr(text_model, "rotary_emb_compress", None) is not None:
+            for modules in stage_modules:
+                append_once(modules, f"{layers_prefix}rotary_emb_compress")
+        if getattr(text_model, "hc_head", None) is not None:
+            append_once(stage_modules[-1], f"{layers_prefix}hc_head")
+        if self.mtp is not None:
+            append_once(stage_modules[-1], "mtp")
+
+        return stage_modules
+
+    def get_pipeline_stage_metas(
+        self,
+        *,
+        is_first: bool,
+        microbatch_size: int,
+        seq_len: int,
+        dtype: torch.dtype,
+    ) -> tuple[tuple[torch.Tensor, ...], tuple[torch.Tensor, ...]]:
+        """Return PP input/output meta tensors for DSV4's HC and MTP contract."""
+
+        hidden_shape = (microbatch_size, seq_len, self.config.hidden_size)
+        hc_hidden_shape = (microbatch_size, seq_len, self.config.hc_mult, self.config.hidden_size)
+        mtp_depth = int(getattr(self.mtp_config, "num_layers", 0) or 0)
+
+        def meta(shape: tuple[int, ...]) -> torch.Tensor:
+            return torch.empty(*shape, device="meta", dtype=dtype)
+
+        def append_mtp_metas(primary: torch.Tensor) -> tuple[torch.Tensor, ...]:
+            mtp_metas = (meta(hidden_shape) for _ in range(mtp_depth))
+            return (primary, *mtp_metas)
+
+        if is_first:
+            inputs_meta = (torch.empty(microbatch_size, seq_len, device="meta", dtype=torch.long),)
+        else:
+            inputs_meta = append_mtp_metas(meta(hc_hidden_shape if self.config.hc_mult > 1 else hidden_shape))
+
+        if self.lm_head is not None:
+            output_meta = meta((microbatch_size, seq_len, self.config.vocab_size))
+        elif getattr(self.model, "norm", None) is not None:
+            output_meta = meta(hidden_shape)
+        else:
+            output_meta = meta(hc_hidden_shape if self.config.hc_mult > 1 else hidden_shape)
+
+        return inputs_meta, append_mtp_metas(output_meta)
+
+    def _is_pipeline_parallel_stage(self) -> bool:
+        if self.lm_head is None:
+            return True
+        if getattr(self.model, "embed_tokens", None) is None:
+            return True
+        try:
+            return len(self.model.layers) != int(self.config.num_hidden_layers)
+        except TypeError:
+            return False
+
+    def _build_mtp_embed_inputs_for_pp(self, input_ids: torch.Tensor) -> tuple[torch.Tensor, ...]:
+        if getattr(self.model, "embed_tokens", None) is None:
+            raise ValueError("First PP stage must own embed_tokens to build MTP embeddings")
+        if input_ids.dtype not in (torch.int32, torch.int64, torch.long):
+            raise ValueError("First PP stage must receive token ids to build MTP embeddings")
+
+        from nemo_automodel.components.models.common.mtp import roll_tensor  # noqa: PLC0415
+
+        cur_input_ids = input_ids
+        embeds = []
+        for _ in range(self.mtp_config.num_layers):
+            cur_input_ids = roll_tensor(cur_input_ids, shifts=-1, dim=-1)
+            embeds.append(self.model.embed_tokens(cur_input_ids))
+        return tuple(embeds)
+
     def forward(
         self,
         input_ids: torch.Tensor,
-        *,
+        *mtp_embed_inputs: torch.Tensor,
         position_ids: torch.Tensor | None = None,
         attention_mask: torch.Tensor | None = None,
         padding_mask: torch.Tensor | None = None,
         **attn_kwargs: Any,
-    ) -> "DeepseekV4CausalLMOutput":
+    ) -> "DeepseekV4CausalLMOutput" | tuple[torch.Tensor, ...] | torch.Tensor:
+        is_pp_stage = self._is_pipeline_parallel_stage()
+        pp_mtp_enabled = is_pp_stage and self.mtp_config.enabled
+
         thd_mode = "qkv_format" in attn_kwargs and attn_kwargs["qkv_format"] == "thd"
         if thd_mode:
             input_ids, position_ids, padding_mask, attn_kwargs = squeeze_input_for_thd(
@@ -633,8 +722,15 @@ def forward(
         if thd_mode:
             logits = logits.unsqueeze(0)

+        if pp_mtp_enabled and self.lm_head is None:
+            if not mtp_embed_inputs:
+                mtp_embed_inputs = self._build_mtp_embed_inputs_for_pp(input_ids)
+            return (logits, *mtp_embed_inputs)
+
         mtp_per_depth_h = None
         if use_mtp:
+            if is_pp_stage and not mtp_embed_inputs:
+                raise ValueError("Final PP stage requires propagated MTP embeddings")
             # MTP consumes the pre-final-head HC stream [B, S, hc_mult, hidden]
             # and returns collapsed per-depth [B, S, hidden] tensors for CE.
             seq_len = hidden_states.shape[1]
@@ -650,14 +746,27 @@ def forward(
                 batch_size=batch_size,
                 sliding_window=sliding_window,
             )
-            mtp_per_depth_h = self.mtp(
-                input_ids=input_ids,
-                hidden_states=mtp_hc_hidden,
-                embed_fn=self.model.embed_tokens,
-                position_ids=position_ids,
-                attention_mask=mtp_attn_mask,
-                padding_mask=padding_mask,
-            )
+            mtp_kwargs = {
+                "hidden_states": mtp_hc_hidden,
+                "position_ids": position_ids,
+                "attention_mask": mtp_attn_mask,
+                "padding_mask": padding_mask,
+            }
+            if mtp_embed_inputs:
+                mtp_kwargs["embed_inputs"] = tuple(mtp_embed_inputs)
+            else:
+                mtp_kwargs["input_ids"] = input_ids
+                mtp_kwargs["embed_fn"] = self.model.embed_tokens
+            mtp_per_depth_h = self.mtp(**mtp_kwargs)
+        elif pp_mtp_enabled and self.lm_head is not None:
+            mtp_per_depth_h = [hidden_states.new_empty(hidden_states.shape) for _ in range(self.mtp_config.num_layers)]
+
+        if is_pp_stage:
+            if pp_mtp_enabled:
+                if self.training and self.mtp is None:
+                    raise ValueError("Final PP stage has MTP enabled but does not own the MTP module")
+                return (logits, *mtp_per_depth_h)
+            return logits

         return DeepseekV4CausalLMOutput(
             logits=logits,
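For orientation only, here is a sketch of how a loss wrapper might consume the final-stage tuple (logits, *mtp_per_depth_h). The function, the per-depth label rolling, and the scale argument are illustrative assumptions rather than the loss code this commit adds, and masking of the positions that wrap around is omitted:

import torch
import torch.nn.functional as F


def combined_mtp_loss(logits, mtp_per_depth_h, lm_head_weight, labels, scale=0.1):
    # Main next-token CE on the final-stage logits.
    main = F.cross_entropy(logits.flatten(0, 1), labels.flatten())
    aux_terms = []
    for depth, h in enumerate(mtp_per_depth_h):
        # Project each collapsed per-depth hidden stream with the shared LM head.
        depth_logits = h @ lm_head_weight.t()
        # Assumed target shift: depth d predicts one extra step ahead (wrap-around not masked here).
        depth_labels = torch.roll(labels, shifts=-(depth + 1), dims=-1)
        aux_terms.append(F.cross_entropy(depth_logits.flatten(0, 1), depth_labels.flatten()))
    if not aux_terms:
        return main
    return main + scale * torch.stack(aux_terms).mean()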

nemo_automodel/components/models/deepseek_v4/mtp.py

Lines changed: 16 additions & 5 deletions
@@ -225,17 +225,28 @@ def num_depths(self) -> int:

     def forward(
         self,
-        input_ids: torch.LongTensor,
         hidden_states: torch.Tensor,
-        embed_fn,
+        input_ids: torch.LongTensor | None = None,
+        embed_fn=None,
+        embed_inputs: tuple[torch.Tensor, ...] | list[torch.Tensor] | None = None,
         position_ids: torch.LongTensor | None = None,
         **block_kwargs,
     ) -> list[torch.Tensor]:
         per_depth_h: list[torch.Tensor] = []
         cur_input_ids = input_ids
-        for block in self.layers:
-            cur_input_ids = roll_tensor(cur_input_ids, shifts=-1, dim=-1)
-            decoder_input = embed_fn(cur_input_ids)
+        if embed_inputs is not None and len(embed_inputs) != len(self.layers):
+            raise ValueError(
+                f"Expected {len(self.layers)} MTP embedding tensors, got {len(embed_inputs)}"
+            )
+        if embed_inputs is None and (cur_input_ids is None or embed_fn is None):
+            raise ValueError("MTP requires either embed_inputs or both input_ids and embed_fn")
+
+        for depth, block in enumerate(self.layers):
+            if embed_inputs is None:
+                cur_input_ids = roll_tensor(cur_input_ids, shifts=-1, dim=-1)
+                decoder_input = embed_fn(cur_input_ids)
+            else:
+                decoder_input = embed_inputs[depth]
             kwargs = dict(block_kwargs)
             if position_ids is not None:
                 kwargs["position_ids"] = position_ids

nemo_automodel/components/utils/model_utils.py

Lines changed: 24 additions & 0 deletions
@@ -125,6 +125,30 @@ def filter_forward_kwargs(model: nn.Module, kwargs: dict) -> dict:
     return filtered


+def get_lm_head_module(model: nn.Module) -> nn.Module | None:
+    """Return the model's LM head module, if one can be found."""
+    if hasattr(model, "get_output_embeddings"):
+        lm_head = model.get_output_embeddings()
+        if lm_head is not None:
+            return lm_head
+    for name, module in model.named_modules():
+        if (name == "lm_head" or name.endswith(".lm_head")) and hasattr(module, "weight"):
+            return module
+    return None
+
+
+def get_lm_head_weight(model: nn.Module) -> torch.Tensor:
+    """Return the model's LM-head weight, materializing DTensor weights when needed."""
+    lm_head = get_lm_head_module(model)
+    if lm_head is not None:
+        weight = lm_head.weight
+        return weight.full_tensor() if hasattr(weight, "full_tensor") else weight
+    for name, param in model.named_parameters(remove_duplicate=False):
+        if "lm_head" in name and name.endswith(".weight"):
+            return param.full_tensor() if hasattr(param, "full_tensor") else param
+    raise ValueError("lm_head.weight not found in model")
+
+
 def _get_logical_numel(param) -> int:
     """Return the logical number of elements for a parameter,
     accounting for quantized (packed) storage.
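A minimal usage sketch of the two new helpers; TinyLM is a made-up module used only for illustration:

import torch.nn as nn

from nemo_automodel.components.utils.model_utils import get_lm_head_module, get_lm_head_weight


class TinyLM(nn.Module):
    def __init__(self, vocab: int = 8, hidden: int = 4):
        super().__init__()
        self.embed = nn.Embedding(vocab, hidden)
        self.lm_head = nn.Linear(hidden, vocab, bias=False)


model = TinyLM()
assert get_lm_head_module(model) is model.lm_head  # found via the named_modules scan
assert get_lm_head_weight(model).shape == (8, 4)   # plain tensor here; DTensor weights are gathered via full_tensor()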
