
Commit 660ed94

pzelasko and claude authored
fix(nemotron-v3): support THD with inputs_embeds instead of input_ids (#2185)
fix(thd): support inputs_embeds-only callers in NemotronHForCausalLM

`SALMAutomodel` and other multimodal callers feed the LLM through `inputs_embeds` (audio frames spliced into the token stream have no integer ID) and leave `input_ids=None`. Two bugs surfaced when running that path under `qkv_format="thd"`:

1. `squeeze_input_for_thd` did `input_ids.squeeze(0)` unconditionally and crashed with `AttributeError: 'NoneType' object has no attribute 'squeeze'`. Add the same `is-not-None` guard the helper already uses for `padding_mask`; document `None` as a valid value.

2. `NemotronHForCausalLM.forward` did `logits = logits.unsqueeze(0)` whenever `is_thd`, producing `[1, 1, T, V]` for the `inputs_embeds` path because `NemotronHModel.forward` already restores the batch dim (the `squeezed_for_thd` branch). Restrict the outer unsqueeze to the case where the inner forward returned 2D logits; the standard `input_ids` path still satisfies that.

Tests:

- `TestSqueezeInputForThd` (5 cases) covers the helper-level contract: standard `input_ids` path, `input_ids=None` path, `padding_mask=None` composition, 3D `[1, T, H]` embedding-via-`input_ids` slot path, and `cu_seqlens_padded` filtering.
- `TestNemotronHForCausalLM::test_causal_lm_thd_*` (2 cases) covers the outer logits-shape contract: `inputs_embeds`-only stays `[1, T, V]` (no double-unsqueeze), and `input_ids`-only still gets the batch dim re-added. The inner forward is stubbed via a tiny `nn.Module` because THD shapes only run end-to-end on TE/GPU.

Co-authored-by: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
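For orientation, here is a minimal sketch of the call pattern the fix targets, assembled from the inputs the new tests construct; `llm` stands for any `NemotronHForCausalLM` instance and is an assumption of the sketch, not code from this commit:

```python
import torch

# Hypothetical inputs_embeds-only THD caller (e.g. a speech/multimodal wrapper):
# spliced audio frames have no integer token IDs, so input_ids stays None.
seq_len, hidden = 8, 64
inputs_embeds = torch.randn(1, seq_len, hidden)                      # [1, T, H], placeholder batch dim
position_ids = torch.arange(seq_len, dtype=torch.long).unsqueeze(0)  # [1, T]
cu_seqlens = torch.tensor([0, seq_len], dtype=torch.int32)
max_seqlen = torch.tensor(seq_len, dtype=torch.int32)

# output = llm(                        # llm: a NemotronHForCausalLM (assumed, not built here)
#     inputs_embeds=inputs_embeds,     # input_ids is deliberately left as None
#     position_ids=position_ids,
#     cu_seqlens=cu_seqlens,
#     cu_seqlens_padded=cu_seqlens,
#     max_seqlen_q=max_seqlen,
#     max_seqlen_kv=max_seqlen,
#     qkv_format="thd",
# )
# Before this commit: bug 1 raised AttributeError in squeeze_input_for_thd, and
# bug 2 would have produced [1, 1, T, V] logits; after it, output.logits is [1, T, V].
```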
1 parent 13ea298 commit 660ed94

4 files changed

Lines changed: 192 additions & 4 deletions


nemo_automodel/components/models/nemotron_v3/model.py

Lines changed: 8 additions & 1 deletion
@@ -418,7 +418,14 @@ def forward(
                 shift_labels.view(-1),
             )

-        if is_thd:
+        # Restore the batch dim for THD only when the inner forward returned
+        # 2D logits. When the caller feeds the model via ``inputs_embeds``
+        # (shape ``[1, T, H]``), ``NemotronHModel.forward`` squeezes to
+        # ``[T, H]`` for the layer stack and unsqueezes back to ``[1, T, H]``
+        # before returning (see the ``squeezed_for_thd`` branch); the lm_head
+        # then yields ``[1, T, V]`` already and a second unsqueeze here would
+        # produce a spurious ``[1, 1, T, V]``.
+        if is_thd and logits.dim() == 2:
             logits = logits.unsqueeze(0)

         return CausalLMOutputWithPast(
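As a standalone illustration of the guard above (a sketch with made-up shapes, not the repository code), the dimensionality check is what separates the two paths:

```python
import torch

def restore_batch_dim(logits: torch.Tensor, is_thd: bool) -> torch.Tensor:
    # Mirror of the guarded unsqueeze: only 2D [T, V] logits (input_ids path)
    # still need the batch dim; [1, T, V] (inputs_embeds path) is left alone.
    if is_thd and logits.dim() == 2:
        logits = logits.unsqueeze(0)
    return logits

T, V = 8, 32
assert restore_batch_dim(torch.zeros(T, V), is_thd=True).shape == (1, T, V)     # batch dim re-added
assert restore_batch_dim(torch.zeros(1, T, V), is_thd=True).shape == (1, T, V)  # untouched, no [1, 1, T, V]
```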

nemo_automodel/components/utils/model_utils.py

Lines changed: 9 additions & 3 deletions
@@ -386,8 +386,13 @@ def squeeze_input_for_thd(input_ids, position_ids, padding_mask, attn_kwargs, se
         3. Converts max_seqlen from tensor to scalar if needed

     Args:
-        input_ids (torch.Tensor): Input token IDs with shape [1, total_tokens] or
-            [1, total_tokens, hidden_dim]. The first dimension will be squeezed.
+        input_ids (torch.Tensor or None): Input token IDs with shape [1, total_tokens]
+            or [1, total_tokens, hidden_dim]. The first dimension will be squeezed.
+            ``None`` is permitted when the caller is feeding the model via
+            ``inputs_embeds`` instead — embeddings are squeezed inside the model
+            forward (the ``squeezed_for_thd`` branch in ``NemotronHModel.forward``
+            and analogous code paths), so this helper has nothing to squeeze and
+            simply returns ``None`` for the ``input_ids`` slot.
         position_ids (torch.Tensor): Position IDs with shape [1, total_tokens].
             The first dimension will be squeezed.
         padding_mask (torch.Tensor): Padding mask with shape [1, total_tokens].
@@ -435,7 +440,8 @@ def squeeze_input_for_thd(input_ids, position_ids, padding_mask, attn_kwargs, se
     This function modifies attn_kwargs in-place. If you need to preserve the original
     dictionary, pass a copy.
     """
-    input_ids = input_ids.squeeze(0)
+    if input_ids is not None:
+        input_ids = input_ids.squeeze(0)
     position_ids = position_ids.squeeze(0)
     if isinstance(padding_mask, torch.Tensor):
         padding_mask = padding_mask.squeeze(0)
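The ``squeezed_for_thd`` round-trip that the new docstring text refers to can be sketched in isolation (illustrative only; the variable names here are not taken from the repository):

```python
import torch

# When only inputs_embeds is provided, the model forward itself drops the
# placeholder batch dim for the THD layer stack and restores it before
# returning, which is why squeeze_input_for_thd can simply pass None through.
inputs_embeds = torch.randn(1, 8, 64)                      # [1, T, H] from the caller

squeezed_for_thd = inputs_embeds.dim() == 3 and inputs_embeds.shape[0] == 1
hidden_states = inputs_embeds.squeeze(0) if squeezed_for_thd else inputs_embeds
assert hidden_states.shape == (8, 64)                      # [T, H] seen by the layers

if squeezed_for_thd:
    hidden_states = hidden_states.unsqueeze(0)             # back to [1, T, H] for the lm_head
assert hidden_states.shape == (1, 8, 64)
```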

tests/unit_tests/models/nemotron_v3/test_nemotron_v3_model.py

Lines changed: 85 additions & 0 deletions
@@ -405,6 +405,91 @@ def test_causal_lm_forward_no_input_ids_no_inputs_embeds_raises(self, config, ba
         with pytest.raises(ValueError, match="input_ids must be provided if inputs_embeds is not provided"):
             model()

+    def _build_stub_inner_model(self, hidden):
+        """Tiny ``nn.Module`` whose forward returns a fixed tensor. Lets us
+        replace ``NemotronHForCausalLM.model`` (an ``nn.Module``) without
+        tripping ``nn.Module.__setattr__``'s child-module type check."""
+
+        class _StubInner(torch.nn.Module):
+            def forward(self, *args, **kwargs):
+                return hidden
+
+        return _StubInner()
+
+    def test_causal_lm_thd_inputs_embeds_does_not_double_unsqueeze(self, config, backend):
+        """Regression test: in THD mode with ``inputs_embeds``-only inputs, the
+        outer ``NemotronHForCausalLM.forward`` used to double-unsqueeze the
+        logits to ``[1, 1, T, V]``. The inner ``NemotronHModel.forward`` already
+        restores the batch dim (``squeezed_for_thd`` branch), so the outer must
+        only re-add it when the inner returned 2D logits.
+
+        We bypass the attention stack (which needs TE/GPU for THD shapes) by
+        replacing ``model.model`` with a stub that returns a fixed 3D tensor —
+        the same shape the real inner forward returns when it took the
+        ``inputs_embeds`` → squeeze → unsqueeze round-trip.
+        """
+        from nemo_automodel.components.models.nemotron_v3.model import NemotronHForCausalLM
+
+        model = NemotronHForCausalLM(config, backend=backend)
+        model = model.to(torch.bfloat16)
+
+        seq_len = 8
+        # Stand in for the inner forward that unsqueezed back to [1, T, H].
+        stub_hidden = torch.randn(1, seq_len, config.hidden_size, dtype=torch.bfloat16)
+        model.model = self._build_stub_inner_model(stub_hidden)
+
+        inputs_embeds = torch.randn(1, seq_len, config.hidden_size, dtype=torch.bfloat16)
+        position_ids = torch.arange(seq_len, dtype=torch.long).unsqueeze(0)
+        cu_seqlens = torch.tensor([0, seq_len], dtype=torch.int32)
+        max_seqlen = torch.tensor(seq_len, dtype=torch.int32)
+
+        output = model(
+            inputs_embeds=inputs_embeds,
+            position_ids=position_ids,
+            cu_seqlens=cu_seqlens,
+            cu_seqlens_padded=cu_seqlens,
+            max_seqlen_q=max_seqlen,
+            max_seqlen_kv=max_seqlen,
+            qkv_format="thd",
+        )
+
+        assert output.logits.shape == (1, seq_len, config.vocab_size)
+        assert output.logits.dim() == 3
+
+    def test_causal_lm_thd_input_ids_unsqueezes_2d_logits(self, config, backend):
+        """The original THD path (``input_ids`` only, no ``inputs_embeds``) is
+        the one most callers use; the unsqueeze fix must not regress it. The
+        inner forward returns ``[T, H]`` (2D, never went through the
+        ``squeezed_for_thd`` round-trip because ``embed_tokens(input_ids[T])``
+        is already 2D), so the outer still has to add the batch dim."""
+        from nemo_automodel.components.models.nemotron_v3.model import NemotronHForCausalLM
+
+        model = NemotronHForCausalLM(config, backend=backend)
+        model = model.to(torch.bfloat16)
+
+        seq_len = 8
+        # Stand in for the inner forward that returned 2D hidden_states.
+        stub_hidden = torch.randn(seq_len, config.hidden_size, dtype=torch.bfloat16)
+        model.model = self._build_stub_inner_model(stub_hidden)
+
+        input_ids = torch.randint(0, config.vocab_size, (1, seq_len))
+        position_ids = torch.arange(seq_len, dtype=torch.long).unsqueeze(0)
+        cu_seqlens = torch.tensor([0, seq_len], dtype=torch.int32)
+        max_seqlen = torch.tensor(seq_len, dtype=torch.int32)
+
+        output = model(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            cu_seqlens=cu_seqlens,
+            cu_seqlens_padded=cu_seqlens,
+            max_seqlen_q=max_seqlen,
+            max_seqlen_kv=max_seqlen,
+            qkv_format="thd",
+        )
+
+        assert output.logits.shape == (1, seq_len, config.vocab_size)
+        assert output.logits.dim() == 3
+
     def test_causal_lm_from_config(self, config, backend):
         """Test from_config classmethod."""
         from nemo_automodel.components.models.nemotron_v3.model import NemotronHForCausalLM

tests/unit_tests/utils/test_model_utils.py

Lines changed: 90 additions & 0 deletions
@@ -453,3 +453,93 @@ def forward(self, input_ids, **kwargs):
         filtered = model_utils.filter_forward_kwargs(model, batch)

         assert filtered == batch
+
+
+class TestSqueezeInputForThd:
+    """``squeeze_input_for_thd`` strips the placeholder batch dim (``[1, T, ...] -> [T, ...]``)
+    before THD attention/Mamba kernels see the inputs. The contract has to handle
+    ``input_ids=None`` because callers feeding the model via ``inputs_embeds`` only
+    (multimodal LMs, speech-language models) leave ``input_ids`` unset; the embeddings
+    are squeezed inside the model forward instead.
+    """
+
+    def _attn_kwargs(self, *, padded: bool = False, with_max_seqlen: bool = True):
+        """Build the kwargs dict in the canonical [1, num_seqs+1] layout, padded
+        with the ``-1000`` sentinel that ``squeeze_input_for_thd`` filters out."""
+        kwargs: dict = {
+            "cu_seqlens": torch.tensor([[0, 3, 5, -1000]], dtype=torch.int32),
+        }
+        if padded:
+            kwargs["cu_seqlens_padded"] = torch.tensor([[0, 4, 6, -1000]], dtype=torch.int32)
+        if with_max_seqlen:
+            kwargs["max_seqlen"] = torch.tensor([3])
+        return kwargs
+
+    def test_squeezes_input_ids_when_provided(self):
+        input_ids = torch.tensor([[1, 2, 3, 4, 5]])
+        position_ids = torch.tensor([[0, 1, 2, 0, 1]])
+        padding_mask = torch.tensor([[False, False, False, False, False]])
+        kwargs = self._attn_kwargs()
+
+        ids, pos, mask, kw = model_utils.squeeze_input_for_thd(input_ids, position_ids, padding_mask, kwargs)
+
+        assert ids.shape == (5,)
+        assert pos.shape == (5,)
+        assert mask.shape == (5,)
+        # Sentinel filtered out and dtype/shape preserved.
+        assert kw["cu_seqlens"].tolist() == [0, 3, 5]
+        assert kw["cu_seqlens"].dtype == torch.int32
+        # max_seqlen tensor → Python int.
+        assert kw["max_seqlen"] == 3
+        assert isinstance(kw["max_seqlen"], int)
+
+    def test_accepts_input_ids_none_for_inputs_embeds_callers(self):
+        """The bug: prior code did ``input_ids.squeeze(0)`` unconditionally and
+        crashed when the caller used ``inputs_embeds`` only. The fix returns
+        ``None`` for the ``input_ids`` slot and squeezes everything else."""
+        position_ids = torch.tensor([[0, 1, 2, 0, 1]])
+        padding_mask = torch.tensor([[False, False, False, False, False]])
+        kwargs = self._attn_kwargs()
+
+        ids, pos, mask, kw = model_utils.squeeze_input_for_thd(None, position_ids, padding_mask, kwargs)
+
+        assert ids is None
+        assert pos.shape == (5,)
+        assert mask.shape == (5,)
+        assert kw["cu_seqlens"].tolist() == [0, 3, 5]
+        assert kw["max_seqlen"] == 3
+
+    def test_padding_mask_none_is_passed_through(self):
+        """Existing behavior: ``padding_mask`` may be ``None`` (unmasked path).
+        The new ``input_ids=None`` branch must compose with this."""
+        position_ids = torch.tensor([[0, 1, 2, 3, 4]])
+        kwargs = self._attn_kwargs(with_max_seqlen=False)
+
+        ids, pos, mask, kw = model_utils.squeeze_input_for_thd(None, position_ids, None, kwargs)
+
+        assert ids is None
+        assert mask is None
+        assert pos.shape == (5,)
+
+    def test_3d_inputs_embeds_via_input_ids_slot_still_works(self):
+        """Belt-and-braces: the docstring claims ``input_ids`` may carry a 3D
+        ``[1, T, H]`` embedding tensor. Squeezing dim 0 of that yields ``[T, H]``."""
+        embeds = torch.randn(1, 5, 16)
+        position_ids = torch.tensor([[0, 1, 2, 3, 4]])
+        kwargs = self._attn_kwargs(with_max_seqlen=False)
+
+        ids, pos, _mask, _kw = model_utils.squeeze_input_for_thd(embeds, position_ids, None, kwargs)
+
+        assert ids.shape == (5, 16)
+        assert pos.shape == (5,)
+
+    def test_cu_seqlens_padded_filtered_alongside_cu_seqlens(self):
+        """Both ``cu_seqlens`` and ``cu_seqlens_padded`` (CP path) get the
+        sentinel filter — the bug fix must not regress this."""
+        position_ids = torch.tensor([[0, 1, 2, 0, 1]])
+        kwargs = self._attn_kwargs(padded=True, with_max_seqlen=False)
+
+        _ids, _pos, _mask, kw = model_utils.squeeze_input_for_thd(None, position_ids, None, kwargs)
+
+        assert kw["cu_seqlens"].tolist() == [0, 3, 5]
+        assert kw["cu_seqlens_padded"].tolist() == [0, 4, 6]
