
Commit d1a8ff8

restrict Gemma4 manual CP route
1 parent 25ae433 commit d1a8ff8

2 files changed

Lines changed: 38 additions & 4 deletions


nemo_automodel/components/distributed/cp_utils.py

Lines changed: 4 additions & 4 deletions
@@ -503,10 +503,10 @@ def _get_mesh_size(mesh):
     # Gemma4 needs a local-query/global-key attention mask that PyTorch's
     # ring-template CP path cannot represent. Its pre-embed step marks the
     # batch so we use explicit contiguous sequence sharding and let
-    # attach_cp_sdpa_hooks all-gather K/V and token types inside attention.
-    manual_allgather = (
-        bool(batch.pop("_cp_manual_allgather", False)) or "mm_token_type_ids" in batch or "_packed_seq_ids" in batch
-    )
+    # attach_cp_sdpa_hooks all-gather K/V and token metadata inside attention.
+    # Metadata such as mm_token_type_ids or _packed_seq_ids does not select this
+    # path by itself because other VLMs can carry those fields.
+    manual_allgather = bool(batch.pop("_cp_manual_allgather", False))
 
     # Remove attention_mask from the batch so the model does not attempt to
     # build a local 4D mask with the wrong key length. Preserve padding
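
A minimal sketch of the caller side of this contract, assuming the Gemma4 pre-embed step simply sets the batch key that cp_utils.py pops during routing. The helper name below is invented for illustration; only the "_cp_manual_allgather" key and the make_cp_batch_and_ctx routing come from this diff.

from typing import Any, Dict

import torch


def mark_batch_for_manual_cp(batch: Dict[str, Any]) -> Dict[str, Any]:
    """Hypothetical helper: opt a Gemma4 batch into the manual all-gather CP route."""
    # cp_utils.make_cp_batch_and_ctx pops this key; after this commit, metadata
    # such as mm_token_type_ids or _packed_seq_ids no longer selects the route.
    batch["_cp_manual_allgather"] = True
    return batch


batch = mark_batch_for_manual_cp(
    {
        "input_ids": torch.tensor([[1, 2, 3, 4]]),
        "labels": torch.tensor([[1, 2, 3, 4]]),
        "mm_token_type_ids": torch.tensor([[0, 1, 1, 0]]),  # VLM metadata alone is no longer enough
    }
)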

tests/unit_tests/distributed/test_cp_utils.py

Lines changed: 34 additions & 0 deletions
@@ -146,6 +146,7 @@ def test_make_cp_batch_and_ctx_pads_to_cp_load_balance_multiple(monkeypatch):
         "input_ids": torch.tensor([[1, 2, 3]]),
         "labels": torch.tensor([[1, 2, 3]]),
         "mm_token_type_ids": torch.tensor([[0, 1, 0]]),
+        "_cp_manual_allgather": True,
     }
 
     _cu.make_cp_batch_and_ctx(device_mesh, batch, padding_token_id=99)
@@ -156,6 +157,37 @@ def test_make_cp_batch_and_ctx_pads_to_cp_load_balance_multiple(monkeypatch):
     assert batch["mm_token_type_ids"][0, -1].item() == 0
 
 
+def test_make_cp_batch_and_ctx_mm_token_type_ids_do_not_select_manual_allgather(monkeypatch):
+    """VLM metadata alone should not opt non-Gemma4 models into manual all-gather CP."""
+    device_mesh = _DummyDeviceMesh(cp_size=2, tp_size=1)
+    calls = {}
+
+    def fake_create_context_parallel_ctx(**kwargs):
+        calls["cp_buffers"] = kwargs["cp_buffers"]
+        return "cp_ctx"
+
+    def fake_get_train_context(enable_loss_parallel, enable_compiled_autograd, cp_context=None):
+        calls["cp_context"] = cp_context
+        return contextlib.nullcontext
+
+    monkeypatch.setattr(_cu, "create_context_parallel_ctx", fake_create_context_parallel_ctx)
+    monkeypatch.setattr(_cu, "get_train_context", fake_get_train_context)
+
+    batch = {
+        "input_ids": torch.tensor([[1, 2, 3, 4]]),
+        "labels": torch.tensor([[1, 2, 3, 4]]),
+        "mm_token_type_ids": torch.tensor([[0, 1, 1, 0]]),
+    }
+
+    ctx_obj, new_batch = _cu.make_cp_batch_and_ctx(device_mesh, batch, padding_token_id=99)
+
+    assert ctx_obj is contextlib.nullcontext
+    assert calls["cp_context"] == "cp_ctx"
+    assert len(calls["cp_buffers"]) == 3
+    assert torch.equal(new_batch["input_ids"], torch.tensor([[1, 2, 3, 4]]))
+    assert torch.equal(new_batch["mm_token_type_ids"], torch.tensor([[0, 1, 1, 0]]))
+
+
 def test_make_cp_batch_and_ctx_supports_inputs_embeds_and_per_layer_inputs(monkeypatch):
     """Gemma4 CP pre-embedding path should shard inputs_embeds side inputs."""
     device_mesh = _DummyDeviceMesh(cp_size=2, tp_size=1)
@@ -167,6 +199,7 @@ def test_make_cp_batch_and_ctx_supports_inputs_embeds_and_per_layer_inputs(monkeypatch):
         "labels": labels,
         "per_layer_inputs": per_layer_inputs,
         "mm_token_type_ids": torch.zeros(1, 4, dtype=torch.long),
+        "_cp_manual_allgather": True,
     }
 
     _cu.make_cp_batch_and_ctx(device_mesh, batch)
@@ -184,6 +217,7 @@ def test_make_cp_batch_and_ctx_pads_and_slices_packed_seq_ids(monkeypatch):
         "input_ids": torch.tensor([[1, 2, 3]]),
         "labels": torch.tensor([[1, 2, 3]]),
         "_packed_seq_ids": torch.tensor([[1, 1, 2]]),
+        "_cp_manual_allgather": True,
     }
 
     _cu.make_cp_batch_and_ctx(device_mesh, batch, padding_token_id=99)
