Commit 3b86b96

fix(deepseek-v4): keep MoE routing scores and attention softmax in fp32
Two precision issues that compound across 61 layers and degrade backbone parity
vs. the reference (observed during MTP parity testing in #2191):

1. The sqrtsoftplus Gate cast routing scores back to bf16 immediately after
   computing sqrt(softplus(x.float())), losing precision for expert selection.
   The HashGate counterpart stays in fp32. Remove the .to(scores.dtype) cast
   so non-hash layers match.

2. eager_attention_with_sink ran softmax in the input dtype (bf16 under
   autocast). Force fp32 softmax for numerical stability, matching standard
   practice.

Also fix a stale docstring claiming compress-ratio attention was not yet
implemented; it has been wired in.

Signed-off-by: khazic <khazzz1c@gmail.com>
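To make the gate issue concrete, here is a minimal standalone sketch (not code from this repo) of how the removed bf16 round-trip can collapse routing scores that fp32 still distinguishes, which is exactly what perturbs top-k expert selection:

```python
import torch
import torch.nn.functional as F

# Two experts whose router logits are nearly tied.
logits = torch.tensor([1.000, 1.001])

scores_fp32 = torch.sqrt(F.softplus(logits))  # kept in fp32 (post-fix)
scores_bf16 = scores_fp32.to(torch.bfloat16)  # the removed cast (pre-fix)

print(scores_fp32[0] == scores_fp32[1])  # tensor(False): fp32 separates the experts
print(scores_bf16[0] == scores_bf16[1])  # tensor(True): bf16's 8 significand bits collapse them
```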
1 parent 41786e2 commit 3b86b96

2 files changed: 5 additions & 4 deletions

nemo_automodel/components/models/deepseek_v4/layers.py

Lines changed: 4 additions & 3 deletions

```diff
@@ -46,8 +46,9 @@
 See ``_hc_split_sinkhorn`` for the pure-torch port of the reference mixer
 (ported from miles PR 1045's ``kernel/sinkhorn.py``).

-Sliding-window / compress-ratio attention is NOT yet implemented.
-All layers use full causal attention regardless of compress_ratios.
+Compress-ratio attention (Compressor + Indexer) is wired into
+DeepseekV4Attention.forward for layers with compress_ratio > 0.
+All layers share the same sliding-window causal mask on the local KV path.
 """

 from __future__ import annotations
@@ -473,7 +474,7 @@ def eager_attention_with_sink(
     sinks = module.sinks.reshape(1, -1, 1, 1).expand(query.shape[0], -1, query.shape[-2], -1)
     combined = torch.cat([attn_weights, sinks.to(attn_weights.dtype)], dim=-1)
     combined = combined - combined.max(dim=-1, keepdim=True).values
-    probs = F.softmax(combined, dim=-1, dtype=combined.dtype)[..., :-1]
+    probs = F.softmax(combined, dim=-1, dtype=torch.float32)[..., :-1]
     probs = F.dropout(probs, p=dropout, training=module.training).to(value_states.dtype)
     return torch.matmul(probs, value_states).transpose(1, 2).contiguous(), probs

```
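For context, here is a self-contained sketch of the pattern this hunk fixes; the function name and tensor shapes are illustrative assumptions, not the repo's exact API. A learned per-head sink logit is appended as an extra column, the softmax runs in fp32 even when the inputs are bf16 under autocast, and the sink column is dropped before the value matmul:

```python
import torch
import torch.nn.functional as F

def eager_sink_softmax(attn_weights: torch.Tensor, sinks: torch.Tensor) -> torch.Tensor:
    """attn_weights: [batch, heads, q_len, kv_len]; sinks: [heads] learned logits."""
    b, h, q, _ = attn_weights.shape
    sink_col = sinks.reshape(1, -1, 1, 1).expand(b, h, q, 1).to(attn_weights.dtype)
    combined = torch.cat([attn_weights, sink_col], dim=-1)
    combined = combined - combined.max(dim=-1, keepdim=True).values  # stabilize the exps
    # fp32 softmax regardless of input dtype, then drop the sink column.
    return F.softmax(combined, dim=-1, dtype=torch.float32)[..., :-1]

# bf16 inputs, as under autocast; the returned probabilities are fp32.
probs = eager_sink_softmax(torch.randn(2, 4, 8, 16, dtype=torch.bfloat16), torch.zeros(4))
```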

nemo_automodel/components/moe/layers.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -355,7 +355,7 @@ def forward(
             weights = original_scores.gather(1, indices)
         elif self.score_func == "sqrtsoftplus":
             # sqrt(softplus(x)) = sqrt(log(1 + exp(x))), used in DeepSeek V4.
-            scores = torch.sqrt(F.softplus(scores.float())).to(scores.dtype)
+            scores = torch.sqrt(F.softplus(scores.float()))
             original_scores = scores

         if self.e_score_correction_bias is not None:
```
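And a hypothetical reduction of the gate's scoring path after this change (the function name and signature are invented for illustration): scores now stay in fp32 all the way through top-k selection, matching the HashGate path.

```python
import torch
import torch.nn.functional as F

def route_sqrtsoftplus(logits: torch.Tensor, top_k: int):
    """logits: [tokens, experts], possibly bf16 under autocast."""
    # sqrt(softplus(x)) = sqrt(log(1 + exp(x))); no cast back to logits.dtype,
    # so the top-k below sees full fp32 resolution.
    scores = torch.sqrt(F.softplus(logits.float()))
    weights, indices = scores.topk(top_k, dim=-1)
    return weights, indices

weights, indices = route_sqrtsoftplus(torch.randn(16, 64, dtype=torch.bfloat16), top_k=8)
```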
