@@ -141,7 +141,9 @@ def forward(

         if position_ids is None:
             seq_len = embed_input.shape[1]
-            position_ids = torch.arange(seq_len, device=embed_input.device).unsqueeze(0).expand(embed_input.shape[0], -1)
+            position_ids = (
+                torch.arange(seq_len, device=embed_input.device).unsqueeze(0).expand(embed_input.shape[0], -1)
+            )
         position_embeddings = self._rotary_emb(embed_input, position_ids)
         position_embeddings_compress = self._rotary_emb_compress(embed_input, position_ids)
@@ -235,9 +237,7 @@ def forward(
         per_depth_h: list[torch.Tensor] = []
         cur_input_ids = input_ids
         if embed_inputs is not None and len(embed_inputs) != len(self.layers):
-            raise ValueError(
-                f"Expected {len(self.layers)} MTP embedding tensors, got {len(embed_inputs)}"
-            )
+            raise ValueError(f"Expected {len(self.layers)} MTP embedding tensors, got {len(embed_inputs)}")
         if embed_inputs is None and (cur_input_ids is None or embed_fn is None):
             raise ValueError("MTP requires either embed_inputs or both input_ids and embed_fn")
@@ -263,7 +263,9 @@ def forward(
 def build_mtp_config_from_hf(config, *, loss_scaling_factor: float = 0.1) -> MTPConfig:
     """Build an MTPConfig from a DeepseekV4Config."""
     num_layers = int(getattr(config, "num_nextn_predict_layers", 0) or 0)
-    return MTPConfig(num_layers=num_layers, layer_pattern="*" if num_layers > 0 else "", loss_scaling_factor=loss_scaling_factor)
+    return MTPConfig(
+        num_layers=num_layers, layer_pattern="*" if num_layers > 0 else "", loss_scaling_factor=loss_scaling_factor
+    )


 def build_deepseek_v4_mtp(
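As a quick sanity check of the reformatted builder above, here is a minimal usage sketch of `build_mtp_config_from_hf`. The `SimpleNamespace` stand-in for a `DeepseekV4Config` and the chosen field values are assumptions for illustration only; they are not part of this PR, and the helper is assumed to be importable from the module touched in this diff.

```python
# Minimal sketch (assumed usage, not part of this PR): exercise
# build_mtp_config_from_hf with a stand-in for DeepseekV4Config.
from types import SimpleNamespace

# Hypothetical config carrying only the field the builder reads.
hf_config = SimpleNamespace(num_nextn_predict_layers=2)

mtp_config = build_mtp_config_from_hf(hf_config, loss_scaling_factor=0.1)
assert mtp_config.num_layers == 2
assert mtp_config.layer_pattern == "*"  # enabled when num_layers > 0

# With zero MTP layers, the pattern collapses to the empty string.
empty = build_mtp_config_from_hf(SimpleNamespace(num_nextn_predict_layers=0))
assert empty.num_layers == 0 and empty.layer_pattern == ""
```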