Commit d620b1c

fix(recipes): correct validation loss averaging in LLM KD recipe
_forward_backward_step returns per-token-averaged losses, but the validation loop accumulated them without un-averaging first. This caused val_loss to be divided twice (yielding an artificially small value) and ce_loss/kd_loss to be reported as raw sums instead of per-token means.

Multiply each per-batch loss by its num_label_tokens before accumulating, then divide by total_num_label_tokens at the end for a proper weighted average, matching the pattern used in the parent FinetuneRecipe.

Signed-off-by: khazic <khazzz1c@gmail.com>
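To make the double division concrete, here is a tiny standalone sketch with made-up numbers (batch_losses and num_tokens are hypothetical, not values from the recipe):

    # Two validation batches; each entry in batch_losses is the per-token mean
    # that _forward_backward_step would return for that batch.
    batch_losses = [2.0, 4.0]
    num_tokens = [10, 30]

    # Buggy accumulation: the per-batch means are summed, then divided by the
    # token count again, so the result is effectively divided twice.
    buggy = sum(batch_losses) / sum(num_tokens)  # 6.0 / 40 = 0.15

    # Fixed accumulation: un-average each batch first, then divide once.
    weighted = sum(l * n for l, n in zip(batch_losses, num_tokens)) / sum(num_tokens)
    # (2.0 * 10 + 4.0 * 30) / 40 = 140.0 / 40 = 3.5

The true per-token loss here is 3.5; the buggy path reports 0.15, the artificially small value the commit message describes.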
1 parent: 174ba8d

1 file changed: nemo_automodel/recipes/llm/kd.py (22 additions, 11 deletions)
@@ -797,9 +797,9 @@ def _run_validation_epoch(self, val_dataloader):
         for mp in self.model_parts:
             mp.eval()
 
-        total_loss = torch.tensor(0.0, dtype=torch.float32, device=self.dist_env.device)
-        ce_loss = torch.tensor(0.0, dtype=torch.float32, device=self.dist_env.device)
-        kd_loss = torch.tensor(0.0, dtype=torch.float32, device=self.dist_env.device)
+        total_loss = 0.0
+        total_ce_loss = 0.0
+        total_kd_loss = 0.0
         total_num_label_tokens = 0
 
         for batch in val_dataloader:
@@ -811,24 +811,35 @@ def _run_validation_epoch(self, val_dataloader):
                 num_batches=1,
                 is_train=False,
             )
+            # _forward_backward_step returns per-token-averaged losses.
+            # Multiply back by num_label_tokens to get the raw sum for
+            # correct weighted averaging across batches.
+            total_loss += local_loss.item() * num_label_tokens
+            total_ce_loss += _ce_loss.item() * num_label_tokens
+            total_kd_loss += _kd_loss.item() * num_label_tokens
             total_num_label_tokens += num_label_tokens
-            ce_loss += _ce_loss
-            kd_loss += _kd_loss
-            total_loss += local_loss
 
-        total_loss = self._dp_allreduce(total_loss, include_cp=True).item()
-        ce_loss = self._dp_allreduce(ce_loss, include_cp=True).item()
-        kd_loss = self._dp_allreduce(kd_loss, include_cp=True).item()
+        total_loss = self._dp_allreduce(
+            torch.tensor(total_loss, dtype=torch.float32, device=self.dist_env.device), include_cp=True
+        ).item()
+        total_ce_loss = self._dp_allreduce(
+            torch.tensor(total_ce_loss, dtype=torch.float32, device=self.dist_env.device), include_cp=True
+        ).item()
+        total_kd_loss = self._dp_allreduce(
+            torch.tensor(total_kd_loss, dtype=torch.float32, device=self.dist_env.device), include_cp=True
+        ).item()
         total_num_label_tokens = self._dp_allreduce(torch.tensor(total_num_label_tokens, dtype=torch.long)).item()
 
         val_loss = total_loss / max(total_num_label_tokens, 1e-8)
+        val_ce_loss = total_ce_loss / max(total_num_label_tokens, 1e-8)
+        val_kd_loss = total_kd_loss / max(total_num_label_tokens, 1e-8)
         return MetricsSample(
             step=self.step_scheduler.step,
             epoch=self.step_scheduler.epoch,
             metrics={
                 "val_loss": val_loss,
-                "ce_loss": ce_loss,
-                "kd_loss": kd_loss,
+                "ce_loss": val_ce_loss,
+                "kd_loss": val_kd_loss,
                 "lr": self.optimizer[0].param_groups[0]["lr"],
                 "num_label_tokens": total_num_label_tokens,
                 "mem": torch.cuda.max_memory_allocated() / 1024**3,
