
Commit 61a5116

feat(recipes): add VLM knowledge distillation recipe with chunked KD loss (#2205)
* feat(loss): add chunked KD loss for memory-efficient distillation

  Add a ``chunk_size`` knob to ``KDLoss`` that processes valid tokens in chunks
  when computing forward KL. Only one ``[chunk_size, vocab_size]`` fp32
  probability/log-prob tensor is materialized at a time, which keeps peak memory
  bounded for large-vocab VLMs while remaining numerically identical to the
  unchunked path (verified by new unit tests). The TP path is unaffected;
  chunking is opt-in via ``chunk_size > 0``.

* feat(recipes): add VLM knowledge distillation recipe

  Add ``KnowledgeDistillationRecipeForVLM`` under ``recipes/vlm/kd.py``. The
  recipe extends ``FinetuneRecipeForVLM`` with a frozen teacher
  ``NeMoAutoModelForImageTextToText``, a KD loss term, and a ``kd_ratio`` linear
  mix between CE and KD losses. The training loop forwards multimodal inputs
  (pixel_values, image_grid_thw, etc.) to both teacher and student, frees
  intermediate activations eagerly to keep peak memory low, and reports CE/KD
  sub-losses alongside the combined loss in validation metrics. Pipeline
  parallelism is not supported.

* docs(examples): add Qwen3.5 VLM KD example config

  Add an example YAML that distills Qwen3.5-9B (teacher) into Qwen3.5-4B
  (student) on the public ``mmoukouba/MedPix-VQA`` medical-image VQA dataset.
  The config exercises the chunked KD loss (``kd_loss_fn.chunk_size: 512``),
  freezes the student's vision and audio towers, and uses FSDP2.

Signed-off-by: khazic <khazzz1c@gmail.com>
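The commit message describes a ``kd_ratio`` linear mix between the CE and KD losses, but the mixing code itself is not part of the visible diff. A minimal sketch of what such a blend looks like, assuming the standard convex combination; the helper name ``mix_losses`` is illustrative, not NeMo's API:

```python
def mix_losses(ce_loss: float, kd_loss: float, kd_ratio: float) -> float:
    """Linearly blend cross-entropy and KD losses.

    kd_ratio = 0.0 -> pure CE (plain finetuning);
    kd_ratio = 1.0 -> pure KD (match the teacher distribution only).
    """
    if not 0.0 <= kd_ratio <= 1.0:
        raise ValueError("kd_ratio must lie in [0, 1]")
    return (1.0 - kd_ratio) * ce_loss + kd_ratio * kd_loss

# With the example config's kd_ratio of 0.5, both terms are weighted equally.
print(mix_losses(2.0, 4.0, 0.5))  # 3.0
```

With ``kd_ratio: 0.5`` (as in the example config below), halving either sub-loss moves the combined loss by the same amount, which is why the recipe reports CE and KD sub-losses separately in validation metrics.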
1 parent adc20e2 commit 61a5116

4 files changed

Lines changed: 752 additions & 4 deletions

File tree

examples/vlm_kd/qwen3_5/qwen3_5_vl_4b_kd.yaml (new file)

Lines changed: 124 additions & 0 deletions

@@ -0,0 +1,124 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# VLM Knowledge Distillation: Qwen3.5-9B (teacher) → Qwen3.5-4B (student)
# Dataset: MedPix-VQA (medical image VQA with real images)
#
# To run:
#   automodel examples/vlm_kd/qwen3_5/qwen3_5_vl_4b_kd.yaml --nproc-per-node 8

recipe: KnowledgeDistillationRecipeForVLM

step_scheduler:
  global_batch_size: 16
  local_batch_size: 1
  ckpt_every_steps: 200
  val_every_steps: 50
  num_epochs: 2
  max_steps: 300

dist_env:
  backend: nccl
  timeout_minutes: 10

rng:
  _target_: nemo_automodel.components.training.rng.StatefulRNG
  seed: 42
  ranked: true

# Student
model:
  _target_: nemo_automodel.NeMoAutoModelForImageTextToText.from_pretrained
  pretrained_model_name_or_path: Qwen/Qwen3.5-4B
  attn_implementation: sdpa

# Teacher
teacher_model:
  _target_: nemo_automodel.NeMoAutoModelForImageTextToText.from_pretrained
  pretrained_model_name_or_path: Qwen/Qwen3.5-9B
  attn_implementation: sdpa

processor:
  _target_: transformers.AutoProcessor.from_pretrained
  pretrained_model_name_or_path: Qwen/Qwen3.5-4B

checkpoint:
  enabled: true
  checkpoint_dir: checkpoints/qwen3_5_vl_4b_kd/
  model_save_format: safetensors
  save_consolidated: true

distributed:
  _target_: nemo_automodel.components.distributed.fsdp2.FSDP2Manager
  tp_size: 1
  cp_size: 1
  pp_size: 1
  dp_replicate_size: 1
  ep_size: 1
  sequence_parallel: false

clip_grad_norm:
  max_norm: 1.0

loss_fn:
  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy

# KD hyper-params
kd_ratio: 0.5
kd_loss_fn:
  _target_: nemo_automodel.components.loss.kd_loss.KDLoss
  ignore_index: -100
  temperature: 1.0
  fp32_upcast: true
  chunk_size: 512

optimizer:
  _target_: torch.optim.AdamW
  lr: 5e-5
  weight_decay: 0.01
  betas: [0.9, 0.95]
  eps: 1e-8

lr_scheduler:
  lr_warmup_steps: 30
  lr_decay_style: cosine

# MedPix-VQA dataset (medical images + VQA)
dataset:
  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
  path_or_dataset: mmoukouba/MedPix-VQA
  split: train

dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  num_workers: 0
  collate_fn:
    _target_: nemo_automodel.components.datasets.vlm.collate_fns.default_collate_fn

validation_dataset:
  _target_: nemo_automodel.components.datasets.vlm.datasets.make_medpix_dataset
  path_or_dataset: mmoukouba/MedPix-VQA
  split: validation

validation_dataloader:
  _target_: torchdata.stateful_dataloader.StatefulDataLoader
  num_workers: 0
  collate_fn:
    _target_: nemo_automodel.components.datasets.vlm.collate_fns.default_collate_fn

# Student: freeze vision+audio towers, train language model only
freeze_config:
  freeze_vision_tower: true
  freeze_audio_tower: true
  freeze_language_model: false
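The config's run command launches 8 processes, each with ``local_batch_size: 1``, against a ``global_batch_size`` of 16, so the step scheduler has to accumulate gradients across micro-batches. A sketch of the standard arithmetic (the function name is illustrative, not NeMo's API):

```python
def grad_accum_steps(global_batch: int, local_batch: int, dp_world_size: int) -> int:
    """Micro-batches each rank must accumulate before an optimizer step."""
    per_step = local_batch * dp_world_size  # samples consumed per forward/backward across ranks
    if global_batch % per_step != 0:
        raise ValueError("global batch must be divisible by local_batch * dp_world_size")
    return global_batch // per_step

# Example config: global 16, local 1, 8 data-parallel ranks.
print(grad_accum_steps(16, 1, 8))  # 2
```

Keeping ``local_batch_size`` at 1 is a reasonable default here: holding both a 9B teacher and a 4B student plus multimodal activations per rank is the memory bottleneck, and the global batch is recovered through accumulation.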

nemo_automodel/components/loss/kd_loss.py

Lines changed: 39 additions & 3 deletions
@@ -89,6 +89,37 @@ def _kl_forward_tp(
     return ce_local  # shape: [valid_tokens]


+def _kl_forward_chunked(
+    t_logits: torch.Tensor,
+    s_logits: torch.Tensor,
+    chunk_size: int,
+) -> torch.Tensor:
+    """Compute per-token sum(P * log Q) in chunks to reduce peak memory.
+
+    Processes ``chunk_size`` tokens at a time so that only one chunk's worth of the
+    ``[chunk_size, vocab_size]`` fp32 probability matrix is live at any moment.
+
+    Args:
+        t_logits: Teacher logits, shape ``[num_valid_tokens, vocab_size]``.
+        s_logits: Student logits, shape ``[num_valid_tokens, vocab_size]``.
+        chunk_size: Number of tokens per chunk.
+
+    Returns:
+        Per-token sum(P * log Q), shape ``[num_valid_tokens]``.
+    """
+    num_tokens = t_logits.shape[0]
+    kl_parts: list[torch.Tensor] = []
+    for start in range(0, num_tokens, chunk_size):
+        end = min(start + chunk_size, num_tokens)
+        t_chunk = t_logits[start:end]
+        s_chunk = s_logits[start:end]
+        teacher_prob = F.softmax(t_chunk, dim=-1, dtype=torch.float32)
+        student_logprob = F.log_softmax(s_chunk, dim=-1, dtype=torch.float32)
+        inf_mask = torch.isinf(s_chunk)
+        kl_parts.append(torch.masked_fill(teacher_prob * student_logprob, inf_mask, 0).sum(-1))
+    return torch.cat(kl_parts, dim=0)
+
+
 class KDLoss(nn.Module):
     """Forward KL divergence loss for knowledge distillation.

@@ -108,6 +139,10 @@ class KDLoss(nn.Module):
         tp_group: Explicit TP process group. When ``None`` (default) the group is inferred from
             the DTensor placement of ``student_logits``, or the non-TP path is used for plain
             tensors.
+        chunk_size: When positive, valid tokens are processed in chunks of this size to avoid
+            materializing the full ``[num_valid_tokens, vocab_size]`` probability matrix in fp32.
+            Reduces peak memory at the cost of slightly more kernel launches. ``0`` (default)
+            disables chunking. Ignored when using the TP path.
     """

     def __init__(
@@ -116,12 +151,14 @@ def __init__(
         temperature: float = 1.0,
         fp32_upcast: bool = True,
         tp_group: Optional[torch.distributed.ProcessGroup] = None,
+        chunk_size: int = 0,
     ):
         super().__init__()
         self.ignore_index = ignore_index
         self.temperature = temperature
         self.fp32_upcast = fp32_upcast
         self.tp_group = tp_group
+        self.chunk_size = chunk_size

     def forward(
         self,
@@ -191,12 +228,11 @@ def forward(
         # Compute per-token negative cross-entropy: sum(P * log Q).
         if tp_group is not None:
             kl_per_token = _kl_forward_tp(t_logits, s_logits, tp_group)
+        elif self.chunk_size > 0:
+            kl_per_token = _kl_forward_chunked(t_logits, s_logits, self.chunk_size)
         else:
             teacher_prob = F.softmax(t_logits, dim=-1, dtype=torch.float32)
             student_logprob = F.log_softmax(s_logits, dim=-1, dtype=torch.float32)
-            # mask out infinities originating *only* from student logits
-            # (teacher logits infs are extremely rare and do not
-            # affect gradients w.r.t. student parameters).
             inf_mask = torch.isinf(s_logits)
             kl_per_token = torch.masked_fill(teacher_prob * student_logprob, inf_mask, 0).sum(-1).view(-1)
