
Commit 3b1a513

Merge branch 'main' into khazic/fix/pp-vlm-preserve-forward
2 parents 1a09f17 + 174ba8d commit 3b1a513

17 files changed: 840 additions & 123 deletions


nemo_automodel/_transformers/retrieval.py

Lines changed: 147 additions & 21 deletions
@@ -22,6 +22,7 @@
 import torch.nn as nn
 import torch.nn.functional as F
 from transformers import AutoConfig, AutoModel, AutoModelForSequenceClassification, PreTrainedModel
+from transformers.models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING
 from transformers.utils import logging
 
 from nemo_automodel._transformers.registry import ModelRegistry
@@ -30,6 +31,114 @@
 logger = logging.get_logger(__name__)
 
 
+def _extract_submodel(model: nn.Module, extract_submodel: str) -> PreTrainedModel:
+    """Extract a nested submodel from a loaded model using a dotted attribute path."""
+    extracted_model = model
+    for attr in extract_submodel.split("."):
+        extracted_model = getattr(extracted_model, attr)
+    if not hasattr(extracted_model, "config"):
+        raise ValueError(
+            f"Extracted submodel at '{extract_submodel}' has no .config attribute. "
+            f"The submodel must be a PreTrainedModel for save/reload to work. "
+            f"Got {type(extracted_model).__name__}."
+        )
+    return extracted_model
+
+
+def _get_supported_backbone_class(model_type: str, task: str) -> type[nn.Module] | None:
+    """Return the registered retrieval backbone class for a model type and task."""
+    task_map = SUPPORTED_BACKBONES.get(model_type.lower())
+    if task_map is None:
+        return None
+
+    arch_name = task_map.get(task)
+    if arch_name is None:
+        raise ValueError(
+            f"Unsupported task '{task}' for model type '{model_type}'. Available tasks: {', '.join(task_map)}."
+        )
+
+    if arch_name not in ModelRegistry.model_arch_name_to_cls:
+        raise ValueError(f"Model class '{arch_name}' not found in ModelRegistry.")
+
+    logger.info(f"Using {arch_name} from registry")
+    return ModelRegistry.model_arch_name_to_cls[arch_name]
+
+
+def _move_to_extracted_dtype(model: nn.Module, extracted_model: nn.Module) -> nn.Module:
+    """Move a newly-built model to the dtype used by the extracted model."""
+    for parameter in extracted_model.parameters():
+        return model.to(dtype=parameter.dtype)
+    for buffer in extracted_model.buffers():
+        return model.to(dtype=buffer.dtype)
+    return model
+
+
+def _load_from_extracted_state(
+    backbone_class: type[PreTrainedModel],
+    config,
+    extracted_model: PreTrainedModel,
+) -> PreTrainedModel:
+    """Load a target backbone from an extracted model's in-memory state dict."""
+    # Use the base HF loader because some retrieval classes override
+    # from_pretrained for path-based checkpoint loading.
+    backbone = PreTrainedModel.from_pretrained.__func__(
+        backbone_class,
+        None,
+        config=config,
+        state_dict=extracted_model.state_dict(),
+    )
+    return _move_to_extracted_dtype(backbone, extracted_model)
+
+
+def _build_backbone_from_extracted_submodel(
+    extracted_model: PreTrainedModel,
+    task: str,
+    pooling: Optional[str],
+    num_labels: Optional[int],
+    temperature: Optional[float],
+) -> PreTrainedModel:
+    """Build a task-specific retrieval backbone from an extracted text submodel."""
+    text_config = extracted_model.config
+    model_type = getattr(text_config, "model_type", "")
+    task_map = SUPPORTED_BACKBONES.get(model_type.lower())
+    has_supported_target = task_map is not None and task in task_map
+
+    if task_map is not None and not has_supported_target and task != "score":
+        raise ValueError(
+            f"Unsupported task '{task}' for model type '{model_type}'. Available tasks: {', '.join(task_map)}."
+        )
+
+    if task == "score" and not has_supported_target:
+        config = text_config.__class__.from_dict(text_config.to_dict())
+        try:
+            backbone_class = MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING[type(config)]
+        except KeyError as exc:
+            raise ValueError(f"No HuggingFace sequence-classification model found for '{model_type}'.") from exc
+    elif not has_supported_target:
+        return extracted_model
+    else:
+        backbone_class = _get_supported_backbone_class(model_type, task)
+        config_class = getattr(backbone_class, "config_class", None)
+        if config_class is None or not hasattr(text_config, "to_dict"):
+            return extracted_model
+
+        config_dict = text_config.to_dict()
+        config_dict.pop("model_type", None)
+        config = config_class(**config_dict)
+
+    attn_implementation = getattr(text_config, "_attn_implementation", None)
+    if attn_implementation is not None:
+        config._attn_implementation = attn_implementation
+    if has_supported_target and pooling is not None:
+        config.pooling = pooling
+    if num_labels is not None:
+        config.num_labels = num_labels
+    if has_supported_target and temperature is not None:
+        config.temperature = temperature
+
+    return _load_from_extracted_state(backbone_class, config, extracted_model)
+
+
 def pool(last_hidden_states: torch.Tensor, attention_mask: torch.Tensor, pool_type: str) -> torch.Tensor:
     """
     Pool hidden states using the specified pooling method.
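
The extraction helper above is plain dotted-attribute traversal plus a guard that the result carries a .config. A minimal standalone sketch of the traversal (the ToyVLM class is invented for illustration; the real helper additionally rejects submodules that lack a .config attribute):

import torch.nn as nn

# Toy stand-in for a VLM whose text tower sits at model.language_model
# (classes invented for this sketch; real models come from transformers).
class ToyVLM(nn.Module):
    def __init__(self):
        super().__init__()
        self.vision_tower = nn.Conv2d(3, 8, 3)
        self.language_model = nn.Linear(8, 8)

target = ToyVLM()
for attr in "language_model".split("."):
    target = getattr(target, attr)
print(type(target).__name__)  # Linear: the dotted path walked to the text tower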
@@ -100,15 +209,25 @@ def build_encoder_backbone(
     task: str,
     trust_remote_code: bool = False,
     pooling: Optional[str] = None,
+    extract_submodel: Optional[str] = None,
+    num_labels: Optional[int] = None,
+    temperature: Optional[float] = None,
     **hf_kwargs,
 ) -> PreTrainedModel:
     """Build an encoder backbone from a pretrained checkpoint.
 
-    For model types listed in :data:`SUPPORTED_BACKBONES`, resolves the
-    custom bidirectional architecture class from :class:`ModelRegistry`.
-    For all other model types, falls back to
-    ``AutoModel.from_pretrained`` (or ``AutoModelForSequenceClassification``
-    for the ``"score"`` task).
+    When ``extract_submodel`` is set, loads the parent model with HuggingFace
+    Auto classes and extracts the dotted path. For supported extracted text
+    backbones, it then builds the registered retrieval class for the requested
+    task (bidirectional base model for ``"embedding"``, sequence-classification
+    wrapper for ``"score"``). For unsupported extracted text backbones, it
+    returns the extracted model for ``"embedding"`` and wraps it with
+    ``AutoModelForSequenceClassification`` for ``"score"``.
+
+    Without ``extract_submodel``, model types listed in
+    :data:`SUPPORTED_BACKBONES` resolve to custom bidirectional classes from
+    :class:`ModelRegistry`; all other model types fall back to HuggingFace Auto
+    classes.
 
     Args:
         model_name_or_path: Path or HuggingFace Hub identifier.
@@ -117,6 +236,10 @@ def build_encoder_backbone(
         pooling: Bi-encoder pooling strategy for registry backbones (e.g. Llama bidirectional)
             that accept it on ``from_pretrained``. Must not be forwarded to standard HF models
             (e.g. Qwen3) loaded via ``AutoModel``; those only receive ``**hf_kwargs``.
+        extract_submodel: Dotted attribute path to extract from the loaded model
+            (e.g. ``"language_model"`` to extract the text backbone from a VLM).
+        num_labels: Number of labels for reranking/classification backbones.
+        temperature: Optional retrieval score temperature for custom retrieval backbones.
         **hf_kwargs: Extra keyword arguments forwarded to ``from_pretrained``.
 
     Returns:
@@ -129,29 +252,34 @@ def build_encoder_backbone(
     config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code)
     model_type = getattr(config, "model_type", "")
 
-    task_map = SUPPORTED_BACKBONES.get(model_type.lower())
-
-    if task_map is not None:
-        arch_name = task_map.get(task)
-        if arch_name is None:
-            raise ValueError(
-                f"Unsupported task '{task}' for model type '{model_type}'. Available tasks: {', '.join(task_map)}."
-            )
-
-        if arch_name not in ModelRegistry.model_arch_name_to_cls:
-            raise ValueError(f"Model class '{arch_name}' not found in ModelRegistry.")
-
-        BidirectionalModelClass = ModelRegistry.model_arch_name_to_cls[arch_name]
-        logger.info(f"Using {arch_name} from registry")
+    if extract_submodel is not None:
+        logger.info(f"Loading {model_name_or_path} with HuggingFace Auto classes to extract {extract_submodel}")
+        model = AutoModel.from_pretrained(model_name_or_path, trust_remote_code=trust_remote_code, **hf_kwargs)
+        extracted_model = _extract_submodel(model, extract_submodel)
+        return _build_backbone_from_extracted_submodel(
+            extracted_model,
+            task=task,
+            pooling=pooling,
+            num_labels=num_labels,
+            temperature=temperature,
+        )
 
+    BidirectionalModelClass = _get_supported_backbone_class(model_type, task)
+    if BidirectionalModelClass is not None:
         if pooling is not None:
             hf_kwargs["pooling"] = pooling
+        if num_labels is not None:
+            hf_kwargs["num_labels"] = num_labels
+        if temperature is not None:
+            hf_kwargs["temperature"] = temperature
         return BidirectionalModelClass.from_pretrained(
             model_name_or_path, trust_remote_code=trust_remote_code, **hf_kwargs
         )
 
     # Fallback: use HuggingFace Auto classes for model types not in SUPPORTED_BACKBONES
     logger.info(f"Model type '{model_type}' not in SUPPORTED_BACKBONES; falling back to HuggingFace Auto classes")
+    if task == "score" and num_labels is not None:
+        hf_kwargs["num_labels"] = num_labels
     if task == "score":
         return AutoModelForSequenceClassification.from_pretrained(
             model_name_or_path, trust_remote_code=trust_remote_code, **hf_kwargs
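
Taken together, the new path means a VLM checkpoint can feed the retrieval stack directly. A hypothetical call under assumed names (the checkpoint id and pooling value are placeholders, not from this commit):

from nemo_automodel._transformers.retrieval import build_encoder_backbone

# Placeholder Hub id for a VLM whose text tower lives at .language_model.
backbone = build_encoder_backbone(
    "org/example-vlm",
    task="embedding",
    pooling="avg",                      # assumed pooling name; valid options depend on the registry class
    extract_submodel="language_model",  # dotted path into the loaded parent model
)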
@@ -205,8 +333,6 @@ def save_encoder_pretrained(model: nn.Module, save_directory: str, **kwargs) ->
     "llama_bidirec": _LLAMA_TASKS,
     "ministral3": _MINISTRAL3_BIDIREC_TASKS,
     "ministral3_bidirec": _MINISTRAL3_BIDIREC_TASKS,
-    # Mistral3-VL Hub configs use top-level model_type "mistral3" (language is nested under text_config).
-    "mistral3": _MINISTRAL3_BIDIREC_TASKS,
 }
 
 
nemo_automodel/components/datasets/vlm/collate_fns.py

Lines changed: 11 additions & 3 deletions
@@ -871,6 +871,10 @@ def kimi_k25_vl_collate_fn(
     all_expanded = []
     all_pixel_values = []
    all_grid_thws = []
+    # Per-sample image counts, kept in lockstep with all_expanded so that
+    # n_images_per_sample length matches batch_size downstream. Samples that
+    # are text-only or whose image region was orphaned by truncation get 0.
+    per_sample_image_count: List[int] = []
 
     for i, conversation in enumerate(conversations):
         # Collect medias for this conversation
@@ -923,12 +927,14 @@ def kimi_k25_vl_collate_fn(
 
         # Only include image data if all expanded image tokens survived truncation.
         # Partial truncation into image regions would cause a mismatch in the model forward.
+        sample_image_count = 0
         if grid_thws is not None:
             merge_h, merge_w = _DEFAULT_MERGE_KERNEL
             expected_image_tokens = sum(int((h // merge_h) * (w // merge_w)) for _, h, w in grid_thws.tolist())
             actual_image_tokens = (input_ids == media_token_id).sum().item()
             if actual_image_tokens == expected_image_tokens:
                 all_grid_thws.append(grid_thws)
+                sample_image_count = int(grid_thws.shape[0])
                 if "pixel_values" in sample_batch:
                     all_pixel_values.append(sample_batch["pixel_values"])
                 else:
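
The survival check compares how many placeholder tokens remain after truncation with how many the patch grids imply. The arithmetic, in a small self-contained sketch with illustrative grid values and an assumed 2x2 merge kernel:

import torch

grid_thws = torch.tensor([[1, 8, 12], [1, 4, 6]])  # illustrative (t, h, w) patch grids for two images
merge_h, merge_w = 2, 2                            # assumed kernel, mirroring _DEFAULT_MERGE_KERNEL
expected = sum(int((h // merge_h) * (w // merge_w)) for _, h, w in grid_thws.tolist())
print(expected)  # (4*6) + (2*3) = 30 placeholder tokens the sample must still contain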
@@ -943,6 +949,7 @@ def kimi_k25_vl_collate_fn(
                 "attention_mask": attention_mask,
             }
         )
+        per_sample_image_count.append(sample_image_count)
 
     if not all_expanded:
         raise ValueError(
@@ -990,9 +997,10 @@ def kimi_k25_vl_collate_fn(
         result["grid_thws"] = torch.cat(all_grid_thws, dim=0)
         # Also add as image_grid_hws for PP chunking in finetune.py
         result["image_grid_hws"] = result["grid_thws"][:, 1:]  # [N, 3] -> [N, 2] (drop temporal dim, keep H,W)
-        # Per-sample image counts for PP chunking
-        image_counts = [g.shape[0] for g in all_grid_thws]
-        result["n_images_per_sample"] = torch.tensor(image_counts, dtype=torch.long)
+        # Per-sample image counts for PP chunking. Length must equal batch_size,
+        # so include zeros for text-only samples and for samples whose image
+        # region was orphaned by truncation.
+        result["n_images_per_sample"] = torch.tensor(per_sample_image_count, dtype=torch.long)
 
     # Build labels
     labels = build_labels_from_template(
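
The fix is bookkeeping discipline: append exactly one count per sample, zero included, so the tensor length always equals batch_size. A toy illustration (sample layout invented):

import torch

# Toy batch: sample 0 keeps 2 images, sample 1 is text-only, sample 2 lost its
# image region to truncation. One append per sample keeps the count vector in
# lockstep with the batch.
kept_image_counts = [2, None, None]  # None = no usable image region
per_sample_image_count = [c if c is not None else 0 for c in kept_image_counts]
n_images_per_sample = torch.tensor(per_sample_image_count, dtype=torch.long)
assert n_images_per_sample.tolist() == [2, 0, 0]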

nemo_automodel/components/distributed/parallelizer.py

Lines changed: 32 additions & 16 deletions
@@ -1195,46 +1195,55 @@ def validate_tp_mesh(model, tp_mesh):
     )
 
 
-def _find_largest_module_list(model: nn.Module) -> Optional[nn.ModuleList]:
+def _find_largest_module_list(model: nn.Module) -> Optional[Union[nn.ModuleList, nn.ModuleDict]]:
     """
-    Heuristic function to find the largest nn.ModuleList in a model.
+    Heuristic function to find the largest layer container in a model.
 
-    This function recursively traverses the model to find all nn.ModuleList instances
-    and returns the one with the most modules. This is useful as a fallback when
-    the model architecture is unknown, since transformer layers are typically
-    organized in ModuleLists.
+    This function recursively traverses the model to find all nn.ModuleList and
+    pipeline-split nn.ModuleDict instances and returns the one with the most
+    modules. This is useful as a fallback when the model architecture is unknown,
+    since transformer layers are typically organized in ModuleLists. Pipeline
+    splitting converts ModuleLists to ModuleDicts keyed by original layer index.
 
     Args:
         model (nn.Module): The model to search through.
 
     Returns:
-        Optional[nn.ModuleList]: The largest ModuleList found, or None if no ModuleList exists.
+        Optional[Union[nn.ModuleList, nn.ModuleDict]]: The largest layer container found, or None.
     """
-    largest_module_list = None
+    largest_module_list: Optional[Union[nn.ModuleList, nn.ModuleDict]] = None
     largest_size = 0
 
+    def _is_pp_layer_module_dict(module: nn.ModuleDict) -> bool:
+        # functional.py converts split ModuleLists to ModuleDicts with stringified
+        # numeric indices. Avoid treating arbitrary named ModuleDicts (for example
+        # adapter registries) as transformer layer containers in the heuristic path.
+        return all(key.isdigit() for key in module.keys())
+
     def _recursive_search(module: nn.Module, path: str = ""):
         nonlocal largest_module_list, largest_size
 
         for name, child in module.named_children():
             current_path = f"{path}.{name}" if path else name
 
-            if isinstance(child, nn.ModuleList):
+            if isinstance(child, nn.ModuleList) or (
+                isinstance(child, nn.ModuleDict) and _is_pp_layer_module_dict(child)
+            ):
                 current_size = len(child)
                 if current_size > largest_size:
                     largest_size = current_size
                     largest_module_list = child
-                    logger.debug(f"Found ModuleList at {current_path} with {current_size} modules")
+                    logger.debug(f"Found {type(child).__name__} at {current_path} with {current_size} modules")
 
             # Continue recursive search
             _recursive_search(child, current_path)
 
     _recursive_search(model)
 
     if largest_module_list is not None:
-        logger.info(f"Largest ModuleList found with {largest_size} modules")
+        logger.info(f"Largest layer container found with {largest_size} modules")
     else:
-        logger.warning("No ModuleList found in the model")
+        logger.warning("No ModuleList or ModuleDict found in the model")
 
     return largest_module_list
 
@@ -1320,6 +1329,8 @@ def _extend_layers(layers, modules):
     for m in modules:
         if isinstance(m, nn.ModuleList):
             layers.extend(m)
+        elif isinstance(m, nn.ModuleDict):
+            layers.extend(m.values())
         else:
             layers.append(m)
 
@@ -1338,15 +1349,20 @@ def _extend_layers(layers, modules):
     elif hasattr(model, "layers"):
         layers.extend(model.layers)
     else:
-        # Use heuristic to find the largest ModuleList in the model
+        # Use heuristic to find the largest layer container in the model.
         logger.warning(f"Unknown model type: {model_cls}. Using heuristic to find transformer layers.")
         largest_module_list = _find_largest_module_list(model)
         if largest_module_list is None:
-            # If no ModuleList found, still raise an exception
+            # If no layer container is found, still raise an exception.
             print(model)
-            raise ValueError(f"Unknown model type: {model_cls} and no ModuleList found in model structure")
+            raise ValueError(
+                f"Unknown model type: {model_cls} and no ModuleList or ModuleDict found in model structure"
+            )
 
-        layers.extend(largest_module_list)
+        if isinstance(largest_module_list, nn.ModuleDict):
+            layers.extend(largest_module_list.values())
+        else:
+            layers.extend(largest_module_list)
         logger.info(f"Successfully extracted {len(largest_module_list)} layers using heuristic")
 
     assert all(isinstance(m, nn.Module) for m in layers), "layers should be nn.Module instances"
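
Both container types then flatten to the same ordered layer list. A simplified standalone stand-in for the helper above (not the module's actual code path, which also handles the heuristic fallback):

import torch.nn as nn

def _extend_layers(layers, modules):
    # Simplified stand-in mirroring the diff: ModuleDict values and
    # ModuleList entries both flatten into one ordered layer list.
    for m in modules:
        if isinstance(m, nn.ModuleList):
            layers.extend(m)
        elif isinstance(m, nn.ModuleDict):
            layers.extend(m.values())
        else:
            layers.append(m)

layers = []
_extend_layers(layers, [nn.ModuleList([nn.Linear(4, 4)]), nn.ModuleDict({"0": nn.Linear(4, 4)})])
assert len(layers) == 2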

nemo_automodel/components/models/deepseek_v4/layers.py

Lines changed: 4 additions & 3 deletions
@@ -46,8 +46,9 @@
 See ``_hc_split_sinkhorn`` for the pure-torch port of the reference mixer
 (ported from miles PR 1045's ``kernel/sinkhorn.py``).
 
-Sliding-window / compress-ratio attention is NOT yet implemented.
-All layers use full causal attention regardless of compress_ratios.
+Compress-ratio attention (Compressor + Indexer) is wired into
+DeepseekV4Attention.forward for layers with compress_ratio > 0.
+All layers share the same sliding-window causal mask on the local KV path.
 """
 
 from __future__ import annotations
@@ -473,7 +474,7 @@ def eager_attention_with_sink(
     sinks = module.sinks.reshape(1, -1, 1, 1).expand(query.shape[0], -1, query.shape[-2], -1)
     combined = torch.cat([attn_weights, sinks.to(attn_weights.dtype)], dim=-1)
     combined = combined - combined.max(dim=-1, keepdim=True).values
-    probs = F.softmax(combined, dim=-1, dtype=combined.dtype)[..., :-1]
+    probs = F.softmax(combined, dim=-1, dtype=torch.float32)[..., :-1]
     probs = F.dropout(probs, p=dropout, training=module.training).to(value_states.dtype)
     return torch.matmul(probs, value_states).transpose(1, 2).contiguous(), probs
 
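
The one-line change computes the sink-augmented softmax in float32 before the sink column is dropped, avoiding half-precision rounding in the normalizer. A self-contained sketch of the same pattern with illustrative shapes:

import torch
import torch.nn.functional as F

attn_weights = torch.randn(1, 2, 3, 3, dtype=torch.bfloat16)  # [batch, heads, q_len, k_len], illustrative
sinks = torch.zeros(1, 2, 3, 1, dtype=torch.bfloat16)         # one sink logit per row

combined = torch.cat([attn_weights, sinks], dim=-1)
combined = combined - combined.max(dim=-1, keepdim=True).values
# Normalize in float32 so the sink column absorbs probability mass without
# bf16 rounding error, then drop it and cast back for the value matmul.
probs = F.softmax(combined, dim=-1, dtype=torch.float32)[..., :-1]
probs = probs.to(attn_weights.dtype)
assert probs.sum(dim=-1).max() <= 1.0 + 1e-3  # rows sum to less than 1 once the sink is removed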
