NVIDIA-NeMo
diff --git a/‎.github/workflows/cicd-approve-test-queue.yml‎
Lines changed: 145 additions & 16 deletions b/‎.github/workflows/cicd-approve-test-queue.yml‎
Lines changed: 145 additions & 16 deletions
diff --git a/‎examples/llm_finetune/deepseek_v4/deepseek_v4_flash_hellaswag_lora.yaml‎
Lines changed: 127 additions & 0 deletions b/‎examples/llm_finetune/deepseek_v4/deepseek_v4_flash_hellaswag_lora.yaml‎
Lines changed: 127 additions & 0 deletions
@@ -16,13 +16,18 @@ name: Approve Test Queue
 
 on:
   schedule:
-    - cron: '*/5 * * * *'  # Runs every 5 minutes
-  workflow_dispatch:  # Allows manual triggering
+    - cron: "*/5 * * * *" # Runs every 5 minutes
+  workflow_dispatch: # Allows manual triggering
 
 jobs:
   approve-queue:
     runs-on: ubuntu-latest
     environment: main
+    if: github.repository == 'NVIDIA-NeMo/Automodel'
+    strategy:
+      matrix:
+        branch: [main, others, workflow_dispatch]
+        contributor_type: [internal, external]
     steps:
       - name: Checkout repository
         uses: actions/checkout@v6
@@ -37,22 +42,53 @@ jobs:
           python -m pip install --upgrade pip
           pip install requests
 
+      - name: Download SSO users list
+        run: |
+          # Best-effort download. With an empty map no login has any org role, so
+          # every contributor is classified as external; surface that fallback as a
+          # warning annotation instead of hiding it.
+          if ! gh release download v0.1.0 \
+            --repo NVIDIA-GitHub-Management/github-audits \
+            --pattern users_sso.json \
+            --output users_sso.json; then
+            echo "::warning::Failed to download users_sso.json; treating all contributors as external"
+            echo '{}' > users_sso.json
+          fi
+        env:
+          GH_TOKEN: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
+
       - name: Approve waiting deployments
         env:
           GITHUB_TOKEN: ${{ secrets.PAT }}
           MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }}
+          MAX_CONCURRENCY_EXTERNAL: ${{ vars.MAX_CONCURRENCY_EXTERNAL || 3 }}
+          # Prefer a dedicated repository variable for the workflow_dispatch queue;
+          # fall back to MAX_CONCURRENCY so the limit is unchanged when it is unset.
+          MAX_CONCURRENCY_WORKFLOW_DISPATCH: ${{ vars.MAX_CONCURRENCY_WORKFLOW_DISPATCH || vars.MAX_CONCURRENCY || 1 }}
+          CONTRIBUTOR_TYPE: ${{ matrix.contributor_type }}
+          MATRIX_BRANCH: ${{ matrix.branch }}
+          SSO_USERS_FILE: users_sso.json
+          PYTHONUNBUFFERED: 1
+        shell: python
         run: |
-          python - <<EOF
           import os
+          import json
           import requests
+          import re
           import time
 
-
           # GitHub API configuration
           GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
           REPO = os.environ["GITHUB_REPOSITORY"]
-          MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"])
-          API_BASE = f"https://api.github.com/repos/{REPO}"
+          CONTRIBUTOR_TYPE = os.environ["CONTRIBUTOR_TYPE"]
+          MATRIX_BRANCH = os.environ["MATRIX_BRANCH"]
+
+          # Each queue cell has its own concurrency limit: the workflow_dispatch
+          # queue, external PR authors, and internal PR authors (the default).
+          if MATRIX_BRANCH == "workflow_dispatch":
+              concurrency_var = "MAX_CONCURRENCY_WORKFLOW_DISPATCH"
+          elif CONTRIBUTOR_TYPE == "external":
+              concurrency_var = "MAX_CONCURRENCY_EXTERNAL"
+          else:
+              concurrency_var = "MAX_CONCURRENCY"
+          MAX_CONCURRENCY = int(os.environ[concurrency_var])
+
+          # The job is guarded by `github.repository == 'NVIDIA-NeMo/Automodel'`, so
+          # deriving the base URL from REPO yields the same repository for every cell.
+          API_BASE = f"https://api.github.com/repos/{REPO}"
+          WORKFLOW_NAME = "CICD NeMo"
+
+          # Load SSO users for internal/external classification
+          with open(os.environ["SSO_USERS_FILE"]) as f:
+              sso_users = json.load(f)
 
           # Headers for GitHub API
           headers = {
@@ -94,7 +130,91 @@ jobs:
               print(f"Max retries ({max_retries}) exceeded for {endpoint}")
               return None
 
+          def is_internal_contributor(pr_info):
+              """Return True if the PR author holds an NVIDIA or NVIDIA-NeMo member role.
+
+              Roles are read from the ``org_roles`` list stored under the author's login
+              in ``sso_users``; an author missing from that mapping is not internal.
+              """
+              # "user" can be present but null in the payload, which a `.get` default
+              # does not cover; `or {}` is the same guard is_internal_actor uses.
+              login = (pr_info.get("user") or {}).get("login", "")
+              org_roles = sso_users.get(login, {}).get("org_roles", [])
+              return any(role in ("NVIDIA:Member", "NVIDIA-NeMo:Member") for role in org_roles)
+
+          def get_pr_base_branch(workflow_run):
+              """
+              Look up the PR behind a workflow run and return its base branch.
+
+              The PR number is parsed from a head branch named 'pull-request/<number>'
+              (for example 'pull-request/1913') and the PR is fetched with make_request.
+
+              Returns:
+                  (base_branch, pr_info) for a PR run, or (None, None) when the head
+                  branch does not match the pattern or the PR could not be fetched.
+              """
+              # head_branch can be present but null, which a `.get` default does not
+              # cover; normalise to "" so re.match never receives None.
+              head_branch = workflow_run.get("head_branch") or ""
+              match = re.match(r"pull-request/(\d+)", head_branch)
+              if not match:
+                  return None, None  # Not a PR branch pattern
+
+              pr_number = int(match.group(1))
+
+              # Fetch PR info from GitHub API
+              pr_info = make_request(f"pulls/{pr_number}")
+              if not pr_info:
+                  print(f"Failed to fetch PR #{pr_number}")
+                  return None, None
+
+              base_branch = pr_info.get("base", {}).get("ref")
+              return base_branch, pr_info
+
+          def is_internal_actor(workflow_run):
+              """Tell whether the run was triggered by an NVIDIA or NVIDIA-NeMo member.
+
+              The login comes from ``triggering_actor``, falling back to ``actor``; its
+              ``org_roles`` entry in ``sso_users`` is checked for a member role.
+              """
+              actor = workflow_run.get("triggering_actor") or workflow_run.get("actor") or {}
+              login = actor.get("login", "")
+              member_roles = ("NVIDIA:Member", "NVIDIA-NeMo:Member")
+              for role in sso_users.get(login, {}).get("org_roles", []):
+                  if role in member_roles:
+                      return True
+              return False
+
+          def is_pr_run(workflow_run):
+              """Return True if the run's head_branch starts with 'pull-request/<number>'.
+
+              A missing or null head_branch is treated as "not a PR run".
+              """
+              # `or ""` also covers a head_branch that is present but null.
+              head_branch = workflow_run.get("head_branch") or ""
+              return bool(re.match(r"pull-request/\d+", head_branch))
+
+          def is_workflow_dispatch_run(workflow_run):
+              """Return True if the run's head_branch starts with 'mcore-testing-'.
+
+              That prefix is how this script recognises manually triggered runs; a
+              missing or null head_branch is treated as "not a workflow_dispatch run".
+              """
+              # `or ""` also covers a head_branch that is present but null.
+              head_branch = workflow_run.get("head_branch") or ""
+              return head_branch.startswith("mcore-testing-")
+
+          def matches_queue(workflow_run, target_branch, contributor_type):
+              """
+              Return True if the workflow run belongs to this queue cell:
+              matching target branch AND matching contributor type (internal/external).
+
+              workflow_dispatch runs (head_branch: mcore-testing-*) are routed to the 'workflow_dispatch' queue only.
+              PR runs (head_branch: pull-request/<n>) are routed to 'main' or 'others' queues only:
+              'main' takes PRs whose base branch is main, 'others' takes every other base branch.
+              """
+              wants_internal = contributor_type == "internal"
+
+              if target_branch == "workflow_dispatch":
+                  if not is_workflow_dispatch_run(workflow_run):
+                      return False
+                  internal = is_internal_actor(workflow_run)
+                  contributor_match = wants_internal == internal
+                  if contributor_match:
+                      actor = (workflow_run.get("triggering_actor") or workflow_run.get("actor") or {}).get("login", "unknown")
+                      print(f"workflow_dispatch run by {actor}, contributor_type={contributor_type} (internal={internal})")
+                  return contributor_match
+
+              # PR queue: skip non-PR runs
+              if not is_pr_run(workflow_run):
+                  return False
+
+              base_branch, pr_info = get_pr_base_branch(workflow_run)
+              if base_branch is None:
+                  return False
+
+              # The job matrix has no 'dev' cell, so carving 'dev' out of 'others' would
+              # leave PRs that target dev unmatched by every cell and never approved.
+              if target_branch == "others":
+                  branch_match = base_branch != "main"
+              else:
+                  branch_match = base_branch == target_branch
+              if not branch_match:
+                  return False
+
+              internal = is_internal_contributor(pr_info)
+              contributor_match = wants_internal == internal
+              if contributor_match:
+                  # pr_info was fetched for this run's PR, so its number is the one in head_branch.
+                  print(f"PR #{pr_info.get('number')} targets {target_branch}, contributor_type={contributor_type} (internal={internal})")
+              return contributor_match
+
           # Get current running and queued workflows
+          print(f"\n=== Queue cell: branch=${{ matrix.branch }}, contributor_type={CONTRIBUTOR_TYPE} ===")
           print("Fetching workflow runs...")
           queued_resp = make_request("actions/runs?status=queued")
           if queued_resp is None:
@@ -107,13 +227,25 @@ jobs:
               exit(1)
           in_progress_workflow_runs = in_progress_resp.get("workflow_runs", [])
 
+          def log_and_filter(runs, label):
+              """Log each WORKFLOW_NAME run in `runs` and return those in this queue cell.
+
+              Args:
+                  runs: workflow run dicts as returned by the "actions/runs" endpoint.
+                  label: short tag for the log lines (e.g. "queued", "in_progress").
+
+              Returns:
+                  The runs named WORKFLOW_NAME for which matches_queue is True.
+              """
+              cicd_runs = [r for r in runs if r["name"] == WORKFLOW_NAME]
+              print(f"{label}: {len(runs)} total, {len(cicd_runs)} {WORKFLOW_NAME}")
+              matched_runs = []
+              for r in cicd_runs:
+                  actor = (r.get("triggering_actor") or r.get("actor") or {}).get("login", "unknown")
+                  # Evaluate once per run: matches_queue fetches the PR and logs on a
+                  # match, so a second call would repeat both the request and the output.
+                  matched = matches_queue(r, MATRIX_BRANCH, CONTRIBUTOR_TYPE)
+                  print(f"  run={r['id']} head_branch={r.get('head_branch')} event={r.get('event')} actor={actor} -> matched={matched}")
+                  if matched:
+                      matched_runs.append(r)
+              return matched_runs
+
+          queued_workflow_runs = log_and_filter(queued_workflow_runs, "queued")
+          in_progress_workflow_runs = log_and_filter(in_progress_workflow_runs, "in_progress")
+
           # Count running and queued workflows
-          queued_workflows = sum(1 for run in queued_workflow_runs if run["name"] == "CICD NeMo")
-          in_progress_workflows = sum(1 for run in in_progress_workflow_runs if run["name"] == "CICD NeMo")
+          queued_workflows = len(queued_workflow_runs)
+          in_progress_workflows = len(in_progress_workflow_runs)
 
           total_workflows = queued_workflows + in_progress_workflows
-          print(f"Current queued workflows: {queued_workflows}")
-          print(f"Current running workflows: {in_progress_workflows}")
+          print(f"Current queued workflows (PRs targeting ${{ matrix.branch }}, {CONTRIBUTOR_TYPE}): {queued_workflows}")
+          print(f"Current running workflows (PRs targeting ${{ matrix.branch }}, {CONTRIBUTOR_TYPE}): {in_progress_workflows}")
           print(f"Total workflows: {total_workflows}")
           print(f"Max concurrency: {MAX_CONCURRENCY}")
 
@@ -122,20 +254,19 @@ jobs:
               exit(0)
 
           # Get waiting CI workflows for test environment
-          print("Fetching deployments...")
+          print("Fetching waiting deployments...")
           waiting_resp = make_request("actions/runs?status=waiting")
           if waiting_resp is None:
               print("Failed to fetch waiting workflow runs after retries, exiting")
               exit(1)
-          pending_workflows = waiting_resp.get("workflow_runs", [])
-          pending_workflows = [run for run in pending_workflows if run["name"] == "CICD NeMo"]
+          pending_workflows = log_and_filter(waiting_resp.get("workflow_runs", []), "waiting")
 
           # Sort deployments by creation date (oldest first)
           print("Sorting workflows...")
           pending_workflows = sorted(pending_workflows, key=lambda x: x["created_at"])
 
           # Process each deployment
-          print("Processing ...")
+          print(f"Processing {len(pending_workflows)} pending workflows...")
           for workflow in pending_workflows:
               if total_workflows >= MAX_CONCURRENCY:
                   print("Maximum concurrency reached, stopping approvals")
@@ -166,8 +297,6 @@ jobs:
               else:
                   print(f"Failed to approve deployment {deployment['id']}")
                   exit(1)
-
-          EOF
   notify:
     if: failure()
     runs-on: ubuntu-latest
 
@@ -0,0 +1,127 @@
+# Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# LoRA fine-tuning recipe for deepseek-ai/DeepSeek-V4-Flash on HellaSwag.
+# LoRA target: 4 nodes x 8 H100-80GB = 32 GPUs, PP=1 EP=32 DP=32.
+# Estimate: routed-expert bf16 weights are ~550 GB; EP=32 gives 8 routed
+# experts/rank and avoids full base optimizer state.
+
+recipe: TrainFinetuneRecipeForNextTokenPrediction
+
+seed: 1234
+
+step_scheduler:
+  global_batch_size: 128
+  local_batch_size: 1
+  # NOTE(review): both intervals below (500) exceed max_steps (100), so neither
+  # is reached by step count alone in this run — confirm that is intended.
+  ckpt_every_steps: 500
+  val_every_steps: 500
+  num_epochs: 1
+  max_steps: 100
+
+distributed:
+  strategy: fsdp2
+  tp_size: 1
+  cp_size: 1
+  pp_size: 1
+  ep_size: 32
+
+  sequence_parallel: false
+  activation_checkpointing: false
+
+  moe:
+    reshard_after_forward: false
+    wrap_outer_model: false
+
+dist_env:
+  backend: nccl
+  timeout_minutes: 30
+
+model:
+  _target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config
+  config:
+    _target_: nemo_automodel.components.models.deepseek_v4.config.DeepseekV4Config.from_pretrained
+    pretrained_model_name_or_path: deepseek-ai/DeepSeek-V4-Flash
+    name_or_path: deepseek-ai/DeepSeek-V4-Flash
+    num_nextn_predict_layers: 0
+  trust_remote_code: false
+  load_base_model: true
+  backend:
+    _target_: nemo_automodel.components.models.common.BackendConfig
+    attn: sdpa
+    linear: torch
+    rms_norm: torch_fp32
+    rope_fusion: false
+    dispatcher: deepep
+    experts: torch_mm
+    enable_hf_state_dict_adapter: true
+    enable_fsdp_optimizations: true
+
+peft:
+  _target_: nemo_automodel.components._peft.lora.PeftConfig
+  # Patterns stay quoted: an unquoted leading `*` would be parsed as a YAML alias.
+  target_modules:
+    - "*wq_a"
+    - "*wq_b"
+    - "*wkv"
+    - "*wo_b"
+  dim: 8
+  alpha: 32
+  use_triton: true
+
+checkpoint:
+  enabled: false
+  dequantize_base_checkpoint: true
+
+loss_fn:
+  _target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
+
+dataset:
+  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
+  path_or_dataset: rowan/hellaswag
+  split: train
+  tokenizer:
+    _target_: transformers.AutoTokenizer.from_pretrained
+    pretrained_model_name_or_path: deepseek-ai/DeepSeek-V4-Flash
+
+packed_sequence:
+  packed_sequence_size: 0
+
+dataloader:
+  _target_: torchdata.stateful_dataloader.StatefulDataLoader
+  collate_fn:
+    _target_: nemo_automodel.components.datasets.utils.default_collater
+    pad_seq_len_divisible: 64
+  shuffle: true
+
+validation_dataset:
+  _target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
+  path_or_dataset: rowan/hellaswag
+  split: validation
+  tokenizer:
+    _target_: transformers.AutoTokenizer.from_pretrained
+    pretrained_model_name_or_path: deepseek-ai/DeepSeek-V4-Flash
+
+validation_dataloader:
+  _target_: torchdata.stateful_dataloader.StatefulDataLoader
+  collate_fn:
+    _target_: nemo_automodel.components.datasets.utils.default_collater
+    pad_seq_len_divisible: 64
+  shuffle: false
+  drop_last: true
+
+optimizer:
+  _target_: torch.optim.AdamW
+  betas: [0.9, 0.95]
+  # Written with a decimal point so YAML 1.1 loaders resolve these as floats;
+  # a bare `1e-8` has no `.` and is read as a string by such loaders.
+  eps: 1.0e-8
+  lr: 1.0e-5
+  weight_decay: 0.1