Skip to content

Commit 587287b

Browse files
authored
Merge branch 'main' into feat/remove-moe-mesh-user-api
2 parents 8f06fbd + 67880f7 commit 587287b

6 files changed

Lines changed: 668 additions & 19 deletions

File tree

.github/workflows/cicd-approve-test-queue.yml

Lines changed: 145 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,18 @@ name: Approve Test Queue
1616

1717
on:
1818
schedule:
19-
- cron: '*/5 * * * *' # Runs every 5 minutes
20-
workflow_dispatch: # Allows manual triggering
19+
- cron: "*/5 * * * *" # Runs every 5 minutes
20+
workflow_dispatch: # Allows manual triggering
2121

2222
jobs:
2323
approve-queue:
2424
runs-on: ubuntu-latest
2525
environment: main
26+
if: github.repository == 'NVIDIA-NeMo/Automodel'
27+
strategy:
28+
matrix:
29+
branch: [main, others, workflow_dispatch]
30+
contributor_type: [internal, external]
2631
steps:
2732
- name: Checkout repository
2833
uses: actions/checkout@v6
@@ -37,22 +42,53 @@ jobs:
3742
python -m pip install --upgrade pip
3843
pip install requests
3944
45+
- name: Download SSO users list
46+
run: |
47+
gh release download v0.1.0 \
48+
--repo NVIDIA-GitHub-Management/github-audits \
49+
--pattern users_sso.json \
50+
--output users_sso.json || echo '{}' > users_sso.json
51+
env:
52+
GH_TOKEN: ${{ secrets.NVIDIA_MANAGEMENT_ORG_PAT }}
53+
4054
- name: Approve waiting deployments
4155
env:
4256
GITHUB_TOKEN: ${{ secrets.PAT }}
4357
MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }}
58+
MAX_CONCURRENCY_EXTERNAL: ${{ vars.MAX_CONCURRENCY_EXTERNAL || 3 }}
59+
MAX_CONCURRENCY_WORKFLOW_DISPATCH: ${{ vars.MAX_CONCURRENCY || 1 }}
60+
CONTRIBUTOR_TYPE: ${{ matrix.contributor_type }}
61+
MATRIX_BRANCH: ${{ matrix.branch }}
62+
SSO_USERS_FILE: users_sso.json
63+
PYTHONUNBUFFERED: 1
64+
shell: python
4465
run: |
45-
python - <<EOF
4666
import os
67+
import json
4768
import requests
69+
import re
4870
import time
4971
50-
5172
# GitHub API configuration
5273
GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
5374
REPO = os.environ["GITHUB_REPOSITORY"]
54-
MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"])
55-
API_BASE = f"https://api.github.com/repos/{REPO}"
75+
CONTRIBUTOR_TYPE = os.environ["CONTRIBUTOR_TYPE"]
76+
MATRIX_BRANCH = os.environ["MATRIX_BRANCH"]
77+
if MATRIX_BRANCH == "workflow_dispatch":
78+
MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY_WORKFLOW_DISPATCH"])
79+
API_BASE = f"https://api.github.com/repos/{REPO}"
80+
WORKFLOW_NAME = "CICD NeMo"
81+
else:
82+
if CONTRIBUTOR_TYPE == "external":
83+
MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY_EXTERNAL"])
84+
else:
85+
MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"])
86+
API_BASE = "https://api.github.com/repos/NVIDIA-NeMo/Automodel"
87+
WORKFLOW_NAME = "CICD NeMo"
88+
89+
# Load SSO users for internal/external classification
90+
with open(os.environ["SSO_USERS_FILE"]) as f:
91+
sso_users = json.load(f)
5692
5793
# Headers for GitHub API
5894
headers = {
@@ -94,7 +130,91 @@ jobs:
94130
print(f"Max retries ({max_retries}) exceeded for {endpoint}")
95131
return None
96132
133+
def is_internal_contributor(pr_info):
134+
"""Return True if the PR author is a member of NVIDIA or NVIDIA-NeMo org (is_org_member)."""
135+
login = pr_info.get("user", {}).get("login", "")
136+
org_roles = sso_users.get(login, {}).get("org_roles", [])
137+
return any(role in ("NVIDIA:Member", "NVIDIA-NeMo:Member") for role in org_roles)
138+
139+
def get_pr_base_branch(workflow_run):
140+
"""
141+
Return the base branch of the PR associated with a workflow run, or None.
142+
Extracts PR number from head branch like 'pull-request/1913' and fetches PR info.
143+
Returns (base_branch, pr_info) tuple, or (None, None) if not a PR run.
144+
"""
145+
print(workflow_run.get("head_branch", ""))
146+
head_branch = workflow_run.get("head_branch", "")
147+
match = re.match(r"pull-request/(\d+)", head_branch)
148+
if not match:
149+
return None, None # Not a PR branch pattern
150+
151+
pr_number = int(match.group(1))
152+
153+
# Fetch PR info from GitHub API
154+
pr_info = make_request(f"pulls/{pr_number}")
155+
if not pr_info:
156+
print(f"Failed to fetch PR #{pr_number}")
157+
return None, None
158+
159+
base_branch = pr_info.get("base", {}).get("ref")
160+
return base_branch, pr_info
161+
162+
def is_internal_actor(workflow_run):
163+
"""Return True if the actor who triggered the workflow run is an NVIDIA/NVIDIA-NeMo member."""
164+
login = (workflow_run.get("triggering_actor") or workflow_run.get("actor") or {}).get("login", "")
165+
org_roles = sso_users.get(login, {}).get("org_roles", [])
166+
return any(role in ("NVIDIA:Member", "NVIDIA-NeMo:Member") for role in org_roles)
167+
168+
def is_pr_run(workflow_run):
169+
"""Return True if this run was triggered by a PR (head_branch matches pull-request/<number>)."""
170+
return bool(re.match(r"pull-request/\d+", workflow_run.get("head_branch", "")))
171+
172+
def is_workflow_dispatch_run(workflow_run):
173+
"""Return True if this run was manually triggered (head_branch starts with mcore-testing-)."""
174+
return workflow_run.get("head_branch", "").startswith("mcore-testing-")
175+
176+
def matches_queue(workflow_run, target_branch, contributor_type):
177+
"""
178+
Return True if the workflow run belongs to this queue cell:
179+
matching target branch AND matching contributor type (internal/external).
180+
181+
workflow_dispatch runs (head_branch: mcore-testing-*) are routed to the 'workflow_dispatch' queue only.
182+
PR runs (head_branch: pull-request/<n>) are routed to 'main' or 'others' queues only.
183+
"""
184+
if target_branch == "workflow_dispatch":
185+
if not is_workflow_dispatch_run(workflow_run):
186+
return False
187+
internal = is_internal_actor(workflow_run)
188+
contributor_match = (contributor_type == "internal") == internal
189+
if contributor_match:
190+
actor = (workflow_run.get("triggering_actor") or workflow_run.get("actor") or {}).get("login", "unknown")
191+
print(f"workflow_dispatch run by {actor}, contributor_type={contributor_type} (internal={internal})")
192+
return contributor_match
193+
194+
# PR queue: skip non-PR runs
195+
if not is_pr_run(workflow_run):
196+
return False
197+
198+
base_branch, pr_info = get_pr_base_branch(workflow_run)
199+
if base_branch is None:
200+
return False
201+
202+
branch_match = (
203+
(base_branch == target_branch) or
204+
(base_branch != "main" and base_branch != "dev" and target_branch == "others")
205+
)
206+
if not branch_match:
207+
return False
208+
209+
pr_number = re.match(r"pull-request/(\d+)", workflow_run.get("head_branch", "")).group(1)
210+
internal = is_internal_contributor(pr_info)
211+
contributor_match = (contributor_type == "internal") == internal
212+
if branch_match and contributor_match:
213+
print(f"PR #{pr_number} targets {target_branch}, contributor_type={contributor_type} (internal={internal})")
214+
return branch_match and contributor_match
215+
97216
# Get current running and queued workflows
217+
print(f"\n=== Queue cell: branch=${{ matrix.branch }}, contributor_type={CONTRIBUTOR_TYPE} ===")
98218
print("Fetching workflow runs...")
99219
queued_resp = make_request("actions/runs?status=queued")
100220
if queued_resp is None:
@@ -107,13 +227,25 @@ jobs:
107227
exit(1)
108228
in_progress_workflow_runs = in_progress_resp.get("workflow_runs", [])
109229
230+
def log_and_filter(runs, label):
231+
cicd_runs = [r for r in runs if r["name"] == WORKFLOW_NAME]
232+
print(f"{label}: {len(runs)} total, {len(cicd_runs)} {WORKFLOW_NAME}")
233+
for r in cicd_runs:
234+
actor = (r.get("triggering_actor") or r.get("actor") or {}).get("login", "unknown")
235+
matched = matches_queue(r, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)
236+
print(f" run={r['id']} head_branch={r.get('head_branch')} event={r.get('event')} actor={actor} -> matched={matched}")
237+
return [r for r in cicd_runs if matches_queue(r, "${{ matrix.branch }}", CONTRIBUTOR_TYPE)]
238+
239+
queued_workflow_runs = log_and_filter(queued_workflow_runs, "queued")
240+
in_progress_workflow_runs = log_and_filter(in_progress_workflow_runs, "in_progress")
241+
110242
# Count running and queued workflows
111-
queued_workflows = sum(1 for run in queued_workflow_runs if run["name"] == "CICD NeMo")
112-
in_progress_workflows = sum(1 for run in in_progress_workflow_runs if run["name"] == "CICD NeMo")
243+
queued_workflows = len(queued_workflow_runs)
244+
in_progress_workflows = len(in_progress_workflow_runs)
113245
114246
total_workflows = queued_workflows + in_progress_workflows
115-
print(f"Current queued workflows: {queued_workflows}")
116-
print(f"Current running workflows: {in_progress_workflows}")
247+
print(f"Current queued workflows (PRs targeting ${{ matrix.branch }}, {CONTRIBUTOR_TYPE}): {queued_workflows}")
248+
print(f"Current running workflows (PRs targeting ${{ matrix.branch }}, {CONTRIBUTOR_TYPE}): {in_progress_workflows}")
117249
print(f"Total workflows: {total_workflows}")
118250
print(f"Max concurrency: {MAX_CONCURRENCY}")
119251
@@ -122,20 +254,19 @@ jobs:
122254
exit(0)
123255
124256
# Get waiting CI workflows for test environment
125-
print("Fetching deployments...")
257+
print("Fetching waiting deployments...")
126258
waiting_resp = make_request("actions/runs?status=waiting")
127259
if waiting_resp is None:
128260
print("Failed to fetch waiting workflow runs after retries, exiting")
129261
exit(1)
130-
pending_workflows = waiting_resp.get("workflow_runs", [])
131-
pending_workflows = [run for run in pending_workflows if run["name"] == "CICD NeMo"]
262+
pending_workflows = log_and_filter(waiting_resp.get("workflow_runs", []), "waiting")
132263
133264
# Sort deployments by creation date (oldest first)
134265
print("Sorting workflows...")
135266
pending_workflows = sorted(pending_workflows, key=lambda x: x["created_at"])
136267
137268
# Process each deployment
138-
print("Processing ...")
269+
print(f"Processing {len(pending_workflows)} pending workflows...")
139270
for workflow in pending_workflows:
140271
if total_workflows >= MAX_CONCURRENCY:
141272
print("Maximum concurrency reached, stopping approvals")
@@ -166,8 +297,6 @@ jobs:
166297
else:
167298
print(f"Failed to approve deployment {deployment['id']}")
168299
exit(1)
169-
170-
EOF
171300
notify:
172301
if: failure()
173302
runs-on: ubuntu-latest
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# LoRA fine-tuning recipe for deepseek-ai/DeepSeek-V4-Flash on HellaSwag.
16+
# LoRA target: 4 nodes x 8 H100-80GB = 32 GPUs, PP=1 EP=32 DP=32.
17+
# Estimate: routed-expert bf16 weights are ~550 GB; EP=32 gives 8 routed
18+
# experts/rank and avoids full base optimizer state.
19+
20+
recipe: TrainFinetuneRecipeForNextTokenPrediction
21+
22+
seed: 1234
23+
24+
step_scheduler:
25+
global_batch_size: 128
26+
local_batch_size: 1
27+
ckpt_every_steps: 500
28+
val_every_steps: 500
29+
num_epochs: 1
30+
max_steps: 100
31+
32+
distributed:
33+
strategy: fsdp2
34+
tp_size: 1
35+
cp_size: 1
36+
pp_size: 1
37+
ep_size: 32
38+
39+
sequence_parallel: false
40+
activation_checkpointing: false
41+
42+
moe:
43+
reshard_after_forward: false
44+
wrap_outer_model: false
45+
46+
dist_env:
47+
backend: nccl
48+
timeout_minutes: 30
49+
50+
model:
51+
_target_: nemo_automodel.NeMoAutoModelForCausalLM.from_config
52+
config:
53+
_target_: nemo_automodel.components.models.deepseek_v4.config.DeepseekV4Config.from_pretrained
54+
pretrained_model_name_or_path: deepseek-ai/DeepSeek-V4-Flash
55+
name_or_path: deepseek-ai/DeepSeek-V4-Flash
56+
num_nextn_predict_layers: 0
57+
trust_remote_code: false
58+
load_base_model: true
59+
backend:
60+
_target_: nemo_automodel.components.models.common.BackendConfig
61+
attn: sdpa
62+
linear: torch
63+
rms_norm: torch_fp32
64+
rope_fusion: false
65+
dispatcher: deepep
66+
experts: torch_mm
67+
enable_hf_state_dict_adapter: true
68+
enable_fsdp_optimizations: true
69+
70+
peft:
71+
_target_: nemo_automodel.components._peft.lora.PeftConfig
72+
target_modules:
73+
- "*wq_a"
74+
- "*wq_b"
75+
- "*wkv"
76+
- "*wo_b"
77+
dim: 8
78+
alpha: 32
79+
use_triton: True
80+
81+
checkpoint:
82+
enabled: false
83+
dequantize_base_checkpoint: true
84+
85+
loss_fn:
86+
_target_: nemo_automodel.components.loss.masked_ce.MaskedCrossEntropy
87+
88+
dataset:
89+
_target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
90+
path_or_dataset: rowan/hellaswag
91+
split: train
92+
tokenizer:
93+
_target_: transformers.AutoTokenizer.from_pretrained
94+
pretrained_model_name_or_path: deepseek-ai/DeepSeek-V4-Flash
95+
96+
packed_sequence:
97+
packed_sequence_size: 0
98+
99+
dataloader:
100+
_target_: torchdata.stateful_dataloader.StatefulDataLoader
101+
collate_fn:
102+
_target_: nemo_automodel.components.datasets.utils.default_collater
103+
pad_seq_len_divisible: 64
104+
shuffle: true
105+
106+
validation_dataset:
107+
_target_: nemo_automodel.components.datasets.llm.hellaswag.HellaSwag
108+
path_or_dataset: rowan/hellaswag
109+
split: validation
110+
tokenizer:
111+
_target_: transformers.AutoTokenizer.from_pretrained
112+
pretrained_model_name_or_path: deepseek-ai/DeepSeek-V4-Flash
113+
114+
validation_dataloader:
115+
_target_: torchdata.stateful_dataloader.StatefulDataLoader
116+
collate_fn:
117+
_target_: nemo_automodel.components.datasets.utils.default_collater
118+
pad_seq_len_divisible: 64
119+
shuffle: false
120+
drop_last: true
121+
122+
optimizer:
123+
_target_: torch.optim.AdamW
124+
betas: [0.9, 0.95]
125+
eps: 1e-8
126+
lr: 1e-5
127+
weight_decay: 0.1

0 commit comments

Comments
 (0)