Skip to content

Commit c2410fc

Browse files
Merge branch 'dev' into nrwu/eagercp
2 parents 370aa4a + 221747d commit c2410fc

File tree

1,113 files changed

+42105
-17469
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,113 files changed

+42105
-17469
lines changed

.github/CODEOWNERS

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
megatron/core @NVIDIA/core-nemo @NVIDIA/core-devtech
2+
3+
.gitlab/ @NVIDIA/ci
4+
.github/ @NVIDIA/ci
5+
.gitlab-ci.yml @NVIDIA/ci
6+
docker/ @NVIDIA/ci
7+
tests/unit_tests/run_ci_test.sh @NVIDIA/ci
8+
tests/test_utils/python_scripts/ @NVIDIA/ci
9+
tests/functional_tests/python_test_utils/ @NVIDIA/ci
10+
tests/functional_tests/shell_test_utils/ @NVIDIA/ci
11+
megatron/core/transformer/transformer_block.py @NVIDIA/ci
12+
megatron/core/transformer/transformer_layer.py @NVIDIA/ci
13+
tests/functional_tests/test_cases/ @NVIDIA/ci
14+
tests/functional_tests/recipes/ @NVIDIA/ci
15+
tests/unit_tests/ @NVIDIA/ci

.github/actions/action.yml

Lines changed: 94 additions & 158 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ name: "Test Template"
1515
description: "Template for running NeMo tests in a containerized environment"
1616

1717
inputs:
18+
container-image:
19+
description: "Container image to use for test"
20+
required: true
1821
timeout:
1922
description: "Max runtime of test in minutes"
2023
required: false
@@ -46,210 +49,146 @@ inputs:
4649
runs:
4750
using: "composite"
4851
steps:
49-
- name: Copy data
50-
shell: bash
51-
if: inputs.is_unit_test == 'false'
52-
env:
53-
SOURCE_DIR: /mnt/datadrive/TestData/megatron-lm/artifacts
54-
TARGET_DIR: /home/runner/_work/TestData/megatron-lm/artifacts
55-
MODEL: ${{ inputs.model }}
56-
run: |
57-
mkdir -p $TARGET_DIR/text/data/
58-
59-
if [[ "$MODEL" == "bert" ]]; then
60-
mkdir -p $TARGET_DIR/text/the_pile/bert_shard00/
61-
cp -a $SOURCE_DIR/text/the_pile/bert_shard00/. $TARGET_DIR/text/data/
62-
elif [[ "$MODEL" == "gpt" ]] || [[ "$MODEL" == "moe" ]]; then
63-
cp -a $SOURCE_DIR/text/the_pile/shard00/. $TARGET_DIR/text/data/
64-
fi
65-
66-
- name: Install curl, sudo
67-
shell: bash
68-
run: |
69-
sudo apt-get update
70-
sudo apt-get install -y curl uuid-runtime
71-
7252
- name: Checkout repository
7353
uses: actions/checkout@v4  # same major-version generation as the other first-party actions used in this file
74-
with:
75-
path: ${{ github.workspace }}/Megatron-LM
76-
77-
- name: Cache uv
78-
uses: actions/cache@v4
79-
id: cache
80-
with:
81-
path: cache-mount
82-
key: ${{ runner.os }}-uv-${{ hashFiles('**/uv.lock') }}
83-
restore-keys: |
84-
${{ runner.os }}-uv-
8554

86-
- name: Restore Docker cache mounts
87-
uses: reproducible-containers/buildkit-cache-dance@5b81f4d29dc8397a7d341dba3aeecc7ec54d6361
88-
with:
89-
cache-dir: cache-mount
90-
dockerfile: docker/Dockerfile.ci.dev
91-
skip-extraction: ${{ steps.cache.outputs.cache-hit }}
55+
- name: Change ownership of /home/runner/
56+
shell: bash
57+
run: sudo chown -R $(whoami) /home/runner/
9258

9359
- name: Setup python
9460
uses: actions/setup-python@v5
9561
with:
9662
python-version: 3.12
9763

98-
- name: Download test data
99-
shell: bash
100-
env:
101-
GH_TOKEN: ${{ inputs.PAT }}
102-
TIMEOUT: ${{ inputs.timeout }}
103-
IS_UNIT_TEST: ${{ inputs.is_unit_test == 'true' }}
64+
- name: Install uuidgen
65+
shell: bash -x -e -u -o pipefail {0}
10466
run: |
105-
echo "::group::Download test data"
106-
pip install --no-cache-dir pygithub click
107-
python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets
108-
echo "::endgroup::"
67+
apt-get update
68+
apt-get install -y uuid-runtime
10969
11070
- name: Create run-script (unit test)
111-
shell: bash
71+
shell: bash -x -e -u -o pipefail {0}
11272
if: inputs.is_unit_test == 'true'
11373
run: |
11474
echo "::group::Create run-script"
11575
cmd=$(cat <<'RUN_TEST_EOF'
11676
#!/bin/bash
11777
118-
docker exec -t test_container_${{ github.run_id }} bash -c '
119-
set -e
120-
bash /opt/megatron-lm/tests/unit_tests/run_ci_test.sh \
121-
--tag ${{ inputs.tag }} \
122-
--environment dev \
123-
--bucket '\''${{ inputs.test_case }}'\'' \
124-
--log-dir /opt/megatron-lm/outputs/logs
125-
'
78+
export PYTHONPATH=$(pwd)
79+
export NEMORUN_HOME=$(pwd)
80+
pip install --no-cache-dir uv
81+
uv sync --only-group test
82+
uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
83+
--scope unit-tests \
84+
--model unit-tests \
85+
--test-case "${{ inputs.test_case }}" \
86+
--environment dev \
87+
--platform dgx_h100 \
88+
--tag ${{ inputs.tag }} \
89+
--container-image ${{ inputs.container-image }}
12690
12791
RUN_TEST_EOF
12892
)
12993
echo "$cmd" | tee "job.sh"
13094
echo "::endgroup::"
13195
96+
- name: Get PR info
97+
id: get-pr-info
98+
if: startsWith(github.ref, 'refs/heads/pull-request/')
99+
uses: nv-gha-runners/get-pr-info@main
100+
101+
- name: Install GH CLI
102+
shell: bash -x -e -u -o pipefail {0}
103+
run: |
104+
apt-get update
105+
apt-get install -y gh
106+
107+
- name: Has Run tests label
108+
shell: bash -x -e -u -o pipefail {0}
109+
id: has-run-tests-label
110+
env:
111+
GH_TOKEN: ${{ github.token }}
112+
run: |
113+
PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
114+
HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') || HAS_RUN_TESTS_LABEL="false"  # on lookup failure store "false" in the variable rather than only printing it
115+
echo "main=$HAS_RUN_TESTS_LABEL" | tee -a "$GITHUB_OUTPUT"
116+
132117
- name: Create run-script (e2e test)
133-
shell: bash
118+
shell: bash -x -e -u -o pipefail {0}
134119
if: inputs.is_unit_test == 'false'
135120
env:
136121
MODEL: ${{ inputs.model }}
137122
run: |
138123
echo "::group::Create run-script"
139124
cmd=$(cat <<'RUN_TEST_EOF'
140125
#!/bin/bash
141-
142-
143-
144-
docker exec -t test_container_${{ github.run_id }} bash -c '
145-
146-
set -e
147-
ls -al /workspace/data
148-
149-
if [[ "${{ inputs.model }}" == "bert" ]]; then
150-
TRAINING_SCRIPT_PATH=pretrain_bert.py
151-
elif [[ "${{ inputs.model }}" == "gpt" ]] || [[ "${{ inputs.model }}" == "moe" ]]; then
152-
TRAINING_SCRIPT_PATH=pretrain_gpt.py
153-
fi
154-
155-
ARGUMENTS=(
156-
"DATA_PATH=/workspace/data"
157-
"DATA_CACHE_PATH=/workspace/data/cache"
158-
"OUTPUT_PATH=$(pwd)/outputs/"
159-
"TENSORBOARD_PATH=$(pwd)/tensorboard"
160-
"CHECKPOINT_SAVE_PATH=$(pwd)/checkpoints"
161-
"CHECKPOINT_LOAD_PATH=/workspace/checkpoints/$NAME"
162-
"TRAINING_SCRIPT_PATH=$TRAINING_SCRIPT_PATH"
163-
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/${{inputs.model}}/${{inputs.test_case}}/model_config.yaml"
164-
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/${{inputs.model}}/${{inputs.test_case}}/golden_values_dev_dgx_h100.json"
165-
"N_REPEAT=5"
166-
"ENABLE_LIGHTWEIGHT_MODE=false"
167-
"RECORD_CHECKPOINTS=false"
168-
)
169-
170-
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${ARGUMENTS[@]}
171-
'
126+
set -euxo pipefail
127+
128+
export PYTHONPATH=$(pwd)
129+
export NEMORUN_HOME=$(pwd)
130+
pip install --no-cache-dir uv
131+
uv sync --only-group test
132+
uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
133+
--scope mr \
134+
--model ${{ inputs.model }} \
135+
--test-case ${{ inputs.test_case }} \
136+
--environment dev \
137+
--platform dgx_h100 \
138+
--container-image ${{ inputs.container-image }} \
139+
--data-dir /mnt/datadrive/TestData/megatron-lm/artifacts \
140+
--enable-lightweight-mode
172141
173142
RUN_TEST_EOF
174143
)
175144
echo "$cmd" | tee "job.sh"
176145
echo "::endgroup::"
177146
178-
- name: Build container
179-
shell: bash
180-
env:
181-
GH_TOKEN: ${{ inputs.PAT }}
182-
run: |
183-
echo "::group::Build test container"
184-
docker build -f docker/Dockerfile.ci.dev --build-arg FROM_IMAGE_NAME="nvcr.io/nvidia/pytorch:25.06-py3" --target=main -t megatron-core .
185-
echo "::endgroup::"
186-
187-
- name: Start container
188-
shell: bash
189-
run: |
190-
echo "::group::Start test container"
191-
set -x
192-
193-
cmd=$(cat <<RUN_TEST_EOF
194-
#!/bin/bash
195-
docker container rm -f test_container_${{ github.run_id }} || true
196-
docker run \
197-
--rm \
198-
-d \
199-
--name test_container_${{ github.run_id }} \
200-
--runtime=nvidia --gpus all \
201-
--shm-size=64g \
202-
--ipc=host \
203-
-e NCCL_IB_DISABLE=1 \
204-
-e NCCL_P2P_LEVEL=NVL \
205-
--workdir /opt/megatron-lm/ \
206-
-v /home/runner/_work/TestData/megatron-lm/artifacts/text/data/:/workspace/data \
207-
--volume ${{ github.workspace }}/Megatron-LM:/opt/megatron-lm/ \
208-
$VOLUME_ARGS \
209-
megatron-core \
210-
bash -c "sleep $(( ${{ inputs.timeout }} * 60 + 60 ))"
211-
RUN_TEST_EOF
212-
)
213-
214-
echo "$cmd" | tee "retry_job.sh"
215-
bash retry_job.sh
216-
echo "::endgroup::"
217-
218147
- name: Set timeout
219-
shell: bash
148+
shell: bash -x -e -u -o pipefail {0}
220149
id: timeout_in_seconds
221150
run: |
222151
echo "::group::Set timeout"
223152
echo "main=$(( ${{ inputs.timeout }} * 60 ))" | tee -a "$GITHUB_OUTPUT"
224153
echo "::endgroup::"
225154
155+
- name: Pull container
156+
shell: bash -x -e -u -o pipefail {0}
157+
run: |
158+
echo "::group::Pull container"
159+
docker pull ${{ inputs.container-image }}
160+
echo "::endgroup::"
161+
226162
- name: Run main script
227-
uses: nick-fields/retry@v3
163+
shell: bash -x -e -u -o pipefail {0}
228164
id: run-main-script
229-
with:
230-
timeout_seconds: ${{ steps.timeout_in_seconds.outputs.main }}
231-
max_attempts: 3
232-
shell: bash
233-
retry_on: any
234-
command: /bin/bash job.sh
235-
on_retry_command: /bin/bash retry_job.sh
165+
run: |
166+
echo "::group::Run main script"
167+
EXIT_CODE=0
168+
/bin/bash job.sh || EXIT_CODE=$?  # capture the status so `-e` does not abort before it is published
169+
echo "exit_code=$EXIT_CODE" | tee -a "$GITHUB_OUTPUT"
170+
echo "::endgroup::"  # must come before `exit`; nothing after `exit` runs
171+
exit $EXIT_CODE
236172
237173
- name: Check result
238174
id: check
239-
shell: bash
175+
shell: bash -x -e -u -o pipefail {0}
176+
if: always()
177+
env:
178+
IS_UNIT_TEST: ${{ inputs.is_unit_test == 'true' }}
240179
run: |
241180
echo "::group::Check result"
242181
243-
docker exec test_container_${{ github.run_id }} /opt/venv/bin/coverage xml
244-
docker cp test_container_${{ github.run_id }}:/opt/megatron-lm/outputs/logs ./
245-
docker cp test_container_${{ github.run_id }}:/opt/megatron-lm/.coverage .coverage
246-
docker cp test_container_${{ github.run_id }}:/opt/megatron-lm/coverage.xml coverage.xml
247-
248-
coverage_report=coverage-${{ inputs.is_unit_test == 'true' && 'unit-test' || 'e2e' }}-${{ github.run_id }}-$(uuidgen)
249-
echo "coverage_report=$coverage_report" | tee -a "$GITHUB_OUTPUT"
250182
logs_report=logs-${{ inputs.test_case }}-${{ github.run_id }}-$(uuidgen)
251183
echo "logs_report=$logs_report" | sed 's/\//-/g' | sed 's/\*/-/g' | tee -a "$GITHUB_OUTPUT"
252184
185+
if [[ "$IS_UNIT_TEST" == "true" ]]; then
186+
coverage_report=coverage-${{ inputs.is_unit_test == 'true' && 'unit-test' || 'e2e' }}-${{ github.run_id }}-$(uuidgen)
187+
else
188+
coverage_report=none
189+
fi
190+
echo "coverage_report=$coverage_report" | tee -a "$GITHUB_OUTPUT"
191+
253192
EXIT_CODE=${{ steps.run-main-script.outputs.exit_code }}
254193
IS_SUCCESS=$([[ "$EXIT_CODE" -eq 0 ]] && echo "true" || echo "false")
255194
@@ -261,16 +200,18 @@ runs:
261200
if [[ "$IS_SUCCESS" == "false" ]]; then
262201
echo Test did not finish successfully.
263202
exit 1
264-
else
265-
docker exec -t test_container_${{ github.run_id }} /opt/venv/bin/coverage report -i
203+
fi
204+
205+
if [[ "$coverage_report" != "none" ]]; then
206+
uv run coverage report -i
266207
fi
267208
268209
exit $EXIT_CODE
269210
echo "::endgroup::"
270211
271212
- name: Upload coverage
272213
uses: actions/upload-artifact@v4
273-
if: ${{ steps.check.outputs.coverage_report != 'none' }}
214+
if: ${{ always() && steps.check.outputs.coverage_report != 'none' }}
274215
with:
275216
name: ${{ steps.check.outputs.coverage_report }}
276217
path: |
@@ -280,13 +221,8 @@ runs:
280221

281222
- name: Upload logs
282223
uses: actions/upload-artifact@v4
224+
if: always()
283225
with:
284226
name: ${{ steps.check.outputs.logs_report }}
285-
path: logs
227+
path: ${{ inputs.is_unit_test == 'true' && 'logs' || 'assets_dir' }}
286228
include-hidden-files: true
287-
288-
- name: Container shutdown
289-
if: always()
290-
shell: bash
291-
run: |
292-
docker container rm -f test_container_${{ github.run_id }} || true

0 commit comments

Comments
 (0)