Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
5a981c7
Update base container to be pytorch:25.05-py3
chtruong814 Jun 12, 2025
80f5009
Update TE to 2.4
chtruong814 Jun 12, 2025
d47d040
Remove torch accelerator patch
chtruong814 Jun 12, 2025
3437c93
Update triton patch
chtruong814 Jun 12, 2025
5d0754f
Bump TE and Mcore commits
chtruong814 Jun 12, 2025
2be52f5
Fix triton patch
chtruong814 Jun 13, 2025
a7803e5
Fix triton patch
chtruong814 Jun 15, 2025
4904f2a
No fail fast
chtruong814 Jun 15, 2025
9275b23
Update trt-llm to 0.20.0
chtruong814 Jun 15, 2025
cd5ca33
Merge remote-tracking branch 'origin/main' into chtruong/bump-pytorch…
chtruong814 Jun 17, 2025
b7a2869
Fix test_sched_config_parse_reduce_on_plateau
chtruong814 Jun 17, 2025
30015f1
Add no build isolation to TE
chtruong814 Jun 17, 2025
aeb235c
Update trt-llm dependencies
chtruong814 Jun 17, 2025
354e44b
Update manifest
chtruong814 Jun 17, 2025
bddb0c5
Merge remote-tracking branch 'origin/chtruong/bump-pytorch-25-05' int…
chtruong814 Jun 17, 2025
5a1da6c
Revert "Enable LoRA for TELinear layers (#13929)"
akoumpa Jun 18, 2025
76f1dc1
update mcore with wd_mult key fix
ko3n1g Jun 18, 2025
fa736c5
Revert "Revert "Enable LoRA for TELinear layers (#13929)""
chtruong814 Jun 19, 2025
96c91e7
Fix nemo install
chtruong814 Jun 19, 2025
ac6eb4b
Fix nemo install
chtruong814 Jun 19, 2025
4b1c9ad
Fix export image build
chtruong814 Jun 23, 2025
96744d6
Merge remote-tracking branch 'origin/main' into chtruong/bump-pytorch…
chtruong814 Jun 25, 2025
219de5e
Remove unnecessary sed for torch_tensorrt
chtruong814 Jun 25, 2025
459f989
Update TE and Mcore commits
chtruong814 Jun 25, 2025
f84385c
Add optional tests
chtruong814 Jun 25, 2025
7bbf8f9
Fix install
chtruong814 Jun 25, 2025
8845081
Ensure test script arg types are correct for top_p and top_k
chtruong814 Jun 28, 2025
319aa53
Increase export deploy timeouts
chtruong814 Jun 28, 2025
023ca6c
Merge remote-tracking branch 'origin/main' into chtruong/bump-pytorch…
chtruong814 Jun 28, 2025
c6c3a76
Skip failing test_rnnt_logprobs_random after pytorch bump
chtruong814 Jun 29, 2025
1e2675d
Skip coverage artifact config-3.12.py
chtruong814 Jun 29, 2025
c94b278
Merge remote-tracking branch 'origin/main' into chtruong/bump-pytorch…
chtruong814 Jul 1, 2025
eb2373e
Include more config files to exclude during coverage
chtruong814 Jul 1, 2025
76cf089
Update dependencies
chtruong814 Jul 1, 2025
782d352
Merge remote-tracking branch 'origin/main' into chtruong/bump-pytorch…
chtruong814 Jul 3, 2025
c470906
Ensure top_p is float in nemo_export test script
chtruong814 Jul 3, 2025
34a8a0c
Set Optional_L2_Speech_Batch_Size_OOMptimizer_Canary to truly be opti…
chtruong814 Jul 3, 2025
7275eaf
Fix top_k and top_p types in megatronllm_deployable
chtruong814 Jul 4, 2025
fb29767
Revert "Skip failing test_rnnt_logprobs_random after pytorch bump"
chtruong814 Jul 5, 2025
67da2f5
Fix optional export test
chtruong814 Jul 5, 2025
0d2d02e
Revert unnecessary changes
chtruong814 Jul 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion .coveragerc
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,11 @@ omit =
nemo/collections/audio/parts/utils/maxine.py

nemo/core/*
nemo/collections/common/*
nemo/collections/common/*

/workspace/config-3.12.py
/workspace/config-3.py
/workspace/config.py

[paths]
source =
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/cicd-main-automodel.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ jobs:
script: L2_VLM_HF_Transformer_PEFT_FSDP2
- runner: self-hosted-azure-gpus-1
script: L2_VLM_HF_Transformer_PEFT_4bit
is-optional: true
- runner: self-hosted-azure
script: L2_VLM_HF_Transformer_SFT_FSDP2
- runner: self-hosted-azure
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/cicd-main-export-deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ jobs:
runner: self-hosted-azure-gpus-1
needs: [unit-tests]
runs-on: ${{ matrix.runner }}
name: ${{ matrix.is_optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
name: ${{ matrix.is-optional && 'PLEASEFIXME_' || '' }}${{ matrix.script }}
steps:
- name: Checkout
uses: actions/checkout@v4
Expand All @@ -109,4 +109,4 @@ jobs:
script: ${{ matrix.script }}
tests_to_run: ${{ inputs.test_to_run }}
image: ${{ inputs.image-name }}
is_optional: ${{ matrix.is_optional || false }}
is_optional: ${{ matrix.is-optional || false }}
1 change: 1 addition & 0 deletions .github/workflows/cicd-main-speech.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ jobs:
script: L2_Speech_Batch_Size_OOMptimizer
- runner: self-hosted-azure
script: Optional_L2_Speech_Batch_Size_OOMptimizer_Canary
is-optional: true
- runner: self-hosted-azure
script: L2_Speech_Transcription_Speech_to_Text_Transcribe
- runner: self-hosted-azure
Expand Down
20 changes: 10 additions & 10 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ jobs:
runs-on: ubuntu-latest
environment: test
if: |
needs.pre-flight.outputs.test_to_run != '[]'
needs.pre-flight.outputs.test_to_run != '[]'
&& needs.pre-flight.outputs.components_to_run != '[]'
&& needs.pre-flight.outputs.is_ci_workload == 'false'
steps:
Expand All @@ -147,10 +147,10 @@ jobs:
uses: ./.github/workflows/_build_container.yml
needs: [pre-flight, code-linting, cicd-wait-in-queue]
if: |
needs.pre-flight.outputs.test_to_run != '[]'
needs.pre-flight.outputs.test_to_run != '[]'
&& needs.pre-flight.outputs.components_to_run != '[]'
&& (
success()
success()
|| (
needs.cicd-wait-in-queue.result == 'skipped'
&& needs.pre-flight.outputs.is_ci_workload == 'true'
Expand Down Expand Up @@ -385,8 +385,8 @@ jobs:

- name: Remove label if not cancelled
if: |
steps.result.outputs.code != 'cancelled'
&& github.event.label.name == 'Run CICD'
steps.result.outputs.code != 'cancelled'
&& github.event.label.name == 'Run CICD'
&& github.event.pull_request.head.repo.full_name == github.repository
env:
GH_TOKEN: ${{ github.token }}
Expand All @@ -395,8 +395,8 @@ jobs:

- name: Pipeline successful, add PR comment
if: |
steps.result.outputs.code == 'success'
&& github.event_name == 'pull_request'
steps.result.outputs.code == 'success'
&& github.event_name == 'pull_request'
&& env.SLACK_WEBHOOK != ''
uses: peter-evans/create-or-update-comment@v4
env:
Expand All @@ -416,8 +416,8 @@ jobs:

- name: "Pipeline not successful and not cancelled: Send Slack alert & create step summary"
if: |
steps.result.outputs.code == 'failure'
&& github.event.label.name == 'Run CICD'
steps.result.outputs.code == 'failure'
&& github.event.label.name == 'Run CICD'
&& env.SLACK_WEBHOOK != ''
env:
SLACK_WEBHOOK: ${{ secrets.SLACK_WEBHOOK }}
Expand Down Expand Up @@ -451,7 +451,7 @@ jobs:
needs.pre-flight.outputs.test_to_run != '[]'
&& needs.pre-flight.outputs.components_to_run != '[]'
&& (
success()
success()
|| needs.Nemo_CICD_Test.result == 'success'
)
&& !cancelled()
Expand Down
57 changes: 29 additions & 28 deletions docker/Dockerfile.ci.export_deploy
Original file line number Diff line number Diff line change
Expand Up @@ -32,64 +32,65 @@ apt-get install -y bc
apt-get clean
EOF

WORKDIR /tmp/NeMo
WORKDIR /opt/NeMo
ARG TRTLLM_REPO
ARG TRTLLM_TAG
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/tmp/NeMo/install_dep.sh bash -ex <<"EOF"
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/opt/NeMo/install_dep.sh \
--mount=type=bind,source=external/patches,target=/opt/NeMo/external/patches bash -ex <<"EOF"

bash /tmp/NeMo/install_dep.sh --library trt --mode install
bash /opt/NeMo/install_dep.sh --library trt --mode install
EOF

FROM base-image AS trt-llm-wheel
WORKDIR /tmp/NeMo
WORKDIR /opt/NeMo
ARG TRTLLM_REPO
ARG TRTLLM_TAG
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/tmp/NeMo/install_dep.sh bash -ex <<"EOF"
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/opt/NeMo/install_dep.sh \
--mount=type=bind,source=external/patches,target=/opt/NeMo/external/patches bash -ex <<"EOF"

bash /tmp/NeMo/install_dep.sh --library trtllm --mode build
bash /opt/NeMo/install_dep.sh --library trtllm --mode build
EOF

FROM base-image as te-wheel
WORKDIR /tmp/NeMo
WORKDIR /opt/NeMo
ARG TE_REPO
ARG TE_TAG
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/tmp/NeMo/install_dep.sh \
--mount=type=bind,source=external/patches,target=/tmp/NeMo/external/patches bash -ex <<"EOF"
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/opt/NeMo/install_dep.sh \
--mount=type=bind,source=external/patches,target=/opt/NeMo/external/patches bash -ex <<"EOF"

bash /tmp/NeMo/install_dep.sh --library te --mode build
ls -al /tmp/Megatron-LM || true
bash /opt/NeMo/install_dep.sh --library te --mode build
ls -al /opt/Megatron-LM || true
EOF

FROM base-image as mcore-wheel
WORKDIR /tmp/NeMo
WORKDIR /opt/NeMo
ARG MLM_REPO
ARG MLM_TAG
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/tmp/NeMo/install_dep.sh bash -ex <<"EOF"
RUN --mount=type=bind,source=docker/common/install_dep.sh,target=/opt/NeMo/install_dep.sh bash -ex <<"EOF"

bash /tmp/NeMo/install_dep.sh --library mcore --mode build
ls -al /tmp/Megatron-LM || true
bash /opt/NeMo/install_dep.sh --library mcore --mode build
ls -al /opt/Megatron-LM || true
EOF

FROM base-image
WORKDIR /tmp/NeMo
WORKDIR /opt/NeMo
ENV INSTALL_DIR="/opt"
RUN \
--mount=type=bind,from=trt-llm-wheel,source=/opt/wheels/trtllm,target=/opt/wheels/trtllm \
--mount=type=bind,from=te-wheel,source=/opt/wheels/te,target=/opt/wheels/te \
--mount=type=bind,from=mcore-wheel,source=/opt/wheels/mcore,target=/opt/wheels/mcore \
--mount=type=bind,source=requirements,target=/tmp/NeMo/requirements \
--mount=type=bind,source=tools/ctc_segmentation/requirements.txt,target=/tmp/NeMo/tools/ctc_segmentation/requirements.txt \
--mount=type=bind,source=docker/common/install_dep.sh,target=/tmp/NeMo/install_dep.sh \
--mount=type=bind,source=setup.py,target=/tmp/NeMo/setup.py \
--mount=type=bind,source=external/patches,target=/tmp/NeMo/external/patches \
--mount=type=bind,source=README.md,target=/tmp/NeMo/README.md \
--mount=type=bind,source=nemo/package_info.py,target=/tmp/NeMo/nemo/package_info.py \
--mount=type=bind,source=nemo/__init__.py,target=/tmp/NeMo/nemo/__init__.py bash -ex <<"EOF"

bash /tmp/NeMo/install_dep.sh --library all --mode install
--mount=type=bind,source=requirements,target=/opt/NeMo/requirements \
--mount=type=bind,source=tools/ctc_segmentation/requirements.txt,target=/opt/NeMo/tools/ctc_segmentation/requirements.txt \
--mount=type=bind,source=docker/common/install_dep.sh,target=/opt/NeMo/install_dep.sh \
--mount=type=bind,source=setup.py,target=/opt/NeMo/setup.py \
--mount=type=bind,source=external/patches,target=/opt/NeMo/external/patches \
--mount=type=bind,source=README.md,target=/opt/NeMo/README.md \
--mount=type=bind,source=nemo/package_info.py,target=/opt/NeMo/nemo/package_info.py \
--mount=type=bind,source=nemo/__init__.py,target=/opt/NeMo/nemo/__init__.py bash -ex <<"EOF"

bash /opt/NeMo/install_dep.sh --library all --mode install
pip install --no-cache-dir ".[deploy,test]"
rm -rf $NEMO_DIR || true


EOF

WORKDIR /workspace
Expand Down
28 changes: 13 additions & 15 deletions docker/common/install_dep.sh
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ trt() {
git submodule update --init --recursive
sed -i "/torch/d" requirements.txt
git lfs pull
patch -p1 < $CURR/external/patches/trt_llm.patch
popd

if [[ "$mode" == "install" ]]; then
Expand All @@ -81,11 +82,12 @@ trt() {
bash docker/common/install_ccache.sh

. docker/common/install_tensorrt.sh \
--TRT_VER="10.9.0.34" \
--CUDA_VER="12.8" \
--CUDNN_VER="9.8.0.87-1" \
--NCCL_VER="2.25.1-1+cuda12.8" \
--CUBLAS_VER="12.8.4.1-1"
--TRT_VER="10.10.0.31" \
--CUDA_VER="12.9" \
--CUDNN_VER="9.9.0.52-1" \
--NCCL_VER="2.26.5-1+cuda12.9" \
--CUBLAS_VER="12.9.0.13-1" \
--NVRTC_VER="12.9.41-1"
set -u
fi
fi
Expand Down Expand Up @@ -133,12 +135,15 @@ trtllm() {
git submodule update --init --recursive
sed -i "/torch/d" requirements.txt
git lfs pull
patch -p1 < $CURR/external/patches/trt_llm.patch
popd

build() {
if [[ "${NVIDIA_PYTORCH_VERSION}" != "" ]]; then
# CONDA_PREFIX causes an error in trt-llm's build script
unset CONDA_PREFIX
cd $TRTLLM_DIR
python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --dist_dir $WHEELS_DIR --python_bindings --benchmarks
TORCH_CXX_FLAGS="-D_GLIBCXX_USE_CXX11_ABI=1" python3 ./scripts/build_wheel.py --job_count $(nproc) --clean --trt_root /usr/local/tensorrt --dist_dir $WHEELS_DIR --python_bindings --benchmarks
fi
}

Expand All @@ -149,8 +154,7 @@ trtllm() {
build
fi

pip install --no-cache-dir $WHEELS_DIR/tensorrt_llm*.whl --extra-index-url https://pypi.nvidia.com &&
sed -i '57d' /usr/local/lib/python3.12/dist-packages/torch_tensorrt/dynamo/conversion/custom_ops_converters.py || true
pip install --no-cache-dir $WHEELS_DIR/tensorrt_llm*.whl --extra-index-url https://pypi.nvidia.com || true
fi
}

Expand All @@ -167,15 +171,14 @@ te() {
fi
pushd $TE_DIR
git checkout -f $TE_TAG
patch -p1 </$CURR/external/patches/nemo_2.3.0_te.patch
popd

build() {
if [[ "${NVIDIA_PYTORCH_VERSION}" != "" ]]; then
cd $TE_DIR
git submodule init
git submodule update
pip wheel --wheel-dir $WHEELS_DIR/ $TE_DIR
pip wheel --wheel-dir $WHEELS_DIR/ --no-build-isolation $TE_DIR
fi
}

Expand Down Expand Up @@ -308,11 +311,6 @@ extra() {
"git+https://github.com/NVIDIA/nvidia-resiliency-ext.git@b6eb61dbf9fe272b1a943b1b0d9efdde99df0737 ; platform_machine == 'x86_64'" # Compiling NvRX requires CUDA
)
fi
if [[ "${NVIDIA_PYTORCH_VERSION}" != "" ]]; then
patch \
/usr/local/lib/python3.12/dist-packages/torch/accelerator/__init__.py \
/$CURR/external/patches/torch_accelerator_144567_fix.patch
fi

if [[ "$mode" == "install" ]]; then
pip install --force-reinstall --no-deps --no-cache-dir "${DEPS[@]}"
Expand Down
Loading
Loading