Merge branch 'main' into zeel2104/feat/yaml-linter #5653
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: Installation Test | |
| on: | |
| push: | |
| branches: | |
| - main | |
| - "pull-request/[0-9]+" | |
| - "deploy-release/*" | |
| concurrency: | |
| group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }} | |
| cancel-in-progress: true | |
| jobs: | |
| pre-flight: | |
| uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2 | |
| pip-test: | |
| runs-on: ${{ matrix.arch }} | |
| name: Pip - Python${{ matrix.python-version }} - ${{ matrix.arch == 'ubuntu-latest' && 'AMD64/Linux' || (matrix.arch == 'ubuntu-24.04-arm' && 'ARM64/Linux' || 'ARM64/Darwin') }} - No CUDA | |
| needs: [pre-flight] | |
| if: | | |
| !(needs.pre-flight.outputs.docs_only == 'true' | |
| || needs.pre-flight.outputs.is_deployment_workflow == 'true') | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| arch: ["ubuntu-latest", "macos-latest"] # After setting the repo to public: "ubuntu-24.04-arm"] | |
| python-version: ["3.10", "3.11", "3.12"] | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v6 | |
| - name: Set up Python ${{ matrix.python-version }} | |
| uses: actions/setup-python@v6 | |
| with: | |
| python-version: ${{ matrix.python-version }} | |
| - name: Upgrade pip | |
| run: | | |
| python -m pip install --upgrade pip | |
| - name: Install project | |
| run: | | |
| pip install "." | |
| - name: Checkout check-imports | |
| uses: actions/checkout@v6 | |
| with: | |
| repository: NVIDIA-NeMo/FW-CI-templates | |
| ref: v0.39.0 | |
| path: FW-CI-templates | |
| - name: Check imports for nemo_automodel | |
| uses: ./FW-CI-templates/.github/actions/check-imports | |
| with: | |
| package-name: nemo_automodel | |
| python-binary: python | |
| ngc-cuda-test-uv: | |
| runs-on: linux-amd64-cpu16 | |
| name: UV - Python${{ matrix.python-version }} - AMD64/Linux - NGC CUDA | |
| needs: [pre-flight] | |
| if: | | |
| !(needs.pre-flight.outputs.docs_only == 'true' | |
| || needs.pre-flight.outputs.is_deployment_workflow == 'true') | |
| container: | |
| image: nvcr.io/nvidia/cuda:12.8.1-cudnn-devel-ubuntu24.04 | |
| environment: nemo-ci | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| python-version: ["3.12"] | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v6 | |
| - name: Set up Python ${{ matrix.python-version }} | |
| uses: actions/setup-python@v6 | |
| with: | |
| python-version: ${{ matrix.python-version }} | |
| - name: Install wget | |
| run: | | |
| for i in 1 2 3; do | |
| apt-get update && apt-get install -y wget git && break | |
| echo "Attempt $i failed, retrying in 10s..." | |
| sleep 10 | |
| done | |
| - name: Upgrade pip | |
| run: | | |
| python -m pip install --upgrade pip | |
| - name: Set up UV | |
| uses: astral-sh/setup-uv@v1 | |
| with: | |
| version: 0.9.26 | |
| - name: Set up yq | |
| shell: bash | |
| run: | | |
| wget https://github.com/mikefarah/yq/releases/download/v4.45.4/yq_linux_amd64.tar.gz | |
| tar -xzf yq_linux_amd64.tar.gz | |
| mkdir -p ./bin | |
| mv yq_linux_amd64 ./bin/yq | |
| chmod +x ./bin/yq | |
| - name: Install project | |
| env: | |
| PAT: ${{ secrets.PAT }} | |
| UV_PROJECT_ENVIRONMENT: ./venv | |
| TORCH_CUDA_ARCH_LIST: "9.0 10.0 12.0" | |
| shell: bash | |
| run: | | |
| echo -e "machine github.com\n login token\n password ${{ secrets.PAT }}" > ~/.netrc | |
| chmod 600 ~/.netrc | |
| uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages | |
| source ./venv/bin/activate | |
| export PATH="./bin/:$PATH" | |
| uv sync --link-mode copy --locked --all-groups --extra all | |
| uv pip install --no-deps -e . | |
| - name: Checkout check-imports | |
| uses: actions/checkout@v6 | |
| with: | |
| repository: NVIDIA-NeMo/FW-CI-templates | |
| ref: v0.39.0 | |
| path: FW-CI-templates | |
| - name: Check imports for nemo_automodel | |
| uses: ./FW-CI-templates/.github/actions/check-imports | |
| with: | |
| package-name: nemo_automodel | |
| python-binary: ./venv/bin/python | |
| cuda-wheelhouse: | |
| runs-on: linux-amd64-cpu16 | |
| name: Build CUDA wheelhouse - Python${{ matrix.python-version }} - AMD64/Linux - NGC CUDA | |
| container: | |
| image: nvcr.io/nvidia/cuda:12.8.1-cudnn-devel-ubuntu24.04 | |
| environment: nemo-ci | |
| needs: [pre-flight] | |
| if: | | |
| !(needs.pre-flight.outputs.docs_only == 'true' | |
| || needs.pre-flight.outputs.is_deployment_workflow == 'true') | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| python-version: ["3.12"] | |
| env: | |
| WHEELHOUSE_DIR: /tmp/cuda-wheelhouse | |
| TORCH_CUDA_ARCH_LIST: "9.0 10.0 12.0" | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v6 | |
| - name: Set up Python ${{ matrix.python-version }} | |
| uses: actions/setup-python@v6 | |
| with: | |
| python-version: ${{ matrix.python-version }} | |
| - name: Restore CUDA wheelhouse cache | |
| id: cuda-wheelhouse-cache | |
| uses: actions/cache@v4 | |
| with: | |
| path: ${{ env.WHEELHOUSE_DIR }} | |
| key: install-test-cuda-wheelhouse-${{ runner.os }}-py${{ matrix.python-version }}-${{ hashFiles('pyproject.toml', 'uv.lock', '.github/workflows/install-test.yml') }} | |
| restore-keys: | | |
| install-test-cuda-wheelhouse-${{ runner.os }}-py${{ matrix.python-version }}- | |
| install-test-cuda-wheelhouse-${{ runner.os }}- | |
| - name: Build cached CUDA dependency wheels | |
| if: steps.cuda-wheelhouse-cache.outputs.cache-hit != 'true' | |
| shell: bash -x -e -u -o pipefail {0} | |
| run: | | |
| python -m venv ./venv | |
| . ./venv/bin/activate | |
| pip install --upgrade pip | |
| pip install --index-url https://download.pytorch.org/whl/cu128 "torch<=2.8.0" | |
| pip install numpy packaging psutil pybind11 setuptools wheel wheel_stub | |
| mkdir -p "${WHEELHOUSE_DIR}" | |
| pip wheel --no-deps --no-build-isolation --wheel-dir "${WHEELHOUSE_DIR}" \ | |
| causal-conv1d \ | |
| mamba-ssm \ | |
| nv-grouped-gemm \ | |
| transformer-engine-torch \ | |
| "transformer-engine[pytorch]<=2.11.0" | |
| - name: Verify CUDA wheelhouse contents | |
| shell: bash -x -e -u -o pipefail {0} | |
| run: | | |
| # Ensure cache hit and freshly-built paths both provide the expected heavy wheels. | |
| ls -1 "${WHEELHOUSE_DIR}"/*.whl | |
| test -n "$(ls -1 "${WHEELHOUSE_DIR}"/causal_conv1d*.whl 2>/dev/null)" | |
| test -n "$(ls -1 "${WHEELHOUSE_DIR}"/mamba_ssm*.whl 2>/dev/null)" | |
| test -n "$(ls -1 "${WHEELHOUSE_DIR}"/nv_grouped_gemm*.whl 2>/dev/null)" | |
| test -n "$(ls -1 "${WHEELHOUSE_DIR}"/transformer_engine_torch*.whl 2>/dev/null)" | |
| - name: Upload CUDA wheelhouse artifact | |
| uses: actions/upload-artifact@v6 | |
| with: | |
| name: cuda-wheelhouse-py${{ matrix.python-version }} | |
| path: ${{ env.WHEELHOUSE_DIR }}/*.whl | |
| if-no-files-found: error | |
| retention-days: 1 | |
| ngc-cuda-test-pip: | |
| runs-on: linux-amd64-cpu16 | |
| name: Pip - Python${{ matrix.python-version }}${{ matrix.extra-groups != '' && format('[{0}]', matrix.extra-groups) || '' }} - AMD64/Linux - NGC CUDA | |
| container: | |
| image: nvcr.io/nvidia/cuda:12.8.1-cudnn-devel-ubuntu24.04 | |
| environment: nemo-ci | |
| needs: [pre-flight, cuda-wheelhouse] | |
| if: | | |
| !(needs.pre-flight.outputs.docs_only == 'true' | |
| || needs.pre-flight.outputs.is_deployment_workflow == 'true') | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| python-version: ["3.12"] | |
| extra-groups: ["", "cuda", "vlm", "fa", "all"] | |
| env: | |
| EXTRA: ${{ matrix.extra-groups != '' && format('[{0}]', matrix.extra-groups) || '' }} | |
| WHEELHOUSE_DIR: /tmp/cuda-wheelhouse | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v6 | |
| - name: Download cached CUDA wheelhouse | |
| uses: actions/download-artifact@v7 | |
| with: | |
| name: cuda-wheelhouse-py${{ matrix.python-version }} | |
| path: ${{ env.WHEELHOUSE_DIR }} | |
| - name: Install automodel${{ matrix.extra-groups != '' && format('[{0}]', matrix.extra-groups) || '' }} | |
| shell: bash -x -e -u -o pipefail {0} | |
| run: | | |
| for i in 1 2 3; do | |
| apt-get update && apt-get install -y python3 python3-pip python3-venv git && break | |
| echo "Attempt $i failed, retrying in 10s..." | |
| sleep 10 | |
| done | |
| python3 -m venv ./venv | |
| . ./venv/bin/activate | |
| export PIP_FIND_LINKS="${WHEELHOUSE_DIR}" | |
| export PIP_PREFER_BINARY=1 | |
| pip install --upgrade pip | |
| PIP_ARGS=() | |
| if [[ $EXTRA == *"fa"* || $EXTRA == *"cuda"* || $EXTRA == *"all"* ]]; then | |
| pip install --find-links "${PIP_FIND_LINKS}" --index-url https://download.pytorch.org/whl/cu128 "torch<=2.8.0" | |
| pip install --find-links "${PIP_FIND_LINKS}" numpy packaging psutil pybind11 setuptools wheel wheel_stub | |
| PIP_ARGS=(--no-build-isolation) | |
| export TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" | |
| fi | |
| pip install --find-links "${PIP_FIND_LINKS}" ${PIP_ARGS[@]} .$EXTRA | |
| - name: Checkout check-imports | |
| uses: actions/checkout@v6 | |
| with: | |
| repository: NVIDIA-NeMo/FW-CI-templates | |
| ref: v0.39.0 | |
| path: FW-CI-templates | |
| - name: Check imports for nemo_automodel | |
| uses: ./FW-CI-templates/.github/actions/check-imports | |
| with: | |
| package-name: nemo_automodel | |
| python-binary: ./venv/bin/python | |
| uv-test: | |
| runs-on: ${{ matrix.arch }} | |
| name: UV - Python ${{ matrix.python-version }} - ${{ matrix.arch == 'ubuntu-latest' && 'AMD64/Linux' || 'ARM64/Darwin' }} - No CUDA | |
| needs: [pre-flight] | |
| if: | | |
| !(needs.pre-flight.outputs.docs_only == 'true' | |
| || needs.pre-flight.outputs.is_deployment_workflow == 'true') | |
| environment: nemo-ci | |
| strategy: | |
| fail-fast: false | |
| matrix: | |
| arch: ["${{ vars.LINT_AND_INSTALL_TEST_RUNNER || 'ubuntu-latest' }}", "macos-latest"] | |
| steps: | |
| - name: Checkout repository | |
| uses: actions/checkout@v6 | |
| - name: Set up UV | |
| uses: astral-sh/setup-uv@v1 | |
| with: | |
| version: 0.9.26 | |
| - name: Install dependencies with UV | |
| env: | |
| UV_PROJECT_ENVIRONMENT: ./venv | |
| PAT: ${{ secrets.PAT }} | |
| run: | | |
| echo -e "machine github.com\n login token\n password ${{ secrets.PAT }}" > ~/.netrc | |
| chmod 600 ~/.netrc | |
| export PATH="${UV_PROJECT_ENVIRONMENT}/bin/:$PATH" | |
| uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages | |
| uv sync --link-mode copy --locked --only-group build | |
| uv sync --link-mode copy --locked --all-groups | |
| uv pip install --no-deps -e . | |
| - name: Checkout check-imports | |
| uses: actions/checkout@v6 | |
| with: | |
| repository: NVIDIA-NeMo/FW-CI-templates | |
| ref: v0.39.0 | |
| path: FW-CI-templates | |
| - name: Check imports for nemo_automodel | |
| uses: ./FW-CI-templates/.github/actions/check-imports | |
| with: | |
| package-name: nemo_automodel | |
| python-binary: ./venv/bin/python | |
| install-test-summary: | |
| needs: [pip-test, uv-test, ngc-cuda-test-uv, cuda-wheelhouse, ngc-cuda-test-pip, pre-flight] | |
| runs-on: ubuntu-latest | |
| name: Install test summary | |
| if: | | |
| ( | |
| needs.pre-flight.outputs.docs_only == 'true' | |
| || needs.pre-flight.outputs.is_deployment_workflow == 'true' | |
| || always() | |
| ) | |
| && !cancelled() | |
| steps: | |
| - name: Checkout | |
| uses: actions/checkout@v6 | |
| - name: Get workflow result | |
| id: result | |
| shell: bash -x -e -u -o pipefail {0} | |
| env: | |
| GH_TOKEN: ${{ github.token }} | |
| RUN_ID: ${{ github.run_id }} | |
| SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }} | |
| run: | | |
| FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0 | |
| if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" == "true" ]; then | |
| echo "✅ All previous jobs completed successfully" | |
| exit 0 | |
| else | |
| echo "❌ Found $FAILED_JOBS failed job(s)" | |
| # Show which jobs failed | |
| gh run view $GITHUB_RUN_ID --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name' | |
| exit 1 | |
| fi |