Skip to content

fix(vlm): fail loudly in PP chunker when pixel_values cannot be align… #5639

fix(vlm): fail loudly in PP chunker when pixel_values cannot be align…

fix(vlm): fail loudly in PP chunker when pixel_values cannot be align… #5639

Workflow file for this run

# Installation Test workflow: installs the project with pip and uv on several
# runners/containers and checks that `nemo_automodel` can be imported.
name: Installation Test

# Triggered only by pushes to the branches listed below.
on:
  push:
    branches:
      - main
      - "pull-request/[0-9]+"
      - "deploy-release/*"

# A newer run in the same concurrency group cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-${{ github.event.label.name || 'main' }}-${{ github.event_name }}
  cancel-in-progress: true

jobs:
# Reusable pre-flight workflow. The other jobs in this file read its
# `docs_only` and `is_deployment_workflow` outputs to decide whether to run.
pre-flight:
  uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/_cicd_preflight.yml@v0.64.2
# Plain `pip install .` on GitHub-hosted runners (no CUDA), followed by an
# import check of the installed package.
pip-test:
  name: Pip - Python${{ matrix.python-version }} - ${{ matrix.arch == 'ubuntu-latest' && 'AMD64/Linux' || (matrix.arch == 'ubuntu-24.04-arm' && 'ARM64/Linux' || 'ARM64/Darwin') }} - No CUDA
  runs-on: ${{ matrix.arch }}
  needs: [pre-flight]
  # Skipped for docs-only changes and for deployment workflows.
  if: >-
    !(needs.pre-flight.outputs.docs_only == 'true'
    || needs.pre-flight.outputs.is_deployment_workflow == 'true')
  strategy:
    fail-fast: false
    matrix:
      arch:
        - "ubuntu-latest"
        - "macos-latest"
        # After setting the repo to public: "ubuntu-24.04-arm"
      # Versions are quoted so that "3.10" is not parsed as the float 3.1.
      python-version:
        - "3.10"
        - "3.11"
        - "3.12"
  steps:
    - name: Checkout repository
      uses: actions/checkout@v6

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v6
      with:
        python-version: ${{ matrix.python-version }}

    - name: Upgrade pip
      run: python -m pip install --upgrade pip

    - name: Install project
      run: pip install "."

    # The import-check action lives in the shared templates repository, so
    # that repository is checked out next to the project first.
    - name: Checkout check-imports
      uses: actions/checkout@v6
      with:
        repository: NVIDIA-NeMo/FW-CI-templates
        ref: v0.39.0
        path: FW-CI-templates

    - name: Check imports for nemo_automodel
      uses: ./FW-CI-templates/.github/actions/check-imports
      with:
        package-name: nemo_automodel
        python-binary: python
# Locked `uv sync` install with all groups and the `all` extra inside the
# NGC CUDA container, followed by an import check of the installed package.
ngc-cuda-test-uv:
  runs-on: linux-amd64-cpu16
  name: UV - Python${{ matrix.python-version }} - AMD64/Linux - NGC CUDA
  needs: [pre-flight]
  # Skipped for docs-only changes and for deployment workflows.
  if: |
    !(needs.pre-flight.outputs.docs_only == 'true'
    || needs.pre-flight.outputs.is_deployment_workflow == 'true')
  container:
    image: nvcr.io/nvidia/cuda:12.8.1-cudnn-devel-ubuntu24.04
  environment: nemo-ci
  strategy:
    fail-fast: false
    matrix:
      python-version: ["3.12"]
  steps:
    - name: Checkout repository
      uses: actions/checkout@v6
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v6
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install wget
      # No `shell:` is set, so the script sticks to POSIX sh constructs.
      run: |
        # Retry apt up to three times, and fail the step explicitly when
        # every attempt failed. Without the final check the loop would end
        # on a successful `sleep` and the step would pass without wget/git.
        installed=false
        for i in 1 2 3; do
          if apt-get update && apt-get install -y wget git; then
            installed=true
            break
          fi
          echo "Attempt $i failed, retrying in 10s..."
          sleep 10
        done
        if [ "$installed" != "true" ]; then
          echo "::error::apt-get could not install wget and git after 3 attempts"
          exit 1
        fi
    - name: Upgrade pip
      run: |
        python -m pip install --upgrade pip
    - name: Set up UV
      uses: astral-sh/setup-uv@v1
      with:
        version: 0.9.26
    - name: Set up yq
      shell: bash
      run: |
        wget https://github.com/mikefarah/yq/releases/download/v4.45.4/yq_linux_amd64.tar.gz
        tar -xzf yq_linux_amd64.tar.gz
        mkdir -p ./bin
        mv yq_linux_amd64 ./bin/yq
        chmod +x ./bin/yq
    - name: Install project
      env:
        PAT: ${{ secrets.PAT }}
        UV_PROJECT_ENVIRONMENT: ./venv
        TORCH_CUDA_ARCH_LIST: "9.0 10.0 12.0"
      shell: bash
      run: |
        # Write the GitHub credentials from the PAT environment variable
        # rather than splicing the secret expression into the script text.
        # printf also avoids `echo -e` interpreting backslashes in the token.
        printf 'machine github.com\n login token\n password %s\n' "${PAT}" > ~/.netrc
        chmod 600 ~/.netrc
        uv venv "${UV_PROJECT_ENVIRONMENT}" --system-site-packages
        source ./venv/bin/activate
        # Put the yq binary downloaded in the previous step on PATH.
        export PATH="./bin/:$PATH"
        uv sync --link-mode copy --locked --all-groups --extra all
        uv pip install --no-deps -e .
    # The import-check action lives in the shared templates repository.
    - name: Checkout check-imports
      uses: actions/checkout@v6
      with:
        repository: NVIDIA-NeMo/FW-CI-templates
        ref: v0.39.0
        path: FW-CI-templates
    - name: Check imports for nemo_automodel
      uses: ./FW-CI-templates/.github/actions/check-imports
      with:
        package-name: nemo_automodel
        python-binary: ./venv/bin/python
# Builds (or restores from cache) wheels for the CUDA dependencies listed in
# the `pip wheel` command below and publishes them as a short-lived artifact
# that `ngc-cuda-test-pip` downloads.
cuda-wheelhouse:
  name: Build CUDA wheelhouse - Python${{ matrix.python-version }} - AMD64/Linux - NGC CUDA
  runs-on: linux-amd64-cpu16
  environment: nemo-ci
  container:
    image: nvcr.io/nvidia/cuda:12.8.1-cudnn-devel-ubuntu24.04
  needs: [pre-flight]
  # Skipped for docs-only changes and for deployment workflows.
  if: >-
    !(needs.pre-flight.outputs.docs_only == 'true'
    || needs.pre-flight.outputs.is_deployment_workflow == 'true')
  strategy:
    fail-fast: false
    matrix:
      python-version:
        - "3.12"
  env:
    WHEELHOUSE_DIR: /tmp/cuda-wheelhouse
    TORCH_CUDA_ARCH_LIST: "9.0 10.0 12.0"
  steps:
    - name: Checkout repository
      uses: actions/checkout@v6

    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v6
      with:
        python-version: ${{ matrix.python-version }}

    # The cache key changes whenever the project metadata, the lock file or
    # this workflow file changes.
    - name: Restore CUDA wheelhouse cache
      id: cuda-wheelhouse-cache
      uses: actions/cache@v4
      with:
        path: ${{ env.WHEELHOUSE_DIR }}
        key: install-test-cuda-wheelhouse-${{ runner.os }}-py${{ matrix.python-version }}-${{ hashFiles('pyproject.toml', 'uv.lock', '.github/workflows/install-test.yml') }}
        restore-keys: |
          install-test-cuda-wheelhouse-${{ runner.os }}-py${{ matrix.python-version }}-
          install-test-cuda-wheelhouse-${{ runner.os }}-

    # Runs only when the cache step did not report an exact key hit.
    - name: Build cached CUDA dependency wheels
      if: steps.cuda-wheelhouse-cache.outputs.cache-hit != 'true'
      shell: bash -x -e -u -o pipefail {0}
      run: |
        python -m venv ./venv
        . ./venv/bin/activate
        pip install --upgrade pip
        # torch and the build helpers are installed first because the wheels
        # below are built with --no-build-isolation.
        pip install --index-url https://download.pytorch.org/whl/cu128 "torch<=2.8.0"
        pip install numpy packaging psutil pybind11 setuptools wheel wheel_stub
        mkdir -p "${WHEELHOUSE_DIR}"
        pip wheel --no-deps --no-build-isolation --wheel-dir "${WHEELHOUSE_DIR}" \
          causal-conv1d \
          mamba-ssm \
          nv-grouped-gemm \
          transformer-engine-torch \
          "transformer-engine[pytorch]<=2.11.0"

    - name: Verify CUDA wheelhouse contents
      shell: bash -x -e -u -o pipefail {0}
      run: |
        # Ensure cache hit and freshly-built paths both provide the expected heavy wheels.
        ls -1 "${WHEELHOUSE_DIR}"/*.whl
        for wheel_prefix in causal_conv1d mamba_ssm nv_grouped_gemm transformer_engine_torch; do
          test -n "$(ls -1 "${WHEELHOUSE_DIR}"/${wheel_prefix}*.whl 2>/dev/null)"
        done

    - name: Upload CUDA wheelhouse artifact
      uses: actions/upload-artifact@v6
      with:
        name: cuda-wheelhouse-py${{ matrix.python-version }}
        path: ${{ env.WHEELHOUSE_DIR }}/*.whl
        if-no-files-found: error
        retention-days: 1
# `pip install .[extra]` inside the NGC CUDA container for each extras group,
# using the wheels produced by the `cuda-wheelhouse` job, followed by an
# import check of the installed package.
ngc-cuda-test-pip:
  runs-on: linux-amd64-cpu16
  name: Pip - Python${{ matrix.python-version }}${{ matrix.extra-groups != '' && format('[{0}]', matrix.extra-groups) || '' }} - AMD64/Linux - NGC CUDA
  container:
    image: nvcr.io/nvidia/cuda:12.8.1-cudnn-devel-ubuntu24.04
  environment: nemo-ci
  needs: [pre-flight, cuda-wheelhouse]
  # Skipped for docs-only changes and for deployment workflows.
  if: |
    !(needs.pre-flight.outputs.docs_only == 'true'
    || needs.pre-flight.outputs.is_deployment_workflow == 'true')
  strategy:
    fail-fast: false
    matrix:
      # NOTE(review): this job installs python3 via apt instead of
      # actions/setup-python, so python-version only feeds the job name and
      # the artifact name here - confirm the apt Python matches it.
      python-version: ["3.12"]
      extra-groups: ["", "cuda", "vlm", "fa", "all"]
  env:
    # Either empty or "[<group>]"; appended to "." in the install command.
    EXTRA: ${{ matrix.extra-groups != '' && format('[{0}]', matrix.extra-groups) || '' }}
    WHEELHOUSE_DIR: /tmp/cuda-wheelhouse
  steps:
    - name: Checkout repository
      uses: actions/checkout@v6
    - name: Download cached CUDA wheelhouse
      uses: actions/download-artifact@v7
      with:
        name: cuda-wheelhouse-py${{ matrix.python-version }}
        path: ${{ env.WHEELHOUSE_DIR }}
    - name: Install automodel${{ matrix.extra-groups != '' && format('[{0}]', matrix.extra-groups) || '' }}
      shell: bash -x -e -u -o pipefail {0}
      run: |
        # Retry apt up to three times, and fail the step explicitly when
        # every attempt failed. Without the final check the loop would end
        # on a successful `sleep` and the script would carry on without
        # python3/git.
        apt_ok=false
        for i in 1 2 3; do
          if apt-get update && apt-get install -y python3 python3-pip python3-venv git; then
            apt_ok=true
            break
          fi
          echo "Attempt $i failed, retrying in 10s..."
          sleep 10
        done
        if [[ "${apt_ok}" != "true" ]]; then
          echo "::error::apt-get could not install python3, pip, venv and git after 3 attempts"
          exit 1
        fi
        python3 -m venv ./venv
        . ./venv/bin/activate
        export PIP_FIND_LINKS="${WHEELHOUSE_DIR}"
        export PIP_PREFER_BINARY=1
        pip install --upgrade pip
        PIP_ARGS=()
        # Extras that pull in CUDA extensions need torch and the build
        # helpers installed up front because they build without isolation.
        if [[ $EXTRA == *"fa"* || $EXTRA == *"cuda"* || $EXTRA == *"all"* ]]; then
          pip install --find-links "${PIP_FIND_LINKS}" --index-url https://download.pytorch.org/whl/cu128 "torch<=2.8.0"
          pip install --find-links "${PIP_FIND_LINKS}" numpy packaging psutil pybind11 setuptools wheel wheel_stub
          PIP_ARGS=(--no-build-isolation)
          export TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0"
        fi
        # Quoted on purpose: ".[cuda]" is a valid glob pattern and would be
        # expanded by the shell if a matching dotfile existed in the checkout.
        pip install --find-links "${PIP_FIND_LINKS}" "${PIP_ARGS[@]}" ".${EXTRA}"
    # The import-check action lives in the shared templates repository.
    - name: Checkout check-imports
      uses: actions/checkout@v6
      with:
        repository: NVIDIA-NeMo/FW-CI-templates
        ref: v0.39.0
        path: FW-CI-templates
    - name: Check imports for nemo_automodel
      uses: ./FW-CI-templates/.github/actions/check-imports
      with:
        package-name: nemo_automodel
        python-binary: ./venv/bin/python
# Locked `uv sync` install on non-CUDA runners, followed by an import check
# of the installed package.
uv-test:
  runs-on: ${{ matrix.arch }}
  # NOTE(review): this matrix defines no `python-version`, so that part of
  # the name renders empty, and any runner other than 'ubuntu-latest' is
  # labelled 'ARM64/Darwin'. The name is left unchanged because other
  # configuration may reference it - confirm before renaming.
  name: UV - Python ${{ matrix.python-version }} - ${{ matrix.arch == 'ubuntu-latest' && 'AMD64/Linux' || 'ARM64/Darwin' }} - No CUDA
  needs: [pre-flight]
  # Skipped for docs-only changes and for deployment workflows.
  if: |
    !(needs.pre-flight.outputs.docs_only == 'true'
    || needs.pre-flight.outputs.is_deployment_workflow == 'true')
  environment: nemo-ci
  strategy:
    fail-fast: false
    matrix:
      # The first runner comes from a repository variable when it is set.
      arch: ["${{ vars.LINT_AND_INSTALL_TEST_RUNNER || 'ubuntu-latest' }}", "macos-latest"]
  steps:
    - name: Checkout repository
      uses: actions/checkout@v6
    - name: Set up UV
      uses: astral-sh/setup-uv@v1
      with:
        version: 0.9.26
    - name: Install dependencies with UV
      env:
        UV_PROJECT_ENVIRONMENT: ./venv
        PAT: ${{ secrets.PAT }}
      run: |
        # Write the GitHub credentials from the PAT environment variable
        # rather than splicing the secret expression into the script text.
        # printf also avoids `echo -e` interpreting backslashes in the token.
        printf 'machine github.com\n login token\n password %s\n' "${PAT}" > ~/.netrc
        chmod 600 ~/.netrc
        export PATH="${UV_PROJECT_ENVIRONMENT}/bin/:$PATH"
        uv venv "${UV_PROJECT_ENVIRONMENT}" --system-site-packages
        # The build group is synced first, then all groups.
        uv sync --link-mode copy --locked --only-group build
        uv sync --link-mode copy --locked --all-groups
        uv pip install --no-deps -e .
    # The import-check action lives in the shared templates repository.
    - name: Checkout check-imports
      uses: actions/checkout@v6
      with:
        repository: NVIDIA-NeMo/FW-CI-templates
        ref: v0.39.0
        path: FW-CI-templates
    - name: Check imports for nemo_automodel
      uses: ./FW-CI-templates/.github/actions/check-imports
      with:
        package-name: nemo_automodel
        python-binary: ./venv/bin/python
# Aggregates the results of every install job into a single pass/fail job.
install-test-summary:
  needs: [pip-test, uv-test, ngc-cuda-test-uv, cuda-wheelhouse, ngc-cuda-test-pip, pre-flight]
  runs-on: ubuntu-latest
  name: Install test summary
  # The `|| always()` term makes the parenthesised group true in every case,
  # so this job runs unless the workflow run was cancelled.
  if: |
    (
      needs.pre-flight.outputs.docs_only == 'true'
      || needs.pre-flight.outputs.is_deployment_workflow == 'true'
      || always()
    )
    && !cancelled()
  steps:
    - name: Checkout
      uses: actions/checkout@v6
    - name: Get workflow result
      id: result
      shell: bash -x -e -u -o pipefail {0}
      env:
        GH_TOKEN: ${{ github.token }}
        RUN_ID: ${{ github.run_id }}
        # "true" when the install jobs are skipped on purpose (docs-only or
        # deployment run); non-successful conclusions are then tolerated.
        SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
      run: |
        # Count completed jobs whose conclusion is anything other than
        # "success". If the query itself fails, keep the previous
        # best-effort behaviour (treat it as zero failures) but assign the
        # fallback explicitly and surface a warning, instead of relying on
        # `|| echo 0`, which only printed a stray 0.
        if ! FAILED_JOBS=$(gh run view "${RUN_ID}" --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length'); then
          echo "::warning::Could not query job results for run ${RUN_ID}; assuming 0 failed jobs"
          FAILED_JOBS=0
        fi
        if [ "${FAILED_JOBS:-0}" -eq 0 ] || [ "$SKIPPING_IS_ALLOWED" = "true" ]; then
          echo "✅ All previous jobs completed successfully"
          exit 0
        else
          echo "❌ Found $FAILED_JOBS failed job(s)"
          # Show which jobs failed
          gh run view "${RUN_ID}" --json jobs --jq '.jobs[] | select(.status == "completed" and .conclusion != "success") | .name'
          exit 1
        fi