# NOTE(review): the following lines are GitHub web-UI chrome captured when this
# workflow was copied from the "Workflow file for this run" page; kept as
# comments so the document parses as YAML.
#   Skip to content
#   tilegym-ci
#   tilegym-ci #307
#   Workflow file for this run
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# SPDX-License-Identifier: MIT
---
name: tilegym-ci

on:
  push:
    branches:
      - "pull-request/[0-9]+"
  schedule:
    # Run nightly at 12 PM UTC
    - cron: '0 12 * * *'
  workflow_dispatch: # Allow manual trigger

permissions:
  contents: read
  packages: write
  pull-requests: read
  checks: write

env:
  # PR images go to a temp repo, main/nightly go to main repo
  IMAGE_NAME_PR: tilegym-pr
  IMAGE_NAME_MAIN: tilegym

jobs:
  # Decide run context (PR vs main/nightly), parse the optional CI config from
  # the PR body, pick the image name/tag, and determine whether the build and
  # test jobs need to run at all.
  config:
    name: parse-ci-config
    runs-on: ubuntu-latest
    outputs:
      build: ${{ steps.parse.outputs.build }}
      run_ops: ${{ steps.parse.outputs.run_ops }}
      run_benchmark: ${{ steps.parse.outputs.run_benchmark }}
      run_sanity: ${{ steps.parse.outputs.run_sanity }}
      image_tag: ${{ steps.parse.outputs.image_tag }}
      image_name: ${{ steps.parse.outputs.image_name }}
      is_pr: ${{ steps.context.outputs.is_pr }}
    steps:
      - name: Determine context
        id: context
        run: |
          if [[ "${{ github.ref }}" == "refs/heads/main" ]] || [[ "${{ github.event_name }}" == "schedule" ]]; then
            echo "is_pr=false" >> $GITHUB_OUTPUT
            echo "image_name=${{ env.IMAGE_NAME_MAIN }}" >> $GITHUB_OUTPUT
            echo "Running in main/nightly context"
          else
            echo "is_pr=true" >> $GITHUB_OUTPUT
            echo "image_name=${{ env.IMAGE_NAME_PR }}" >> $GITHUB_OUTPUT
            echo "Running in PR context"
          fi
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Get PR info
        id: pr
        uses: actions/github-script@v7
        with:
          script: |
            let prBody = '';
            let prNumber = '';
            const branchName = context.ref.replace('refs/heads/', '');
            core.info(`Looking for PR for branch: ${branchName}`);
            // Try method 1: Extract PR number from branch name
            const branchMatch = branchName.match(/^pull-request\/(\d+)/);
            if (branchMatch) {
              prNumber = branchMatch[1];
              core.info(`Extracted PR #${prNumber} from branch name`);
              // Fetch PR body by number
              try {
                const { data: pr } = await github.rest.pulls.get({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  pull_number: parseInt(prNumber),
                });
                prBody = pr.body || '';
                core.info(`Fetched PR body (${prBody.length} characters)`);
              } catch (error) {
                core.warning(`Failed to fetch PR #${prNumber}: ${error.message}`);
              }
            } else {
              // Try method 2: Search by branch name
              try {
                const { data: prs } = await github.rest.pulls.list({
                  owner: context.repo.owner,
                  repo: context.repo.repo,
                  state: 'open',
                  head: `${context.repo.owner}:${branchName}`,
                });
                if (prs.length > 0) {
                  prBody = prs[0].body || '';
                  prNumber = prs[0].number.toString();
                  core.info(`Found PR #${prNumber} via API search`);
                  core.info(`PR body length: ${prBody.length} characters`);
                } else {
                  core.info(`No open PR found for branch ${branchName}`);
                }
              } catch (error) {
                core.warning(`Error searching for PR: ${error.message}`);
              }
            }
            return { prBody, prNumber };
      - name: Parse config and set image tag
        id: parse
        env:
          PR_BODY: ${{ fromJSON(steps.pr.outputs.result).prBody }}
          PR_NUMBER: ${{ fromJSON(steps.pr.outputs.result).prNumber }}
          IS_PR: ${{ steps.context.outputs.is_pr }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Parse CI config from PR body (only for PRs)
          if [[ "$IS_PR" == "true" ]]; then
            pip install pyyaml --quiet
            python3 .github/scripts/parse_pr_config.py
            # Set PR-specific image tag
            if [ -n "$PR_NUMBER" ]; then
              echo "image_tag=pr-${PR_NUMBER}" >> $GITHUB_OUTPUT
              echo "Using image tag: pr-${PR_NUMBER}"
            else
              echo "image_tag=latest" >> $GITHUB_OUTPUT
              echo "Using image tag: latest (PR without number)"
            fi
          else
            # Main/nightly: check if image already exists before building
            echo "image_tag=${{ github.sha }}" >> $GITHUB_OUTPUT
            # Check if 'latest' already points to current SHA (tests passed previously)
            OWNER_LOWER=$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')
            export REGISTRY_IMAGE="ghcr.io/${OWNER_LOWER}/${{ steps.context.outputs.image_name }}"
            export IMAGE_TAG="${{ github.sha }}"
            export IS_PR="false"
            python3 .github/scripts/check_image_exists.py
            # Read the skipped output from check_image_exists.py
            # (the script appends to $GITHUB_OUTPUT, so grep the file back
            # rather than re-deriving the result)
            if [ -f "$GITHUB_OUTPUT" ] && grep -q "skipped=true" "$GITHUB_OUTPUT"; then
              echo "✅ Image already exists and tests passed, skipping build"
              echo "build=false" >> $GITHUB_OUTPUT
              echo "run_ops=false" >> $GITHUB_OUTPUT
              echo "run_benchmark=false" >> $GITHUB_OUTPUT
              echo "run_sanity=false" >> $GITHUB_OUTPUT
            else
              echo "🔨 Building new image and running tests"
              echo "build=true" >> $GITHUB_OUTPUT
              echo "run_ops=true" >> $GITHUB_OUTPUT
              echo "run_benchmark=true" >> $GITHUB_OUTPUT
              echo "run_sanity=true" >> $GITHUB_OUTPUT
            fi
          fi
          # Pass through image name from context
          echo "image_name=${{ steps.context.outputs.image_name }}" >> $GITHUB_OUTPUT

  build-wheel:
    name: build-python-wheels
    needs: config
    if: needs.config.outputs.build == 'true'
    uses: ./.github/workflows/build-wheel.yml
    with:
      package-name: tilegym
      # Artifact naming: PR builds -> tilegym-pr-wheel-{sha}, Main -> tilegym-wheel-{sha}
      artifact-suffix: ${{ needs.config.outputs.is_pr == 'true' && '-pr' || '' }}
      python-versions: '["3.10", "3.11", "3.12"]' # Build for multiple Python versions
      architectures: '["x86_64", "arm64"]' # Build for both architectures (6 wheels total)
      retention-days: 7 # All wheels kept for 7 days; only tested wheel gets -verified (30 days)
      skip-import-test: true # TileGym requires CUDA, test in Docker instead
      run-pip-audit: true
      check-wheel-contents-ignore: "W002" # Ignore duplicate files from symlinks
      # Optional: Override default runners (ubuntu-latest for x86_64, ubuntu-24.04-arm for arm64)
      # runner-x86-64: [self-hosted, linux, x64, gpu]

  build:
    name: build-tilegym-image
    needs: [config, build-wheel]
    if: needs.config.outputs.build == 'true'
    runs-on: ubuntu-latest
    # Note: Wheels are built for Python 3.10/3.11/3.12 and x86_64/arm64 (6 wheels total)
    # However, Docker build and tests only use Python 3.10 x86_64 wheel
    # Only the tested wheel (py310-x86_64) gets marked as "-verified" after tests pass
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
      - name: Download TileGym wheel (Python 3.10, x86_64)
        uses: actions/download-artifact@v4
        with:
          name: ${{ needs.build-wheel.outputs.artifact-name }}-py310-x86_64
          path: ./wheel
      - name: Set image variables
        id: vars
        run: |
          OWNER_LOWER=$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')
          REGISTRY_IMAGE="ghcr.io/${OWNER_LOWER}/${{ needs.config.outputs.image_name }}"
          echo "owner_lower=${OWNER_LOWER}" >> $GITHUB_OUTPUT
          echo "registry_image=${REGISTRY_IMAGE}" >> $GITHUB_OUTPUT
      - name: Free up disk space
        run: |
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /usr/local/lib/android
          sudo rm -rf /opt/ghc
          sudo rm -rf /opt/hostedtoolcache/CodeQL
          docker system prune -af
          df -h
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Generate tags
        id: tags
        run: |
          TAGS="${{ steps.vars.outputs.registry_image }}:${{ needs.config.outputs.image_tag }}"
          TAGS="${TAGS},${{ steps.vars.outputs.registry_image }}:${{ github.sha }}"
          # Add datetime tag for nightly builds
          if [[ "${{ needs.config.outputs.is_pr }}" == "false" ]]; then
            DATETIME=$(date -u +%Y%m%d-%H%M%S)
            TAGS="${TAGS},${{ steps.vars.outputs.registry_image }}:nightly-${DATETIME}"
          fi
          echo "tags=${TAGS}" >> $GITHUB_OUTPUT
      - name: Build and push Docker image to GHCR
        # FIX(review): a step-level `if: steps.check-existing.outputs.skipped != 'true'`
        # was removed here — no step with id `check-existing` exists in this job
        # (the existence check runs in the `config` job), so the missing output
        # evaluated to "" and the guard was always true. The job-level
        # `if: needs.config.outputs.build == 'true'` already performs this gating.
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./modeling/transformers/Dockerfile
          target: wheel # Use wheel target for CI builds
          tags: ${{ steps.tags.outputs.tags }}
          push: true
          provenance: false
          outputs: type=image,push=true,compression=zstd,compression-level=3
          cache-from: |
            type=gha
            type=registry,ref=${{ steps.vars.outputs.registry_image }}:latest
            type=registry,ref=${{ steps.vars.outputs.registry_image }}:${{ needs.config.outputs.image_tag }}
            type=registry,ref=ghcr.io/${{ steps.vars.outputs.owner_lower }}/tilegym:latest
          cache-to: type=gha,mode=max

  # Quick dependency sanity check of the tested wheel in a clean virtualenv.
  sanity-check:
    name: sanity-check
    needs: [config, build-wheel]
    if: |
      always() &&
      needs.config.outputs.run_sanity == 'true' &&
      (needs.build-wheel.result == 'success' || needs.build-wheel.result == 'skipped')
    runs-on: ubuntu-latest
    steps:
      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"
      - name: Download wheel (Python 3.10, x86_64)
        uses: actions/download-artifact@v4
        with:
          name: ${{ needs.build-wheel.outputs.artifact-name }}-py310-x86_64
          path: ./wheel
      - name: pip check
        run: |
          python -m venv /tmp/sanity-env
          /tmp/sanity-env/bin/pip install --quiet ./wheel/*.whl
          /tmp/sanity-env/bin/pip check

  # Run the ops test suite on a GPU runner inside the freshly built image.
  test-ops:
    name: test-ops
    needs: [config, build]
    timeout-minutes: 40
    if: |
      always() &&
      needs.config.outputs.run_ops == 'true' &&
      (needs.build.result == 'success' || needs.build.result == 'skipped')
    runs-on: linux-amd64-gpu-rtxpro6000-latest-1
    steps:
      - name: Checkout code (sparse - need ops tests and shared utilities)
        uses: actions/checkout@v4
        with:
          sparse-checkout: |
            tests
          sparse-checkout-cone-mode: false
      - name: Create test results directory
        run: mkdir -p ${{ github.workspace }}/test-results
      - name: Login to GHCR
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Pull and run ops tests
        timeout-minutes: 35
        run: |
          OWNER_LOWER=$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')
          IMAGE="ghcr.io/${OWNER_LOWER}/${{ needs.config.outputs.image_name }}:${{ needs.config.outputs.image_tag }}"
          docker pull ${IMAGE}
          docker run --rm \
            --gpus all \
            -e DISABLE_AUTOTUNE=1 \
            -v ${{ github.workspace }}/tests:/workspace/tilegym/tests \
            -v ${{ github.workspace }}/test-results:/test-results \
            -w /workspace/tilegym \
            ${IMAGE} \
            bash -c "pip install --no-cache-dir pytest-xdist pytest-html && \
              pytest -s tests/ops tests/suites -v -k test_op \
              -n 12 \
              --junitxml=/test-results/ops-results.xml \
              --html=/test-results/ops-report.html \
              --self-contained-html"
      - name: Upload test results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ops-test-results
          path: test-results/ops-*
          retention-days: 30
      - name: Publish test results
        uses: EnricoMi/publish-unit-test-result-action@v2
        if: always()
        with:
          files: test-results/ops-results.xml
          check_name: Ops Test Results
          comment_mode: "off" # quoted: bare `off` is a YAML 1.1 boolean, not the string the action expects

  # Run benchmarks, compare against the stored baseline, and (nightly only)
  # update the baseline selectively.
  test-benchmark:
    name: test-benchmark
    needs: [config, build]
    timeout-minutes: 40
    if: |
      always() &&
      needs.config.outputs.run_benchmark == 'true' &&
      (needs.build.result == 'success' || needs.build.result == 'skipped')
    runs-on: linux-amd64-gpu-rtxpro6000-latest-1
    steps:
      - name: Checkout code (sparse - need scripts and benchmarks)
        uses: actions/checkout@v4
        with:
          sparse-checkout: |
            .github/scripts
            tests/benchmark
          sparse-checkout-cone-mode: false
      - name: Create test results directory
        run: mkdir -p ${{ github.workspace }}/test-results
      # Download previous baseline for regression detection
      # Uses GitHub CLI instead of dawidd6 action (more reliable, no third-party dependencies)
      # This runs for ALL builds (PRs and nightly) to catch regressions early
      - name: Download baseline benchmark results
        timeout-minutes: 5
        continue-on-error: true
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          echo "Attempting to download baseline benchmark results..."
          # Find the most recent successful workflow run on main with baseline artifact
          RUN_ID=$(gh api \
            -H "Accept: application/vnd.github+json" \
            -H "X-GitHub-Api-Version: 2022-11-28" \
            "/repos/${{ github.repository }}/actions/workflows/tilegym-ci.yml/runs?branch=main&status=success&per_page=10" \
            --jq '.workflow_runs[].id' | head -1)
          if [ -z "$RUN_ID" ]; then
            echo "⚠️ No successful workflow runs found on main branch"
            exit 0
          fi
          echo "Found workflow run: $RUN_ID"
          # Download baseline artifact from that run
          mkdir -p ${{ github.workspace }}/baseline-results
          if gh run download "$RUN_ID" \
            --name benchmark-baseline \
            --dir ${{ github.workspace }}/baseline-results 2>/dev/null; then
            echo "✅ Downloaded baseline benchmark results"
            ls -lh ${{ github.workspace }}/baseline-results
          else
            echo "⚠️ No baseline artifact found (this may be the first run)"
          fi
      - name: Login to GHCR
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Pull and run benchmarks
        timeout-minutes: 35
        run: |
          OWNER_LOWER=$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')
          IMAGE="ghcr.io/${OWNER_LOWER}/${{ needs.config.outputs.image_name }}:${{ needs.config.outputs.image_tag }}"
          docker pull ${IMAGE}
          docker run --rm \
            --gpus all \
            -v ${{ github.workspace }}/tests/benchmark:/workspace/tilegym/tests/benchmark \
            -v ${{ github.workspace }}/test-results:/test-results \
            -w /workspace/tilegym/tests/benchmark \
            ${IMAGE} \
            ./run_all.sh /test-results --json
      # Compare current results against baseline with three zones:
      # - Regression zone (< -5%): Build fails
      # - Neutral zone (-5% to +5%): Build passes, baseline NOT updated
      # - Improvement zone (> +5%): Build passes, baseline updated
      # Runs on ALL builds (PRs and nightly) to catch regressions early
      # But only nightly builds can update the baseline
      # Outputs: has_baseline, passed, should_update_baseline
      - name: Check for performance regressions
        id: regression_check
        continue-on-error: false
        run: |
          if [ -d "${{ github.workspace }}/baseline-results" ] && [ "$(ls -A ${{ github.workspace }}/baseline-results/*.json 2>/dev/null)" ]; then
            echo "Baseline results found, checking for regressions..."
            echo "has_baseline=true" >> $GITHUB_OUTPUT
            if python3 .github/scripts/check_benchmark_regression.py \
              --current test-results \
              --baseline baseline-results \
              --threshold 5.0 \
              --improvement-threshold 5.0 \
              --output test-results/regression_report.json \
              --fail-on-regression; then
              echo "✅ No regressions detected"
              echo "passed=true" >> $GITHUB_OUTPUT
              # Check if we should update baseline (only if significant improvements)
              # NOTE: Python's print(bool) emits "True"/"False" (capitalized),
              # hence the "True" comparisons below and in the summary step.
              SHOULD_UPDATE=$(python3 -c "import json; print(json.load(open('test-results/regression_report.json'))['summary']['should_update_baseline'])" 2>/dev/null || echo "false")
              echo "should_update_baseline=${SHOULD_UPDATE}" >> $GITHUB_OUTPUT
              if [ "$SHOULD_UPDATE" == "True" ]; then
                echo "🎉 Significant improvements detected - will update baseline (nightly only)"
              else
                echo "🟡 Performance within neutral zone - baseline will not be updated"
              fi
            else
              echo "❌ Performance regressions detected!"
              echo "passed=false" >> $GITHUB_OUTPUT
              echo "should_update_baseline=false" >> $GITHUB_OUTPUT
              # Temporary disable build failure due to inconsistent benchmark results
              # exit 1
            fi
          else
            echo "No baseline results found - this will become the first baseline (nightly only)"
            echo "has_baseline=false" >> $GITHUB_OUTPUT
            echo "passed=true" >> $GITHUB_OUTPUT
            echo "should_update_baseline=true" >> $GITHUB_OUTPUT
          fi
          # Note: PR builds check for regressions but cannot update the baseline
          if [ "${{ needs.config.outputs.is_pr }}" == "true" ]; then
            echo ""
            echo "ℹ️ This is a PR build - regression check performed but baseline will not be updated"
            echo "   Baseline updates only happen on nightly builds after merge to main"
          fi
      - name: Debug - List test results directory
        if: always()
        run: |
          echo "Contents of test-results directory:"
          ls -lah ${{ github.workspace }}/test-results/ || echo "Directory does not exist"
          echo ""
          echo "JSON files:"
          ls -lh ${{ github.workspace }}/test-results/*.json 2>/dev/null || echo "No JSON files found"
      - name: Format benchmark summary
        if: always()
        run: python3 .github/scripts/format_benchmark_summary.py test-results
      - name: Upload benchmark results
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-results
          path: test-results/*.json
          retention-days: 30
      # SELECTIVE BASELINE UPDATE STRATEGY:
      # Instead of all-or-nothing, we update per-benchmark:
      # - Benchmarks that improved/stayed neutral → update to new baseline
      # - Benchmarks that regressed → keep old baseline (forces fix)
      # - Build still FAILS if any regression exists
      # This preserves progress on non-regressing benchmarks while catching issues
      - name: Merge baseline selectively (nightly only)
        if: |
          needs.config.outputs.is_pr == 'false' &&
          steps.regression_check.outputs.has_baseline == 'true'
        run: |
          mkdir -p ${{ github.workspace }}/merged-baseline
          python3 .github/scripts/merge_baseline_selective.py \
            --old-baseline baseline-results \
            --new-results test-results \
            --regression-report test-results/regression_report.json \
            --output merged-baseline
      - name: Update baseline (nightly only - selective or full)
        if: needs.config.outputs.is_pr == 'false'
        uses: actions/upload-artifact@v4
        with:
          name: benchmark-baseline
          # Use merged baseline if it exists (partial update), otherwise use all new results (first run or all improved)
          path: ${{ steps.regression_check.outputs.has_baseline == 'true' && 'merged-baseline/*.json' || 'test-results/*.json' }}
          retention-days: 90
      - name: Log baseline decision
        if: always()
        run: |
          if [ "${{ needs.config.outputs.is_pr }}" == "true" ]; then
            echo "📊 PR Build - Regression check completed"
            if [ "${{ steps.regression_check.outputs.passed }}" == "true" ]; then
              echo "✅ Performance check passed"
            else
              echo "❌ Performance regressions detected - fix before merging"
            fi
            echo "ℹ️ Note: Baseline will be updated after merge (on nightly build)"
          else
            # Nightly build - check if we have regression report for details
            if [ -f "test-results/regression_report.json" ]; then
              TOTAL_FILES=$(python3 -c "import json; print(json.load(open('test-results/regression_report.json'))['summary']['total_benchmark_files'])" 2>/dev/null || echo "0")
              FILES_WITH_REGRESSIONS=$(python3 -c "import json; print(json.load(open('test-results/regression_report.json'))['summary']['files_with_regressions'])" 2>/dev/null || echo "0")
              FILES_SAFE_TO_UPDATE=$(python3 -c "import json; print(json.load(open('test-results/regression_report.json'))['summary']['files_safe_to_update'])" 2>/dev/null || echo "0")
              echo "📊 Baseline Update Summary:"
              echo "   Total benchmark files: $TOTAL_FILES"
              echo "   Files with regressions: $FILES_WITH_REGRESSIONS"
              echo "   Files updated: $FILES_SAFE_TO_UPDATE"
              echo ""
              if [ "$FILES_WITH_REGRESSIONS" -gt 0 ]; then
                echo "⚠️ SELECTIVE UPDATE: Some benchmarks regressed, keeping old baseline for those"
                echo "   ✅ Updated baseline for $FILES_SAFE_TO_UPDATE non-regressing benchmarks"
                echo "   ❌ Kept old baseline for $FILES_WITH_REGRESSIONS regressing benchmarks"
                echo "   🚨 Build FAILED - regressions must be fixed"
              elif [ "${{ steps.regression_check.outputs.has_baseline }}" == "false" ]; then
                echo "✅ FIRST RUN: Created initial baseline with all $TOTAL_FILES benchmarks"
              elif [ "${{ steps.regression_check.outputs.should_update_baseline }}" == "True" ]; then
                echo "✅ FULL UPDATE: All benchmarks improved, updated entire baseline"
              else
                echo "🟡 NO UPDATE: All benchmarks within neutral zone (±5%)"
              fi
            fi
          fi

  publish-wheel:
    name: publish-verified-wheel
    needs: [config, build-wheel, sanity-check, test-ops, test-benchmark]
    if: |
      always() &&
      needs.build-wheel.result == 'success' &&
      needs.sanity-check.result == 'success' &&
      needs.test-ops.result == 'success' &&
      needs.test-benchmark.result == 'success'
    # Note: Only marks the py310-x86_64 wheel as "verified" because that's the wheel
    # actually tested in Docker. Other wheels (py311, py312, arm64) are available but unverified.
    uses: ./.github/workflows/publish-wheel.yml
    with:
      artifact-name: ${{ needs.build-wheel.outputs.artifact-name }}
      python-versions: '["3.10"]' # Only verify the wheel actually tested in Docker
      architectures: '["x86_64"]' # Only verify the x86_64 wheel used in Docker tests
      retention-days: 30 # Verified wheel kept longer than initial builds (7 days)
      publish-to-pypi: false # Set to true and add PYPI_TOKEN secret when ready

  # After all tests pass on main/nightly, retag the SHA image as :latest and
  # :<sha>-verified without rebuilding (manifest-only retag via imagetools).
  promote-to-latest:
    name: promote-to-latest
    needs: [config, build, sanity-check, test-ops, test-benchmark]
    if: |
      always() &&
      needs.config.outputs.is_pr == 'false' &&
      needs.build.result == 'success' &&
      needs.sanity-check.result == 'success' &&
      needs.test-ops.result == 'success' &&
      needs.test-benchmark.result == 'success'
    runs-on: ubuntu-latest
    steps:
      - name: Login to GHCR
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Promote SHA to latest and mark as verified
        run: |
          OWNER_LOWER=$(echo '${{ github.repository_owner }}' | tr '[:upper:]' '[:lower:]')
          IMAGE="ghcr.io/${OWNER_LOWER}/${{ needs.config.outputs.image_name }}"
          SHA="${{ github.sha }}"
          echo "Promoting ${IMAGE}:${SHA} to latest and adding verified tags (tests passed)"
          docker buildx imagetools create \
            -t ${IMAGE}:latest \
            -t ${IMAGE}:${SHA}-verified \
            ${IMAGE}:${SHA}