ci(model-streamer): restore stage 2 (full end-to-end with Docker)

tanushriya910 · tanushriya910 · commit 284b8e5cad4e · 2026-05-19T00:10:21.000-07:00
Signed-off-by: Tanushriya Singh <tanushriyas@nvidia.com>
diff --git a/.github/workflows/modelexpress-model-streamer-test.yml b/.github/workflows/modelexpress-model-streamer-test.yml
@@ -3,55 +3,48 @@
 
 # Standalone model-streamer CI lane.
 #
-# Status — STAGE 1 (S3-upload only): the Docker daemon is not yet
-# provisioned on the GPU runner (prod-modelexpress-tester-amd-gpu-v1), so
-# `docker build` / `docker run` would fail. While ops works on enabling
-# DinD / socket-mount + GPU passthrough, this workflow runs only the parts
-# that don't need Docker:
+# Status — STAGE 2 (full end-to-end): Docker is now provisioned on the GPU
+# runner (DinD) and NGC_API_KEY is available, so we can build the worker
+# image locally on the runner and exercise the full streamer-load + inference
+# flow without round-tripping through NGC. End-to-end this workflow does:
 #   1. Checkout
-#   2. Install boto3 + huggingface-cli
-#   3. Download safetensors from HuggingFace (if not already cached in S3)
-#      and upload them to s3://${MX_CI_S3_BUCKET}/models/${MX_CI_MODEL}/
+#   2. Stage safetensors in S3 (idempotent — skipped if already cached)
+#   3. Build the vLLM worker image locally on the runner (no push)
+#   4. Run a single vLLM container with --load-format mx and
+#      RUNAI_STREAMER_CONCURRENCY set; weights stream from S3 via IRSA
+#   5. Wait for "Model streamer weight loading complete" in the container
+#      logs
+#   6. Wait for /health on the OpenAI server
+#   7. Send a /v1/completions request; assert non-empty completion text
+#   8. Always-runs cleanup: stop+rm the container, remove the local image
+#      (cancel-in-progress concurrency means GHA could SIGTERM us mid-run;
+#      cleanup steps are marked `if: always()` so the runner's Docker daemon
+#      doesn't accumulate stale containers/images across runs)
 #
-# What this validates today:
-#   - Runner picks up GHA jobs end-to-end
-#   - Self-hosted runner has internet egress for `pip install` + HF download
-#   - IRSA on the runner has the right S3 permissions (list / put / head)
+# The steps are inlined from .github/actions/run-mx-streamer-test/action.yml
+# so we can iterate on them in isolation. Once stable here, port any
+# improvements back into that composite action.
 #
-# What it does NOT validate yet (deferred to STAGE 2 once Docker is up):
-#   - vLLM image build
-#   - runai-model-streamer reading from S3 inside the container
-#   - --load-format mx + ModelExpress plugin loading
-#   - OpenAI inference endpoint comes up
-#
-# When ops enables Docker on the runner, re-add the steps from the full
-# composite action at .github/actions/run-mx-streamer-test/action.yml
-# (build, docker run, wait, verify inference, cleanup) — they're already
-# written and tested in the main workflow's `model-streamer-vllm` job.
+# Local-build / no-NGC-push pattern: doing both build and run on the GPU
+# runner means the image lives in the runner's Docker daemon only — never
+# pushed to or pulled from any registry. Saves bandwidth + avoids exercising
+# the NGC pull path here (the main workflow's `model-streamer-vllm` job does
+# the registry round-trip; this one focuses on the runner-local mechanics).
 #
 # Required secrets:
-#   HF_TOKEN  — HuggingFace token; ignored when empty (Qwen2.5-0.5B is
-#               public). Required only if the model is gated.
+#   NGC_API_KEY  — used to `docker login nvcr.io` so the build can pull any
+#                  nvcr.io-hosted layers. The base image itself,
+#                  `vllm/vllm-openai:v0.17.1`, is on public Docker Hub and
+#                  needs no login. Reuses the main workflow's secret.
+#   HF_TOKEN     — HuggingFace token; ignored when empty (Qwen2.5-0.5B is
+#                  public). Required only if the model is gated.
 #
 # Required IRSA on the GPU runner:
 #   IAM role with list / put / head on s3://${MX_CI_S3_BUCKET}/models/.
 
 name: ModelExpress Model Streamer Test
 
 on:
-  # Triggered indirectly via copy-pr-bot. PR-driven flow:
-  #   1. Open a PR (the PR itself does NOT fire this workflow — `pull_request`
-  #      events are forbidden on Velonix self-hosted runners because PR-head
-  #      code is untrusted).
-  #   2. Either:
-  #        a. All commits on the PR are GPG-signed → copy-pr-bot trusts the
-  #           author and auto-creates `pull-request/<N>` with the PR content.
-  #        b. Commits are unsigned or come from an external contributor →
-  #           a maintainer comments `/ok to test <commit_sha>` on the PR.
-  #   3. The bot pushes the PR content to `pull-request/<N>` in this repo.
-  #      That push (from the trusted bot identity, into an internal branch)
-  #      fires this workflow.
-  # See the dynamo reference: ai-dynamo/dynamo/.github/workflows/pr.yaml.
   push:
     branches:
       - "pull-request/[0-9]+"
@@ -67,14 +60,24 @@ env:
   MX_CI_MODEL: Qwen/Qwen2.5-0.5B
   MX_CI_S3_BUCKET: ai-dynamo-modelexpress-ci
   MX_CI_S3_REGION: us-east-1
+  # Docker / runtime knobs (previously composite-action inputs). Tweak here
+  # without touching the composite action.
+  MX_CI_VLLM_PORT: "18888"
+  MX_CI_STREAMER_CONCURRENCY: "16"
+  MX_CI_LOAD_TIMEOUT_SECONDS: "300"
 
 jobs:
-  s3-upload:
-    name: S3 upload (Stage 1, no Docker)
+  model-streamer:
+    name: Model Streamer test (vLLM, S3)
     runs-on: prod-modelexpress-tester-amd-gpu-v1
     permissions:
       contents: read
 
+    env:
+      # Local image tag, lives only in the GPU runner's Docker daemon —
+      # never pushed to or pulled from any registry.
+      WORKER_IMAGE: mx-worker-vllm:local
+
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -162,3 +165,127 @@ jobs:
                           )
                   print(f"Upload verified: {len(uploaded)} file(s).")
           EOF
+
+      - name: Log in to NGC for base image pulls
+        # Authenticates the runner's Docker client against nvcr.io before the
+        # build step. The key is fed through --password-stdin rather than a
+        # command-line argument, and '$oauthtoken' is single-quoted so the
+        # shell passes it through literally instead of expanding a variable.
+        env:
+          NGC_API_KEY: ${{ secrets.NGC_API_KEY }}
+        run: |
+          echo "${NGC_API_KEY}" | docker login nvcr.io -u '$oauthtoken' --password-stdin
+
+      - name: Build vLLM worker image (local, no registry)
+        # The image is built on the GPU runner itself and never leaves its
+        # local Docker daemon; the `docker run` step below picks it up through
+        # the WORKER_IMAGE env. The repo root is the build context because the
+        # Dockerfile COPYs from modelexpress_client/python/.
+        run: |
+          docker build \
+            --tag "${WORKER_IMAGE}" \
+            --file ci/k8s/client/vllm/Dockerfile \
+            .
+
+      - name: Start vLLM container with model streamer
+        # Launches the locally built worker image detached (`docker run -d`)
+        # with the S3 model URI and streamer concurrency passed as container
+        # env, then appends the container ID to GITHUB_ENV so the later wait
+        # and cleanup steps can refer to it as ${CONTAINER_ID}.
+        env:
+          # WORKER_IMAGE from job-level env (local tag built above).
+          MODEL: ${{ env.MX_CI_MODEL }}
+          S3_BUCKET: ${{ env.MX_CI_S3_BUCKET }}
+          S3_REGION: ${{ env.MX_CI_S3_REGION }}
+          PORT: ${{ env.MX_CI_VLLM_PORT }}
+          STREAMER_CONCURRENCY: ${{ env.MX_CI_STREAMER_CONCURRENCY }}
+        run: |
+          set -euo pipefail
+          MODEL_S3_URI="s3://${S3_BUCKET}/models/${MODEL}"
+
+          # IRSA credentials don't auto-propagate into child containers. The EKS
+          # runner pod gets AWS_ROLE_ARN + AWS_WEB_IDENTITY_TOKEN_FILE wired in,
+          # but a `docker run` child sees neither unless we forward them: pass
+          # the role ARN as -e and mount the OIDC token file in as a volume.
+          # boto3 inside the container then does its own sts:AssumeRoleWithWebIdentity.
+          IRSA_TOKEN="${AWS_WEB_IDENTITY_TOKEN_FILE:-/var/run/secrets/eks.amazonaws.com/serviceaccount/token}"
+
+          CONTAINER_ID=$(docker run -d --gpus all --ipc=host \
+            -e AWS_ROLE_ARN="${AWS_ROLE_ARN}" \
+            -e AWS_WEB_IDENTITY_TOKEN_FILE="/var/run/secrets/eks-token" \
+            -v "${IRSA_TOKEN}:/var/run/secrets/eks-token:ro" \
+            -e AWS_DEFAULT_REGION="${S3_REGION}" \
+            -e MX_MODEL_URI="${MODEL_S3_URI}" \
+            -e RUNAI_STREAMER_CONCURRENCY="${STREAMER_CONCURRENCY}" \
+            -e VLLM_PLUGINS=modelexpress \
+            -p "${PORT}:${PORT}" \
+            "${WORKER_IMAGE}" \
+            python3 -m vllm.entrypoints.openai.api_server \
+              --model "${MODEL}" \
+              --load-format mx \
+              --port "${PORT}")
+          echo "Container: ${CONTAINER_ID}"
+          echo "CONTAINER_ID=${CONTAINER_ID}" >> "$GITHUB_ENV"
+
+      - name: Wait for model streamer to complete
+        # Polls the container logs every 10s until the streamer's completion
+        # line appears, failing early if the container stops or once
+        # LOAD_TIMEOUT seconds have elapsed. The logs are captured into a
+        # variable before matching: under `set -o pipefail`, piping
+        # `docker logs` into `grep -q` can report failure on a successful
+        # match, because grep exits at the first hit and the still-writing
+        # `docker logs` then dies on a closed pipe.
+        env:
+          LOAD_TIMEOUT: ${{ env.MX_CI_LOAD_TIMEOUT_SECONDS }}
+        run: |
+          set -euo pipefail
+          deadline=$((SECONDS + LOAD_TIMEOUT))
+          while [ $SECONDS -lt $deadline ]; do
+            logs="$(docker logs "${CONTAINER_ID}" 2>&1 || true)"
+            if [[ "${logs}" == *"Model streamer weight loading complete"* ]]; then
+              echo "Model streamer loading confirmed."
+              exit 0
+            fi
+            if [ "$(docker inspect "${CONTAINER_ID}" --format '{{.State.Running}}')" != "true" ]; then
+              echo "ERROR: container exited before model streamer completed."
+              docker logs "${CONTAINER_ID}" 2>&1 | tail -80
+              exit 1
+            fi
+            echo "Still loading... (${SECONDS}s elapsed)"
+            sleep 10
+          done
+
+          echo "ERROR: model streamer did not complete within ${LOAD_TIMEOUT}s."
+          docker logs "${CONTAINER_ID}" 2>&1 | tail -80
+          exit 1
+
+      - name: Wait for OpenAI server to be ready
+        # Polls /health every 2s until curl succeeds, bounded by `timeout 60`.
+        # The inner script is double-quoted, so ${PORT} is expanded by the
+        # outer shell before `bash -c` runs it. If the 60s budget expires,
+        # `timeout` exits non-zero and `set -e` fails the step.
+        env:
+          PORT: ${{ env.MX_CI_VLLM_PORT }}
+        run: |
+          set -euo pipefail
+          timeout 60 bash -c \
+            "until curl -sf http://localhost:${PORT}/health > /dev/null; do sleep 2; done"
+          echo "Server ready on port ${PORT}."
+
+      - name: Verify inference
+        # Sends one /v1/completions request (60s cap via --max-time) and pipes
+        # the JSON body into an inline Python check that asserts the first
+        # choice carries non-empty `text`. The Python program sits inside a
+        # double-quoted shell string, so any edit to it must avoid unescaped
+        # double quotes.
+        env:
+          MODEL: ${{ env.MX_CI_MODEL }}
+          PORT: ${{ env.MX_CI_VLLM_PORT }}
+        run: |
+          set -euo pipefail
+          RESPONSE=$(curl -sS --max-time 60 "http://localhost:${PORT}/v1/completions" \
+            -H "Content-Type: application/json" \
+            -d "{\"model\": \"${MODEL}\", \"prompt\": \"The capital of France is\", \"max_tokens\": 8}")
+          echo "Response: ${RESPONSE}"
+          echo "${RESPONSE}" | python3 -c "
+          import json, sys
+          body = json.load(sys.stdin)
+          choices = body.get('choices', [])
+          assert choices and choices[0].get('text'), f'No completion text in response: {body}'
+          print('Inference OK:', repr(choices[0]['text'][:60]))
+          "
+
+      - name: Cleanup container
+        # Runs regardless of how earlier steps ended. When CONTAINER_ID was
+        # never exported there is nothing to clean up, so the script exits
+        # successfully right away; otherwise it prints the last 200 log lines
+        # and removes the container on a best-effort basis.
+        if: always()
+        run: |
+          [ -n "${CONTAINER_ID:-}" ] || exit 0
+          echo "::group::Container logs (tail 200)"
+          docker logs "${CONTAINER_ID}" 2>&1 | tail -200 || true
+          echo "::endgroup::"
+          docker stop "${CONTAINER_ID}" 2>/dev/null || true
+          docker rm --force "${CONTAINER_ID}" 2>/dev/null || true
+
+      - name: Cleanup local image
+        # Must stay `if: always()`: cancel-in-progress concurrency can cancel
+        # the job mid-run, and without this step every run would leave its
+        # ~15GB vLLM worker image behind in the GPU runner's Docker daemon
+        # until the disk fills up.
+        if: always()
+        run: |
+          docker rmi --force "${WORKER_IMAGE}" 2>/dev/null || true