NVIDIA · mchmarny · Apr 14, 2026 · Apr 12, 2026 · Apr 14, 2026
@@ -99,7 +99,7 @@ jobs:
           accelerator: h100
           intent: training
 
-      # --- Snapshot and validation ---
+      # --- Snapshot and GPU validation ---
 
       - name: Snapshot and validate GPU
         uses: ./.github/actions/gpu-snapshot-validate
@@ -108,32 +108,16 @@ jobs:
           min_gpu_count: '2'
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
 
-      # --- Install Karpenter before validation so cluster-autoscaling check passes ---
+      # --- Install Karpenter + KWOK early to give the monitoring stack time to settle ---
 
-      - name: Install Karpenter + KWOK (setup)
+      - name: Install Karpenter + KWOK
         uses: ./.github/actions/install-karpenter-kwok
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
 
-      # --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
-      # DRA and gang scheduling exercises are self-contained within the
-      # conformance checks — they create their own resources and clean up.
+      # --- Health checks ---
 
-      - name: Validate cluster
-        run: |
-          AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
-          ./aicr validate \
-            --recipe recipe.yaml \
-            --phase conformance \
-            --namespace gpu-operator \
-            --kubeconfig="${HOME}/.kube/config" \
-            --require-gpu \
-            --image=ko.local:smoke-test \
-            --timeout=10m \
-            --output=validation-result.yaml \
-            --evidence-dir=conformance-evidence
-
-      - name: Load versions
+      - name: Prepare chainsaw
         id: versions
         uses: ./.github/actions/load-versions
 
@@ -149,10 +133,7 @@ jobs:
             --test-dir tests/chainsaw/ai-conformance/kind-training \
             --config tests/chainsaw/chainsaw-config.yaml
 
-      # --- Evidence collection ---
-
-      - name: Collect AI conformance evidence
-        if: always() && steps.bundle-install.outcome == 'success'
+      - name: Verify expected resources exist
         run: |
           go run ./tests/chainsaw/ai-conformance/ \
             --dir tests/chainsaw/ai-conformance/kind-training \
@@ -164,7 +145,26 @@ jobs:
             --kubeconfig="${HOME}/.kube/config" \
             --debug
 
-      - name: Upload conformance evidence
+      # --- CNCF AI Conformance validation ---
+      # Runs after the health checks so the DCGM → Prometheus → adapter pipeline
+      # has had time to bootstrap (the pod-autoscaling check needs live metric data).
+
+      - name: Validate CNCF AI Conformance
+        if: always() && steps.bundle-install.outcome == 'success'
+        run: |
+          AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
+          ./aicr validate \
+            --recipe recipe.yaml \
+            --phase conformance \
+            --namespace gpu-operator \
+            --kubeconfig="${HOME}/.kube/config" \
+            --require-gpu \
+            --image=ko.local:smoke-test \
+            --timeout=10m \
+            --output=validation-result.yaml \
+            --evidence-dir=conformance-evidence
+
+      - name: Collect and upload validation artifacts
         if: always()
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a  # v7.0.1
         with:

@@ -98,7 +98,7 @@ jobs:
           accelerator: h100
           platform: dynamo
 
-      # --- Snapshot and validation ---
+      # --- Snapshot and GPU validation ---
 
       - name: Snapshot and validate GPU
         uses: ./.github/actions/gpu-snapshot-validate
@@ -107,33 +107,16 @@ jobs:
           min_gpu_count: '1'
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
 
-      # --- Install Karpenter before validation so cluster-autoscaling check passes ---
+      # --- Install Karpenter + KWOK early to give the monitoring stack time to settle ---
 
-      - name: Install Karpenter + KWOK (setup)
+      - name: Install Karpenter + KWOK
         uses: ./.github/actions/install-karpenter-kwok
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
 
-      # --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
-      # Includes self-contained secure-accelerator-access check (creates its own
-      # DRA test resources, validates, and cleans up automatically).
+      # --- Health checks ---
 
-      - name: Validate cluster
-        run: |
-          AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
-          ./aicr validate \
-            --recipe recipe.yaml \
-            --phase deployment \
-            --phase conformance \
-            --namespace gpu-operator \
-            --kubeconfig="${HOME}/.kube/config" \
-            --require-gpu \
-            --image=ko.local:smoke-test \
-            --timeout=10m \
-            --output=validation-result.yaml \
-            --evidence-dir=conformance-evidence
-
-      - name: Load versions
+      - name: Prepare chainsaw
         id: versions
         uses: ./.github/actions/load-versions
 
@@ -149,7 +132,44 @@ jobs:
             --test-dir tests/chainsaw/ai-conformance/kind \
             --config tests/chainsaw/chainsaw-config.yaml
 
+      - name: Verify expected resources exist
+        run: |
+          go run ./tests/chainsaw/ai-conformance/ \
+            --dir tests/chainsaw/ai-conformance/kind \
+            --file tests/chainsaw/ai-conformance/cluster/assert-crds.yaml \
+            --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \
+            --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \
+            --file tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml \
+            --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \
+            --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \
+            --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \
+            --file tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml \
+            --kubeconfig="${HOME}/.kube/config" \
+            --debug
+
+      # --- CNCF AI Conformance validation ---
+      # Runs before Dynamo: the dra-support check must allocate a GPU via a
+      # ResourceClaim, which conflicts with Dynamo's vllm-smoke-gpu-claim if
+      # Dynamo is deployed first (the H100 x1 configuration has only one GPU).
+
+      - name: Validate CNCF AI Conformance
+        run: |
+          AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
+          ./aicr validate \
+            --recipe recipe.yaml \
+            --phase deployment \
+            --phase conformance \
+            --namespace gpu-operator \
+            --kubeconfig="${HOME}/.kube/config" \
+            --require-gpu \
+            --image=ko.local:smoke-test \
+            --timeout=10m \
+            --output=validation-result.yaml \
+            --evidence-dir=conformance-evidence
+
       # --- Dynamo vLLM inference smoke test ---
+      # Runs after conformance: Dynamo's DRA ResourceClaim consumes the GPU,
+      # which would block dra-support validation if deployed first.
 
       - name: Deploy Dynamo vLLM smoke test
         run: |

@@ -94,7 +94,7 @@ jobs:
           accelerator: h100
           intent: training
 
-      # --- Snapshot and validation ---
+      # --- Snapshot and GPU validation ---
 
       - name: Snapshot and validate GPU
         uses: ./.github/actions/gpu-snapshot-validate
@@ -103,16 +103,16 @@ jobs:
           min_gpu_count: '2'
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
 
-      # --- Install Karpenter before validation so cluster-autoscaling check passes ---
+      # --- Install Karpenter + KWOK early to give the monitoring stack time to settle ---
 
-      - name: Install Karpenter + KWOK (setup)
+      - name: Install Karpenter + KWOK
         uses: ./.github/actions/install-karpenter-kwok
         with:
           cluster_name: ${{ env.KIND_CLUSTER_NAME }}
 
-      # --- Health checks (run before conformance to give metrics pipeline time) ---
+      # --- Health checks ---
 
-      - name: Load versions
+      - name: Prepare chainsaw
         id: versions
         uses: ./.github/actions/load-versions
 
@@ -128,13 +128,23 @@ jobs:
             --test-dir tests/chainsaw/ai-conformance/kind-training \
             --config tests/chainsaw/chainsaw-config.yaml
 
-      # --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
-      # Runs after chainsaw to ensure the DCGM → Prometheus → adapter pipeline
+      - name: Verify expected resources exist
+        run: |
+          go run ./tests/chainsaw/ai-conformance/ \
+            --dir tests/chainsaw/ai-conformance/kind-training \
+            --file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \
+            --file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \
+            --file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \
+            --file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \
+            --file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \
+            --kubeconfig="${HOME}/.kube/config" \
+            --debug
+
+      # --- CNCF AI Conformance validation ---
+      # Runs last to ensure the DCGM → Prometheus → adapter pipeline
       # has had time to bootstrap (pod-autoscaling check needs live metric data).
-      # Gang scheduling (PodGroup + 2 GPU pods) is exercised by the self-contained
-      # gang-scheduling conformance check — no separate deploy step needed.
 
-      - name: Validate cluster
+      - name: Validate CNCF AI Conformance
         run: |
           AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
           ./aicr validate \