Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 26 additions & 26 deletions .github/workflows/gpu-h100-conformance-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ jobs:
accelerator: h100
intent: training

# --- Snapshot and validation ---
# --- Snapshot and GPU validation ---

- name: Snapshot and validate GPU
uses: ./.github/actions/gpu-snapshot-validate
Expand All @@ -108,32 +108,16 @@ jobs:
min_gpu_count: '2'
cluster_name: ${{ env.KIND_CLUSTER_NAME }}

# --- Install Karpenter before validation so cluster-autoscaling check passes ---
# --- Install Karpenter + KWOK early to give the monitoring stack time to settle ---

- name: Install Karpenter + KWOK (setup)
- name: Install Karpenter + KWOK
uses: ./.github/actions/install-karpenter-kwok
with:
cluster_name: ${{ env.KIND_CLUSTER_NAME }}

# --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
# DRA and gang scheduling exercises are self-contained within the
# conformance checks — they create their own resources and clean up.
# --- Health checks ---

- name: Validate cluster
run: |
AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
./aicr validate \
--recipe recipe.yaml \
--phase conformance \
--namespace gpu-operator \
--kubeconfig="${HOME}/.kube/config" \
--require-gpu \
--image=ko.local:smoke-test \
--timeout=10m \
--output=validation-result.yaml \
--evidence-dir=conformance-evidence

- name: Load versions
- name: Prepare chainsaw
id: versions
uses: ./.github/actions/load-versions

Expand All @@ -149,10 +133,7 @@ jobs:
--test-dir tests/chainsaw/ai-conformance/kind-training \
--config tests/chainsaw/chainsaw-config.yaml

# --- Evidence collection ---

- name: Collect AI conformance evidence
if: always() && steps.bundle-install.outcome == 'success'
- name: Verify expected resources exist
run: |
go run ./tests/chainsaw/ai-conformance/ \
--dir tests/chainsaw/ai-conformance/kind-training \
Expand All @@ -164,7 +145,26 @@ jobs:
--kubeconfig="${HOME}/.kube/config" \
--debug

- name: Upload conformance evidence
# --- CNCF AI Conformance validation ---
# Runs last to ensure the DCGM → Prometheus → adapter pipeline
# has had time to bootstrap (pod-autoscaling check needs live metric data).

- name: Validate CNCF AI Conformance
if: always() && steps.bundle-install.outcome == 'success'
run: |
AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
./aicr validate \
--recipe recipe.yaml \
--phase conformance \
--namespace gpu-operator \
--kubeconfig="${HOME}/.kube/config" \
--require-gpu \
--image=ko.local:smoke-test \
--timeout=10m \
--output=validation-result.yaml \
--evidence-dir=conformance-evidence

- name: Collect and upload validation artifacts
if: always()
uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
with:
Expand Down
64 changes: 42 additions & 22 deletions .github/workflows/gpu-h100-inference-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ jobs:
accelerator: h100
platform: dynamo

# --- Snapshot and validation ---
# --- Snapshot and GPU validation ---

- name: Snapshot and validate GPU
uses: ./.github/actions/gpu-snapshot-validate
Expand All @@ -107,33 +107,16 @@ jobs:
min_gpu_count: '1'
cluster_name: ${{ env.KIND_CLUSTER_NAME }}

# --- Install Karpenter before validation so cluster-autoscaling check passes ---
# --- Install Karpenter + KWOK early to give the monitoring stack time to settle ---

- name: Install Karpenter + KWOK (setup)
- name: Install Karpenter + KWOK
uses: ./.github/actions/install-karpenter-kwok
with:
cluster_name: ${{ env.KIND_CLUSTER_NAME }}

# --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
# Includes self-contained secure-accelerator-access check (creates its own
# DRA test resources, validates, and cleans up automatically).
# --- Health checks ---

- name: Validate cluster
run: |
AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
./aicr validate \
--recipe recipe.yaml \
--phase deployment \
--phase conformance \
--namespace gpu-operator \
--kubeconfig="${HOME}/.kube/config" \
--require-gpu \
--image=ko.local:smoke-test \
--timeout=10m \
--output=validation-result.yaml \
--evidence-dir=conformance-evidence

- name: Load versions
- name: Prepare chainsaw
id: versions
uses: ./.github/actions/load-versions

Expand All @@ -149,7 +132,44 @@ jobs:
--test-dir tests/chainsaw/ai-conformance/kind \
--config tests/chainsaw/chainsaw-config.yaml

- name: Verify expected resources exist
run: |
go run ./tests/chainsaw/ai-conformance/ \
--dir tests/chainsaw/ai-conformance/kind \
--file tests/chainsaw/ai-conformance/cluster/assert-crds.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-kgateway.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-dynamo.yaml \
--kubeconfig="${HOME}/.kube/config" \
--debug

# --- CNCF AI Conformance validation ---
# Runs before the Dynamo vLLM smoke test: the dra-support check needs to
# allocate a GPU via a ResourceClaim, which conflicts with Dynamo's
# vllm-smoke-gpu-claim if the smoke test is deployed first (only 1 GPU on H100 x1).

- name: Validate CNCF AI Conformance
run: |
AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
./aicr validate \
--recipe recipe.yaml \
--phase deployment \
--phase conformance \
--namespace gpu-operator \
--kubeconfig="${HOME}/.kube/config" \
--require-gpu \
--image=ko.local:smoke-test \
--timeout=10m \
--output=validation-result.yaml \
--evidence-dir=conformance-evidence

# --- Dynamo vLLM inference smoke test ---
# Runs after conformance: Dynamo's DRA ResourceClaim consumes the GPU,
# which would block dra-support validation if deployed first.

- name: Deploy Dynamo vLLM smoke test
run: |
Expand Down
30 changes: 20 additions & 10 deletions .github/workflows/gpu-h100-training-test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ jobs:
accelerator: h100
intent: training

# --- Snapshot and validation ---
# --- Snapshot and GPU validation ---

- name: Snapshot and validate GPU
uses: ./.github/actions/gpu-snapshot-validate
Expand All @@ -103,16 +103,16 @@ jobs:
min_gpu_count: '2'
cluster_name: ${{ env.KIND_CLUSTER_NAME }}

# --- Install Karpenter before validation so cluster-autoscaling check passes ---
# --- Install Karpenter + KWOK early to give the monitoring stack time to settle ---

- name: Install Karpenter + KWOK (setup)
- name: Install Karpenter + KWOK
uses: ./.github/actions/install-karpenter-kwok
with:
cluster_name: ${{ env.KIND_CLUSTER_NAME }}

# --- Health checks (run before conformance to give metrics pipeline time) ---
# --- Health checks ---

- name: Load versions
- name: Prepare chainsaw
id: versions
uses: ./.github/actions/load-versions

Expand All @@ -128,13 +128,23 @@ jobs:
--test-dir tests/chainsaw/ai-conformance/kind-training \
--config tests/chainsaw/chainsaw-config.yaml

# --- Validate cluster (Go conformance checks run inside K8s Jobs) ---
# Runs after chainsaw to ensure the DCGM → Prometheus → adapter pipeline
- name: Verify expected resources exist
run: |
go run ./tests/chainsaw/ai-conformance/ \
--dir tests/chainsaw/ai-conformance/kind-training \
--file tests/chainsaw/ai-conformance/cluster/assert-cert-manager.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-monitoring.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-skyhook.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-dra-driver.yaml \
--file tests/chainsaw/ai-conformance/cluster/assert-kai-scheduler.yaml \
--kubeconfig="${HOME}/.kube/config" \
--debug

# --- CNCF AI Conformance validation ---
# Runs last to ensure the DCGM → Prometheus → adapter pipeline
# has had time to bootstrap (pod-autoscaling check needs live metric data).
# Gang scheduling (PodGroup + 2 GPU pods) is exercised by the self-contained
# gang-scheduling conformance check — no separate deploy step needed.

- name: Validate cluster
- name: Validate CNCF AI Conformance
run: |
AICR_VALIDATOR_IMAGE_REGISTRY=ko.local \
./aicr validate \
Expand Down
Loading