Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 82 additions & 1 deletion catalog/catalog.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# LLMKube Model Catalog
# Version: v1
# Last Updated: 2025-11-23
# Last Updated: 2025-12-01
#
# This catalog contains pre-configured, battle-tested LLM models optimized
# for various use cases. Each model includes verified GGUF sources and
Expand Down Expand Up @@ -188,6 +188,87 @@ models:
# Large Models (13B+) - Premium Quality
# ============================================================================

# Qwen 2.5 32B Instruct — general-purpose large model targeting ~24GB-VRAM GPUs.
qwen-2.5-32b:
  name: "Qwen 2.5 32B Instruct"
  description: "Powerful 32B model with excellent reasoning and multilingual capabilities. Perfect for 32GB VRAM setups."
  size: "32B"
  quantization: "Q4_K_M"
  # Community GGUF build (bartowski); /resolve/main/ yields a direct-download URL.
  source: "https://huggingface.co/bartowski/Qwen2.5-32B-Instruct-GGUF/resolve/main/Qwen2.5-32B-Instruct-Q4_K_M.gguf"
  # KV-cache size in tokens. NOTE(review): the "long-context" tag below sits next
  # to an 8192 default — confirm whether a larger context was intended.
  context_size: 8192
  gpu_layers: 64  # number of layers offloaded to GPU
  use_cases:
    - "complex-reasoning"
    - "multilingual"
    - "production"
    - "high-quality-chat"
  resources:
    cpu: "8"
    memory: "24Gi"
    gpu_memory: "24Gi"
    vram_estimate: "18-24GB"
  tags:
    - "qwen"
    - "large"
    - "recommended"
    - "32gb-vram"
    - "long-context"
  homepage: "https://huggingface.co/Qwen/Qwen2.5-32B-Instruct"

# Qwen 2.5 Coder 32B Instruct — code-specialized variant; same resource
# footprint as qwen-2.5-32b.
qwen-2.5-coder-32b:
  name: "Qwen 2.5 Coder 32B Instruct"
  description: "State-of-the-art coding model matching GPT-4o. Best open-source code LLM available."
  size: "32B"
  quantization: "Q4_K_M"
  # Community GGUF build (bartowski); /resolve/main/ yields a direct-download URL.
  source: "https://huggingface.co/bartowski/Qwen2.5-Coder-32B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-32B-Instruct-Q4_K_M.gguf"
  # KV-cache size in tokens. NOTE(review): tagged "long-context" below but set
  # to 8192 — confirm intent.
  context_size: 8192
  gpu_layers: 64  # number of layers offloaded to GPU
  use_cases:
    - "code-generation"
    - "debugging"
    - "code-review"
    - "technical-docs"
  resources:
    cpu: "8"
    memory: "24Gi"
    gpu_memory: "24Gi"
    vram_estimate: "18-24GB"
  tags:
    - "qwen"
    - "code"
    - "large"
    - "recommended"
    - "32gb-vram"
    - "long-context"
    - "gpt4-level"
  homepage: "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct"

# Qwen 3 32B — newest generation; sourced from the official Qwen GGUF repo
# (not a community build, unlike the 2.5 entries above).
qwen-3-32b:
  name: "Qwen 3 32B"
  description: "Latest Qwen 3 series with hybrid thinking modes. Newest and most capable 32B model."
  size: "32B"
  quantization: "Q4_K_M"
  source: "https://huggingface.co/Qwen/Qwen3-32B-GGUF/resolve/main/Qwen3-32B-Q4_K_M.gguf"
  # KV-cache size in tokens. NOTE(review): tagged "long-context" below but set
  # to 8192 — confirm intent.
  context_size: 8192
  gpu_layers: 64  # number of layers offloaded to GPU
  use_cases:
    - "complex-reasoning"
    - "thinking-mode"
    - "production"
    - "high-quality-chat"
  resources:
    cpu: "8"
    memory: "24Gi"
    gpu_memory: "24Gi"
    vram_estimate: "18-24GB"
  tags:
    - "qwen"
    - "large"
    - "newest"
    - "32gb-vram"
    - "long-context"
    - "thinking"
  homepage: "https://huggingface.co/Qwen/Qwen3-32B"

qwen-2.5-14b:
name: "Qwen 2.5 14B Instruct"
description: "Multilingual powerhouse with strong creative writing capabilities."
Expand Down
10 changes: 10 additions & 0 deletions pkg/cli/benchmark.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ type benchmarkOptions struct {
accelerator string
cleanup bool
deployWait time.Duration
contextSize int32
}

type BenchmarkResult struct {
Expand Down Expand Up @@ -283,6 +284,8 @@ Examples:
cmd.Flags().BoolVar(&opts.cleanup, "cleanup", true,
"Cleanup deployments after benchmarking (use --no-cleanup to keep)")
cmd.Flags().DurationVar(&opts.deployWait, "deploy-wait", 10*time.Minute, "Timeout waiting for deployment to be ready")
cmd.Flags().Int32Var(&opts.contextSize, "context", 0,
"Context size (KV cache) for model deployment (0 = use catalog default)")

return cmd
}
Expand Down Expand Up @@ -1099,6 +1102,13 @@ func deployModel(
inferenceService.Spec.Resources.GPUMemory = catalogModel.Resources.GPUMemory
}

if opts.contextSize > 0 {
inferenceService.Spec.ContextSize = &opts.contextSize
} else if catalogModel.ContextSize > 0 {
contextSize := int32(catalogModel.ContextSize)
inferenceService.Spec.ContextSize = &contextSize
}

if err := k8sClient.Create(ctx, inferenceService); err != nil {
// Cleanup model if inference service creation fails
_ = k8sClient.Delete(ctx, model)
Expand Down
83 changes: 82 additions & 1 deletion pkg/cli/catalog.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# LLMKube Model Catalog
# Version: v1
# Last Updated: 2025-11-23
# Last Updated: 2025-12-01
#
# This catalog contains pre-configured, battle-tested LLM models optimized
# for various use cases. Each model includes verified GGUF sources and
Expand Down Expand Up @@ -188,6 +188,87 @@ models:
# Large Models (13B+) - Premium Quality
# ============================================================================

# Qwen 2.5 32B Instruct — general-purpose large model targeting ~24GB-VRAM GPUs.
# NOTE(review): this file mirrors catalog/catalog.yaml — keep the two in sync
# (or better, generate one from the other).
qwen-2.5-32b:
  name: "Qwen 2.5 32B Instruct"
  description: "Powerful 32B model with excellent reasoning and multilingual capabilities. Perfect for 32GB VRAM setups."
  size: "32B"
  quantization: "Q4_K_M"
  # Community GGUF build (bartowski); /resolve/main/ yields a direct-download URL.
  source: "https://huggingface.co/bartowski/Qwen2.5-32B-Instruct-GGUF/resolve/main/Qwen2.5-32B-Instruct-Q4_K_M.gguf"
  # KV-cache size in tokens. NOTE(review): tagged "long-context" below but set
  # to 8192 — confirm intent.
  context_size: 8192
  gpu_layers: 64  # number of layers offloaded to GPU
  use_cases:
    - "complex-reasoning"
    - "multilingual"
    - "production"
    - "high-quality-chat"
  resources:
    cpu: "8"
    memory: "24Gi"
    gpu_memory: "24Gi"
    vram_estimate: "18-24GB"
  tags:
    - "qwen"
    - "large"
    - "recommended"
    - "32gb-vram"
    - "long-context"
  homepage: "https://huggingface.co/Qwen/Qwen2.5-32B-Instruct"

# Qwen 2.5 Coder 32B Instruct — code-specialized variant; same resource
# footprint as qwen-2.5-32b.
qwen-2.5-coder-32b:
  name: "Qwen 2.5 Coder 32B Instruct"
  description: "State-of-the-art coding model matching GPT-4o. Best open-source code LLM available."
  size: "32B"
  quantization: "Q4_K_M"
  # Community GGUF build (bartowski); /resolve/main/ yields a direct-download URL.
  source: "https://huggingface.co/bartowski/Qwen2.5-Coder-32B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-32B-Instruct-Q4_K_M.gguf"
  # KV-cache size in tokens. NOTE(review): tagged "long-context" below but set
  # to 8192 — confirm intent.
  context_size: 8192
  gpu_layers: 64  # number of layers offloaded to GPU
  use_cases:
    - "code-generation"
    - "debugging"
    - "code-review"
    - "technical-docs"
  resources:
    cpu: "8"
    memory: "24Gi"
    gpu_memory: "24Gi"
    vram_estimate: "18-24GB"
  tags:
    - "qwen"
    - "code"
    - "large"
    - "recommended"
    - "32gb-vram"
    - "long-context"
    - "gpt4-level"
  homepage: "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct"

# Qwen 3 32B — newest generation; sourced from the official Qwen GGUF repo
# (not a community build, unlike the 2.5 entries above).
qwen-3-32b:
  name: "Qwen 3 32B"
  description: "Latest Qwen 3 series with hybrid thinking modes. Newest and most capable 32B model."
  size: "32B"
  quantization: "Q4_K_M"
  source: "https://huggingface.co/Qwen/Qwen3-32B-GGUF/resolve/main/Qwen3-32B-Q4_K_M.gguf"
  # KV-cache size in tokens. NOTE(review): tagged "long-context" below but set
  # to 8192 — confirm intent.
  context_size: 8192
  gpu_layers: 64  # number of layers offloaded to GPU
  use_cases:
    - "complex-reasoning"
    - "thinking-mode"
    - "production"
    - "high-quality-chat"
  resources:
    cpu: "8"
    memory: "24Gi"
    gpu_memory: "24Gi"
    vram_estimate: "18-24GB"
  tags:
    - "qwen"
    - "large"
    - "newest"
    - "32gb-vram"
    - "long-context"
    - "thinking"
  homepage: "https://huggingface.co/Qwen/Qwen3-32B"

qwen-2.5-14b:
name: "Qwen 2.5 14B Instruct"
description: "Multilingual powerhouse with strong creative writing capabilities."
Expand Down
5 changes: 4 additions & 1 deletion pkg/cli/catalog_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ func TestLoadCatalog(t *testing.T) {
}

// Verify we have the expected number of models
expectedModelCount := 10
expectedModelCount := 13
if len(catalog.Models) != expectedModelCount {
t.Errorf("Expected %d models, got %d", expectedModelCount, len(catalog.Models))
}
Expand Down Expand Up @@ -81,6 +81,9 @@ func TestGetModel(t *testing.T) {
{"Mixtral exists", "mixtral-8x7b", true},
{"Llama 70B exists", "llama-3.1-70b", true},
{"Llama 3.2 3B exists", "llama-3.2-3b", true},
{"Qwen 2.5 32B exists", "qwen-2.5-32b", true},
{"Qwen 2.5 Coder 32B exists", "qwen-2.5-coder-32b", true},
{"Qwen 3 32B exists", "qwen-3-32b", true},
{"Non-existent model", "non-existent-model", false},
}

Expand Down