Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 82 additions & 1 deletion catalog/catalog.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# LLMKube Model Catalog
# Version: v1
# Last Updated: 2025-11-23
# Last Updated: 2025-12-01
#
# This catalog contains pre-configured, battle-tested LLM models optimized
# for various use cases. Each model includes verified GGUF sources and
Expand Down Expand Up @@ -188,6 +188,87 @@ models:
# Large Models (13B+) - Premium Quality
# ============================================================================

# Qwen 2.5 32B Instruct — general-purpose large model targeting ~24GB-VRAM GPUs.
qwen-2.5-32b:
  name: "Qwen 2.5 32B Instruct"
  description: "Powerful 32B model with excellent reasoning and multilingual capabilities. Perfect for 32GB VRAM setups."
  size: "32B"
  quantization: "Q4_K_M"
  # Community GGUF build (bartowski); /resolve/main/ yields a direct-download URL.
  source: "https://huggingface.co/bartowski/Qwen2.5-32B-Instruct-GGUF/resolve/main/Qwen2.5-32B-Instruct-Q4_K_M.gguf"
  # KV-cache size in tokens. NOTE(review): the "long-context" tag below sits next
  # to an 8192 default — confirm whether a larger context was intended.
  context_size: 8192
  gpu_layers: 64  # number of layers offloaded to GPU
  use_cases:
    - "complex-reasoning"
    - "multilingual"
    - "production"
    - "high-quality-chat"
  resources:
    cpu: "8"
    memory: "24Gi"
    gpu_memory: "24Gi"
    vram_estimate: "18-24GB"
  tags:
    - "qwen"
    - "large"
    - "recommended"
    - "32gb-vram"
    - "long-context"
  homepage: "https://huggingface.co/Qwen/Qwen2.5-32B-Instruct"

# Qwen 2.5 Coder 32B Instruct — code-specialized variant; same resource
# footprint as qwen-2.5-32b.
qwen-2.5-coder-32b:
  name: "Qwen 2.5 Coder 32B Instruct"
  description: "State-of-the-art coding model matching GPT-4o. Best open-source code LLM available."
  size: "32B"
  quantization: "Q4_K_M"
  # Community GGUF build (bartowski); /resolve/main/ yields a direct-download URL.
  source: "https://huggingface.co/bartowski/Qwen2.5-Coder-32B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-32B-Instruct-Q4_K_M.gguf"
  # KV-cache size in tokens. NOTE(review): tagged "long-context" below but set
  # to 8192 — confirm intent.
  context_size: 8192
  gpu_layers: 64  # number of layers offloaded to GPU
  use_cases:
    - "code-generation"
    - "debugging"
    - "code-review"
    - "technical-docs"
  resources:
    cpu: "8"
    memory: "24Gi"
    gpu_memory: "24Gi"
    vram_estimate: "18-24GB"
  tags:
    - "qwen"
    - "code"
    - "large"
    - "recommended"
    - "32gb-vram"
    - "long-context"
    - "gpt4-level"
  homepage: "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct"

# Qwen 3 32B — newest generation; sourced from the official Qwen GGUF repo
# (not a community build, unlike the 2.5 entries above).
qwen-3-32b:
  name: "Qwen 3 32B"
  description: "Latest Qwen 3 series with hybrid thinking modes. Newest and most capable 32B model."
  size: "32B"
  quantization: "Q4_K_M"
  source: "https://huggingface.co/Qwen/Qwen3-32B-GGUF/resolve/main/Qwen3-32B-Q4_K_M.gguf"
  # KV-cache size in tokens. NOTE(review): tagged "long-context" below but set
  # to 8192 — confirm intent.
  context_size: 8192
  gpu_layers: 64  # number of layers offloaded to GPU
  use_cases:
    - "complex-reasoning"
    - "thinking-mode"
    - "production"
    - "high-quality-chat"
  resources:
    cpu: "8"
    memory: "24Gi"
    gpu_memory: "24Gi"
    vram_estimate: "18-24GB"
  tags:
    - "qwen"
    - "large"
    - "newest"
    - "32gb-vram"
    - "long-context"
    - "thinking"
  homepage: "https://huggingface.co/Qwen/Qwen3-32B"

qwen-2.5-14b:
name: "Qwen 2.5 14B Instruct"
description: "Multilingual powerhouse with strong creative writing capabilities."
Expand Down
10 changes: 10 additions & 0 deletions pkg/cli/benchmark.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ type benchmarkOptions struct {
accelerator string
cleanup bool
deployWait time.Duration
contextSize int32
}

type BenchmarkResult struct {
Expand Down Expand Up @@ -283,6 +284,8 @@ Examples:
cmd.Flags().BoolVar(&opts.cleanup, "cleanup", true,
"Cleanup deployments after benchmarking (use --no-cleanup to keep)")
cmd.Flags().DurationVar(&opts.deployWait, "deploy-wait", 10*time.Minute, "Timeout waiting for deployment to be ready")
cmd.Flags().Int32Var(&opts.contextSize, "context", 0,
"Context size (KV cache) for model deployment (0 = use catalog default)")

return cmd
}
Expand Down Expand Up @@ -1099,6 +1102,13 @@ func deployModel(
inferenceService.Spec.Resources.GPUMemory = catalogModel.Resources.GPUMemory
}

if opts.contextSize > 0 {
inferenceService.Spec.ContextSize = &opts.contextSize
} else if catalogModel.ContextSize > 0 {
contextSize := int32(catalogModel.ContextSize)
inferenceService.Spec.ContextSize = &contextSize
}

if err := k8sClient.Create(ctx, inferenceService); err != nil {
// Cleanup model if inference service creation fails
_ = k8sClient.Delete(ctx, model)
Expand Down
83 changes: 82 additions & 1 deletion pkg/cli/catalog.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# LLMKube Model Catalog
# Version: v1
# Last Updated: 2025-11-23
# Last Updated: 2025-12-01
#
# This catalog contains pre-configured, battle-tested LLM models optimized
# for various use cases. Each model includes verified GGUF sources and
Expand Down Expand Up @@ -188,6 +188,87 @@ models:
# Large Models (13B+) - Premium Quality
# ============================================================================

# Qwen 2.5 32B Instruct — general-purpose large model targeting ~24GB-VRAM GPUs.
# NOTE(review): this file mirrors catalog/catalog.yaml — keep the two in sync
# (or better, generate one from the other).
qwen-2.5-32b:
  name: "Qwen 2.5 32B Instruct"
  description: "Powerful 32B model with excellent reasoning and multilingual capabilities. Perfect for 32GB VRAM setups."
  size: "32B"
  quantization: "Q4_K_M"
  # Community GGUF build (bartowski); /resolve/main/ yields a direct-download URL.
  source: "https://huggingface.co/bartowski/Qwen2.5-32B-Instruct-GGUF/resolve/main/Qwen2.5-32B-Instruct-Q4_K_M.gguf"
  # KV-cache size in tokens. NOTE(review): tagged "long-context" below but set
  # to 8192 — confirm intent.
  context_size: 8192
  gpu_layers: 64  # number of layers offloaded to GPU
  use_cases:
    - "complex-reasoning"
    - "multilingual"
    - "production"
    - "high-quality-chat"
  resources:
    cpu: "8"
    memory: "24Gi"
    gpu_memory: "24Gi"
    vram_estimate: "18-24GB"
  tags:
    - "qwen"
    - "large"
    - "recommended"
    - "32gb-vram"
    - "long-context"
  homepage: "https://huggingface.co/Qwen/Qwen2.5-32B-Instruct"

# Qwen 2.5 Coder 32B Instruct — code-specialized variant; same resource
# footprint as qwen-2.5-32b.
qwen-2.5-coder-32b:
  name: "Qwen 2.5 Coder 32B Instruct"
  description: "State-of-the-art coding model matching GPT-4o. Best open-source code LLM available."
  size: "32B"
  quantization: "Q4_K_M"
  # Community GGUF build (bartowski); /resolve/main/ yields a direct-download URL.
  source: "https://huggingface.co/bartowski/Qwen2.5-Coder-32B-Instruct-GGUF/resolve/main/Qwen2.5-Coder-32B-Instruct-Q4_K_M.gguf"
  # KV-cache size in tokens. NOTE(review): tagged "long-context" below but set
  # to 8192 — confirm intent.
  context_size: 8192
  gpu_layers: 64  # number of layers offloaded to GPU
  use_cases:
    - "code-generation"
    - "debugging"
    - "code-review"
    - "technical-docs"
  resources:
    cpu: "8"
    memory: "24Gi"
    gpu_memory: "24Gi"
    vram_estimate: "18-24GB"
  tags:
    - "qwen"
    - "code"
    - "large"
    - "recommended"
    - "32gb-vram"
    - "long-context"
    - "gpt4-level"
  homepage: "https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct"

# Qwen 3 32B — newest generation; sourced from the official Qwen GGUF repo
# (not a community build, unlike the 2.5 entries above).
qwen-3-32b:
  name: "Qwen 3 32B"
  description: "Latest Qwen 3 series with hybrid thinking modes. Newest and most capable 32B model."
  size: "32B"
  quantization: "Q4_K_M"
  source: "https://huggingface.co/Qwen/Qwen3-32B-GGUF/resolve/main/Qwen3-32B-Q4_K_M.gguf"
  # KV-cache size in tokens. NOTE(review): tagged "long-context" below but set
  # to 8192 — confirm intent.
  context_size: 8192
  gpu_layers: 64  # number of layers offloaded to GPU
  use_cases:
    - "complex-reasoning"
    - "thinking-mode"
    - "production"
    - "high-quality-chat"
  resources:
    cpu: "8"
    memory: "24Gi"
    gpu_memory: "24Gi"
    vram_estimate: "18-24GB"
  tags:
    - "qwen"
    - "large"
    - "newest"
    - "32gb-vram"
    - "long-context"
    - "thinking"
  homepage: "https://huggingface.co/Qwen/Qwen3-32B"

qwen-2.5-14b:
name: "Qwen 2.5 14B Instruct"
description: "Multilingual powerhouse with strong creative writing capabilities."
Expand Down
5 changes: 4 additions & 1 deletion pkg/cli/catalog_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ func TestLoadCatalog(t *testing.T) {
}

// Verify we have the expected number of models
expectedModelCount := 10
expectedModelCount := 13
if len(catalog.Models) != expectedModelCount {
t.Errorf("Expected %d models, got %d", expectedModelCount, len(catalog.Models))
}
Expand Down Expand Up @@ -81,6 +81,9 @@ func TestGetModel(t *testing.T) {
{"Mixtral exists", "mixtral-8x7b", true},
{"Llama 70B exists", "llama-3.1-70b", true},
{"Llama 3.2 3B exists", "llama-3.2-3b", true},
{"Qwen 2.5 32B exists", "qwen-2.5-32b", true},
{"Qwen 2.5 Coder 32B exists", "qwen-2.5-coder-32b", true},
{"Qwen 3 32B exists", "qwen-3-32b", true},
{"Non-existent model", "non-existent-model", false},
}

Expand Down