Merged
39 changes: 33 additions & 6 deletions config/grafana/SETUP.md
@@ -137,9 +137,18 @@ scrape_configs:
- targets: ['<your-server>:8080']
labels:
instance: '<your-server>'

# LLMKube Metal Agent - Agent health and process metrics (if using Metal)
# Note: The agent binds to 127.0.0.1 by default.
# For remote scraping, use an SSH tunnel: ssh -L 9090:localhost:9090 <mac>
- job_name: 'llmkube-metal-agent'
static_configs:
- targets: ['localhost:9090']
labels:
instance: 'metal-agent'
```

Replace `<your-server>` with your server's hostname or IP address. The Metal Agent binds to `127.0.0.1` for security; use an SSH tunnel (`ssh -L 9090:localhost:9090 <mac>`) for remote Prometheus scraping.

### Prometheus Docker Compose Example

@@ -194,6 +203,9 @@ curl http://<your-server>:9100/metrics | grep node_cpu
# DCGM exporter metrics
curl http://<your-server>:9400/metrics | grep DCGM_FI_DEV_GPU

# Metal Agent metrics (if using Apple Silicon)
curl http://<your-mac-ip>:9090/metrics | grep llmkube_metal_agent

# Prometheus targets (should show UP)
curl http://prometheus:9090/api/v1/targets
```
@@ -223,14 +235,29 @@ curl http://prometheus:9090/api/v1/targets
| `DCGM_FI_DEV_FB_FREE` | GPU memory free |
| `DCGM_FI_DEV_MEM_COPY_UTIL` | Memory copy utilization |

### LLMKube Controller Metrics

| Metric | Description |
|--------|-------------|
| `llmkube_model_download_duration_seconds` | Model download/copy duration |
| `llmkube_model_status` | Current model status phase |
| `llmkube_inferenceservice_phase` | Current inference service phase |
| `llmkube_inferenceservice_ready_duration_seconds` | Time to Ready phase |
| `llmkube_reconcile_total` | Total reconciliation cycles |
| `llmkube_reconcile_duration_seconds` | Reconciliation cycle duration |
| `llmkube_active_models_total` | Models in Ready/Cached phase |
| `llmkube_active_inferenceservices_total` | Inference services in Ready phase |

### LLMKube Metal Agent Metrics

| Metric | Description |
|--------|-------------|
| `llmkube_model_status` | Model download status |
| `llmkube_inferenceservice_status` | Service running status |
| `llmkube_inference_requests_total` | Total inference requests |
| `llmkube_inference_latency_seconds` | Request latency histogram |
| `llmkube_metal_agent_managed_processes` | Number of managed llama-server processes |
| `llmkube_metal_agent_process_healthy` | Process health status (1=healthy, 0=unhealthy) |
| `llmkube_metal_agent_process_restarts_total` | Process restarts from health monitoring |
| `llmkube_metal_agent_health_check_duration_seconds` | Health check probe duration |
| `llmkube_metal_agent_memory_budget_bytes` | Total memory budget for model serving |
| `llmkube_metal_agent_memory_estimated_bytes` | Estimated memory usage per process |

## Troubleshooting

73 changes: 72 additions & 1 deletion deployment/macos/README.md
@@ -74,6 +74,12 @@ tail -f /tmp/llmkube-metal-agent.log

# Check running processes
ps aux | grep llmkube-metal-agent

# Health check (liveness)
curl http://localhost:9090/healthz

# Readiness check (at least one process healthy, or no processes yet)
curl http://localhost:9090/readyz
```

### Verify Metal Acceleration
@@ -165,6 +171,69 @@ To set this in the launchd plist:
<string>0.75</string> <!-- 75% of system memory -->
```

## Health Checks & Monitoring

The Metal Agent exposes an HTTP server on `127.0.0.1:9090` (configurable via `--port`) with health check and Prometheus metrics endpoints. The server binds to localhost only; to expose it for remote Prometheus scraping, use a reverse proxy or SSH tunnel.

### Endpoints

| Endpoint | Purpose | Success | Failure |
|----------|---------|---------|---------|
| `GET /healthz` | Liveness probe — agent process is alive | Always 200 | — |
| `GET /readyz` | Readiness probe — at least one process healthy (or no processes) | 200 | 503 (all unhealthy) |
| `GET /metrics` | Prometheus metrics | 200 | — |
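The `/readyz` rule in the table can be expressed as a small predicate. This is a hypothetical sketch for illustration (`ready` is not a function in the agent's codebase):

```go
package main

import "fmt"

// ready models the readiness rule: the agent is ready when it manages
// no processes yet, or when at least one managed process is healthy.
// All processes unhealthy means not ready (HTTP 503).
func ready(healthy []bool) bool {
	if len(healthy) == 0 {
		return true // no processes yet: still ready
	}
	for _, h := range healthy {
		if h {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(ready(nil))                  // true
	fmt.Println(ready([]bool{false, true}))  // true
	fmt.Println(ready([]bool{false, false})) // false
}
```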

### Prometheus Metrics

| Metric | Type | Description |
|--------|------|-------------|
| `llmkube_metal_agent_managed_processes` | Gauge | Number of llama-server processes currently managed |
| `llmkube_metal_agent_process_healthy` | Gauge | Whether a process is healthy (1) or not (0). Labels: `name`, `namespace` |
| `llmkube_metal_agent_process_restarts_total` | Counter | Total process restarts triggered by health monitoring. Labels: `name`, `namespace` |
| `llmkube_metal_agent_health_check_duration_seconds` | Histogram | Duration of health check probes. Labels: `name`, `namespace` |
| `llmkube_metal_agent_memory_budget_bytes` | Gauge | Total memory budget for model serving |
| `llmkube_metal_agent_memory_estimated_bytes` | Gauge | Estimated memory per process. Labels: `name`, `namespace` |

Standard Go runtime and process metrics (`go_*`, `process_*`) are also available.

### Continuous Health Monitoring

The agent polls each managed llama-server process every 30 seconds via its `/health` endpoint. On failure:

1. The process is marked unhealthy (`Healthy=false`, `process_healthy` gauge set to 0)
2. The agent re-fetches the InferenceService from Kubernetes
3. `ensureProcess()` is called to restart the llama-server
4. The `process_restarts_total` counter is incremented

When a previously unhealthy process recovers, it is marked healthy again automatically.
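One iteration of that monitoring loop can be sketched as follows. This is a simplified, hypothetical illustration: `checkHealth` stands in for the llama-server `/health` probe and `restart` for the agent's `scheduleRestart` (which re-fetches the InferenceService and re-runs `ensureProcess`); the agent drives this logic from a 30-second ticker.

```go
package main

import "fmt"

// sweep performs one monitoring pass: probe every managed process and
// trigger a restart for each one whose health check fails.
func sweep(names []string, checkHealth func(string) bool, restart func(string)) {
	for _, n := range names {
		if !checkHealth(n) {
			restart(n)
		}
	}
}

func main() {
	var restarted []string
	sweep([]string{"phi-3", "llama-3"},
		func(n string) bool { return n != "llama-3" }, // llama-3 fails its probe
		func(n string) { restarted = append(restarted, n) })
	fmt.Println(restarted) // [llama-3]
}
```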

### Scraping with Prometheus

The health server binds to `127.0.0.1` by default. If Prometheus runs on the same Mac, scrape directly:

```yaml
scrape_configs:
- job_name: 'llmkube-metal-agent'
static_configs:
- targets: ['localhost:9090']
labels:
instance: 'metal-agent'
```

For remote Prometheus, use an SSH tunnel: `ssh -L 9090:localhost:9090 <your-mac>`.

Quick verification:

```bash
# Check all endpoints
curl http://localhost:9090/healthz # → "ok"
curl http://localhost:9090/readyz # → "ready" or "not ready"
curl http://localhost:9090/metrics # → Prometheus text format

# Check specific metric
curl -s http://localhost:9090/metrics | grep llmkube_metal_agent_managed_processes
```
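If you want to check a metric value programmatically rather than eyeball the `curl` output, the Prometheus text format is easy to scan line by line. A minimal sketch (the `parseGauge` helper is hypothetical; real tooling should use a Prometheus parsing library):

```go
package main

import (
	"fmt"
	"strconv"
	"strings"
)

// parseGauge scans Prometheus text-format exposition for the first
// un-labeled sample of the named metric and returns its value.
func parseGauge(exposition, metric string) (float64, bool) {
	for _, line := range strings.Split(exposition, "\n") {
		if strings.HasPrefix(line, "#") || !strings.HasPrefix(line, metric) {
			continue // skip HELP/TYPE comments and other metrics
		}
		fields := strings.Fields(line)
		if len(fields) == 2 {
			if v, err := strconv.ParseFloat(fields[1], 64); err == nil {
				return v, true
			}
		}
	}
	return 0, false
}

func main() {
	// Illustrative exposition snippet; actual values depend on your agent.
	sample := "# TYPE llmkube_metal_agent_managed_processes gauge\n" +
		"llmkube_metal_agent_managed_processes 2\n"
	v, ok := parseGauge(sample, "llmkube_metal_agent_managed_processes")
	fmt.Println(v, ok) // 2 true
}
```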

## Troubleshooting

### Agent won't start
@@ -261,7 +330,9 @@ rm ~/Library/LaunchAgents/com.llmkube.metal-agent.plist
4. **Validates** that the model fits in the system's memory budget
5. **Spawns** llama-server processes with Metal acceleration
6. **Registers** service endpoints back to Kubernetes
7. **Monitors** process health every 30s and auto-restarts on failure
8. **Exposes** health checks and Prometheus metrics on port 9090
9. **Pods** access the Metal-accelerated inference via Service endpoints

### Remote cluster (Recommended)

13 changes: 12 additions & 1 deletion examples/metal-quickstart/README.md
@@ -72,6 +72,10 @@ which llama-server
launchctl list | grep llmkube
# Should show: com.llmkube.metal-agent

# 3b. Check Metal agent health endpoint
curl http://localhost:9090/healthz
# Should show: ok

# 4. Check minikube is running
minikube status
# Should show: host: Running, kubelet: Running
@@ -130,6 +134,13 @@ llmkube deploy my-custom-model \
# Watch the deployment
kubectl get inferenceservices -w

# Check Metal agent health
curl http://localhost:9090/healthz # Liveness: "ok"
curl http://localhost:9090/readyz # Readiness: "ready" when processes are healthy

# Check Metal agent metrics
curl -s http://localhost:9090/metrics | grep llmkube_metal_agent

# Check Metal agent logs
tail -f /tmp/llmkube-metal-agent.log

@@ -327,7 +338,7 @@ make uninstall-metal-agent
- **Scale up**: Try larger models (Mixtral 8x7B, Llama 70B)
- **Production**: Deploy multiple replicas for high availability
- **Integration**: Connect to your applications using OpenAI SDK
- **Monitoring**: Scrape `localhost:9090/metrics` with Prometheus for agent health and process metrics

## Example Applications

59 changes: 55 additions & 4 deletions pkg/agent/agent.go
@@ -109,14 +109,15 @@ func NewMetalAgent(config MetalAgentConfig) *MetalAgent {

// Start begins watching for InferenceService resources and managing processes
func (a *MetalAgent) Start(ctx context.Context) error {
// Log effective memory budget and set gauge
if total, err := a.memoryProvider.TotalMemory(); err == nil {
budget := uint64(float64(total) * a.memoryFraction)
a.logger.Infow("memory budget",
"total", formatMemory(total),
"fraction", a.memoryFraction,
"budget", formatMemory(budget),
)
memoryBudgetBytes.Set(float64(budget))
} else {
a.logger.Warnw("unable to query total memory", "error", err)
}
@@ -130,6 +131,25 @@ func (a *MetalAgent) Start(ctx context.Context) error {
)
a.registry = NewServiceRegistry(a.config.K8sClient, a.config.HostIP, a.logger.With("subsystem", "registry"))

// Start health server
if a.config.Port > 0 {
healthSrv := NewHealthServer(a, a.config.Port, a.logger.With("subsystem", "health-server"))
go func() {
if err := healthSrv.Run(ctx); err != nil {
a.logger.Warnw("health server exited with error", "error", err)
}
}()
}

// Start health monitor
monitor := NewHealthMonitor(
a,
NewDefaultProcessHealthChecker(5*time.Second),
30*time.Second,
a.logger.With("subsystem", "health-monitor"),
)
go monitor.Run(ctx)

// Start watcher
eventChan := make(chan InferenceServiceEvent)
go func() {
@@ -212,6 +232,8 @@ func (a *MetalAgent) ensureProcess(ctx context.Context, isvc *inferencev1alpha1.
if estimate, err := a.estimateModelMemory(model, contextSize); err != nil {
a.logger.Warnw("memory estimation failed, proceeding without check", "error", err)
} else {
memoryEstimatedBytes.WithLabelValues(isvc.Name, isvc.Namespace).Set(float64(estimate.TotalBytes))

budget, err := CheckMemoryBudget(a.memoryProvider, estimate, a.memoryFraction)
if err != nil {
a.logger.Warnw("memory budget check failed, proceeding without check", "error", err)
@@ -256,10 +278,12 @@ func (a *MetalAgent) ensureProcess(ctx context.Context, isvc *inferencev1alpha1.
return fmt.Errorf("failed to start process: %w", err)
}

// Store process and update metrics
a.mu.Lock()
a.processes[key] = process
managedProcesses.Set(float64(len(a.processes)))
a.mu.Unlock()
processHealthy.WithLabelValues(isvc.Name, isvc.Namespace).Set(1)

// Register service endpoint in Kubernetes
if err := a.registry.RegisterEndpoint(ctx, isvc, process.Port); err != nil {
@@ -292,11 +316,18 @@ func (a *MetalAgent) deleteProcess(ctx context.Context, key string) error {
return nil
}
delete(a.processes, key)
managedProcesses.Set(float64(len(a.processes)))
a.mu.Unlock()

a.logger.Infow("stopping inference service", "key", key)
namespace, name := parseKey(key)

// Clean up per-process metrics
processHealthy.DeleteLabelValues(name, namespace)
memoryEstimatedBytes.DeleteLabelValues(name, namespace)
healthCheckDuration.DeleteLabelValues(name, namespace)
processRestarts.DeleteLabelValues(name, namespace)

var deleteErrors []error
if err := a.executor.StopProcess(process.PID); err != nil {
deleteErrors = append(deleteErrors, fmt.Errorf("failed to stop process: %w", err))
@@ -317,6 +348,26 @@ func (a *MetalAgent) deleteProcess(ctx context.Context, key string) error {
return nil
}

// scheduleRestart increments the restart counter and re-runs ensureProcess
// for the named InferenceService. It is called by HealthMonitor when a process
// becomes unhealthy.
func (a *MetalAgent) scheduleRestart(ctx context.Context, name, namespace string) {
processRestarts.WithLabelValues(name, namespace).Inc()

isvc := &inferencev1alpha1.InferenceService{}
if err := a.config.K8sClient.Get(ctx, types.NamespacedName{
Namespace: namespace,
Name: name,
}, isvc); err != nil {
a.logger.Warnw("failed to fetch InferenceService for restart", "name", name, "namespace", namespace, "error", err)
return
}

if err := a.ensureProcess(ctx, isvc); err != nil {
a.logger.Warnw("failed to restart process", "name", name, "namespace", namespace, "error", err)
}
}

// Shutdown gracefully shuts down all running processes
func (a *MetalAgent) Shutdown(ctx context.Context) error {
a.mu.Lock()
89 changes: 89 additions & 0 deletions pkg/agent/agentmetrics.go
@@ -0,0 +1,89 @@
/*
Copyright 2025.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package agent

import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
)

// AgentRegistry is a standalone Prometheus registry for the Metal agent.
// It is separate from controller-runtime's registry because the agent
// runs as its own binary without the controller-manager.
var AgentRegistry = prometheus.NewRegistry()

var (
managedProcesses = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "llmkube_metal_agent_managed_processes",
Help: "Number of llama-server processes currently managed by the agent.",
},
)

processHealthy = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "llmkube_metal_agent_process_healthy",
Help: "Whether a managed process is healthy (1) or unhealthy (0).",
},
[]string{"name", "namespace"},
)

processRestarts = prometheus.NewCounterVec(
prometheus.CounterOpts{
Name: "llmkube_metal_agent_process_restarts_total",
Help: "Total number of process restarts triggered by health monitoring.",
},
[]string{"name", "namespace"},
)

healthCheckDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "llmkube_metal_agent_health_check_duration_seconds",
Help: "Duration of individual health check probes.",
Buckets: prometheus.DefBuckets,
},
[]string{"name", "namespace"},
)

memoryBudgetBytes = prometheus.NewGauge(
prometheus.GaugeOpts{
Name: "llmkube_metal_agent_memory_budget_bytes",
Help: "Total memory budget available for model serving in bytes.",
},
)

memoryEstimatedBytes = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Name: "llmkube_metal_agent_memory_estimated_bytes",
Help: "Estimated memory usage per managed process in bytes.",
},
[]string{"name", "namespace"},
)
)

func init() {
AgentRegistry.MustRegister(
collectors.NewGoCollector(),
collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
managedProcesses,
processHealthy,
processRestarts,
healthCheckDuration,
memoryBudgetBytes,
memoryEstimatedBytes,
)
}