feat: Add local LLM improvements for reasoning models and Docker startup (#88)

csfet9 · claude · web-flow · commit eea0f271180a · 2026-01-05T10:04:58.000+01:00
* feat: Add local LLM improvements for reasoning models and Docker startup ## Reasoning Model Support - Strip thinking tags from local LLM responses (<think>, <thinking>, <reasoning>, |startthink|/|endthink|) - Enables Qwen3, DeepSeek, and other reasoning models to work with JSON extraction - Non-breaking: only affects responses that contain thinking tags ## Docker Retry Start Script - New retry-start.sh waits for dependencies before starting Hindsight - Checks LM Studio availability at /v1/models endpoint - Checks database connectivity (skipped for embedded pg0) - Configurable via HINDSIGHT_RETRY_MAX and HINDSIGHT_RETRY_INTERVAL env vars - Prevents startup failures when LM Studio isn't ready yet Tested on Apple Silicon M4 Max with Qwen3 8B via LM Studio. * refactor: make thinking token stripping opt-in via env var * refactor: merge retry logic into start-all.sh (opt-in via HINDSIGHT_WAIT_FOR_DEPS) * fix: resolve pg0 stale instance config in Docker build - Remove stale pg0 instance data after pre-caching binaries to avoid port conflicts (was using hardcoded port 5555 from build time) - Remove unused cache copy logic from start-all.sh - Add database backup instructions to CLAUDE.md 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -84,6 +84,27 @@ PostgreSQL with pgvector. Schema managed via Alembic migrations in `hindsight-ap
 
 Key tables: `banks`, `memory_units`, `documents`, `entities`, `entity_links`
 
+### Database Backups (IMPORTANT)
+**Before any operation that may affect the database, run a backup:**
+```bash
+docker exec hindsight /backups/backup.sh
+```
+
+Operations requiring backup:
+- Running database migrations
+- Modifying Alembic migration files
+- Rebuilding Docker images
+- Resetting or recreating containers
+- Any schema changes
+- Bulk data operations
+
+Backups are stored in `~/hindsight-backups/` on the host.
+
+To restore:
+```bash
+docker exec -it hindsight /backups/restore.sh <backup-file.sql.gz>
+```
+
 ## Key Conventions
 
 ### Memory Banks
diff --git a/docker/standalone/Dockerfile b/docker/standalone/Dockerfile
@@ -157,9 +157,7 @@ USER hindsight
 # Set PATH for hindsight user
 ENV PATH="/app/api/.venv/bin:${PATH}"
 
-# Pre-cache PostgreSQL binaries by starting/stopping pg0-embedded
-ENV PG0_HOME=/home/hindsight/.pg0-cache
-
+# pg0 will download PostgreSQL binaries on first run
 ENV PG0_HOME=/home/hindsight/.pg0
 
 # Pre-download ML models to avoid runtime download (conditional)
@@ -272,16 +270,17 @@ USER hindsight
 ENV PATH="/app/api/.venv/bin:${PATH}"
 
 # Pre-cache PostgreSQL binaries by starting/stopping pg0-embedded
-ENV PG0_HOME=/home/hindsight/.pg0-cache
+# Note: We use a temp instance just to download binaries, then delete instance data
+# to avoid stale port config. Only installation binaries are kept.
+ENV PG0_HOME=/home/hindsight/.pg0
 RUN /app/api/.venv/bin/python -c "\
 from pg0 import Pg0; \
 print('Pre-caching PostgreSQL binaries...'); \
-pg = Pg0(name='hindsight', port=5555, username='hindsight', password='hindsight', database='hindsight'); \
+pg = Pg0(name='temp-cache', username='hindsight', password='hindsight', database='hindsight'); \
 pg.start(); \
 pg.stop(); \
-print('PostgreSQL pre-cached to PG0_HOME')" || echo "Pre-download skipped"
-
-ENV PG0_HOME=/home/hindsight/.pg0
+print('PostgreSQL binaries cached')" && \
+    rm -rf /home/hindsight/.pg0/instances || echo "Pre-download skipped"
 
 # Pre-download ML models to avoid runtime download (conditional)
 ARG PRELOAD_ML_MODELS
diff --git a/docker/standalone/start-all.sh b/docker/standalone/start-all.sh
@@ -5,16 +5,70 @@ set -e
 ENABLE_API="${HINDSIGHT_ENABLE_API:-true}"
 ENABLE_CP="${HINDSIGHT_ENABLE_CP:-true}"
 
-# Copy pre-cached PostgreSQL data if runtime directory is empty (first run with volume)
-if [ "$ENABLE_API" = "true" ]; then
-    PG0_CACHE="/home/hindsight/.pg0-cache"
-    PG0_HOME="/home/hindsight/.pg0"
-    if [ -d "$PG0_CACHE" ] && [ "$(ls -A $PG0_CACHE 2>/dev/null)" ]; then
-        if [ ! "$(ls -A $PG0_HOME 2>/dev/null)" ]; then
-            echo "📦 Copying pre-cached PostgreSQL data..."
-            cp -r "$PG0_CACHE"/* "$PG0_HOME"/ 2>/dev/null || true
-        fi
+# =============================================================================
+# Dependency waiting (opt-in via HINDSIGHT_WAIT_FOR_DEPS=true)
+#
+# Problem: When running with LM Studio, the LLM may take time to load models.
+# If Hindsight starts before LM Studio is ready, it fails on LLM verification.
+# This wait loop ensures dependencies are ready before starting.
+# =============================================================================
+if [ "${HINDSIGHT_WAIT_FOR_DEPS:-false}" = "true" ]; then
+    LLM_BASE_URL="${HINDSIGHT_API_LLM_BASE_URL:-http://host.docker.internal:1234/v1}"
+    MAX_RETRIES="${HINDSIGHT_RETRY_MAX:-0}"  # 0 = infinite
+    RETRY_INTERVAL="${HINDSIGHT_RETRY_INTERVAL:-10}"
+
+    # Check if external database is configured (skip check for embedded pg0)
+    SKIP_DB_CHECK=false
+    if [ -z "${HINDSIGHT_API_DATABASE_URL}" ]; then
+        SKIP_DB_CHECK=true
+    else
+        read -r DB_HOST DB_PORT <<< "$(printf '%s\n' "$HINDSIGHT_API_DATABASE_URL" | sed -nE 's|^postgres[a-z0-9+.-]*://([^/]*@)?([^]@:/?[]+)(:([0-9]+))?([/?].*)?$|\2 \4|p')"
     fi
+    [ -n "${DB_HOST:-}" ] || SKIP_DB_CHECK=true  # no parseable host (e.g. not a postgres:// URL): skip instead of waiting forever
+
+    # Return 0 when the database answers on DB_HOST:DB_PORT (or the check is skipped); the port defaults to 5432 when the URL has none.
+    check_db() {
+        $SKIP_DB_CHECK && return 0
+        if command -v pg_isready &> /dev/null; then
+            pg_isready -h "$DB_HOST" -p "${DB_PORT:-5432}" &>/dev/null
+        else
+            python3 -c "import socket, sys; s = socket.socket(); s.settimeout(5); sys.exit(0 if s.connect_ex((sys.argv[1], int(sys.argv[2]))) == 0 else 1)" "$DB_HOST" "${DB_PORT:-5432}" 2>/dev/null
+        fi
+    }
+
+    check_llm() {  # Return 0 once GET <LLM base URL>/models answers without an HTTP error (curl -f)
+        curl -sf --connect-timeout 5 --max-time 15 "${LLM_BASE_URL%/}/models" &>/dev/null  # --max-time bounds a server that accepts but never responds
+    }
+
+    echo "⏳ Waiting for dependencies to be ready..."
+    attempt=1  # 1-based attempt counter, compared against MAX_RETRIES below
+
+    while true; do  # poll both dependencies until both succeed or the retry budget is spent
+        db_ok=false
+        llm_ok=false
+
+        if check_db; then
+            db_ok=true
+        fi
+
+        if check_llm; then  # probed every round, even when the DB check failed, so the status line reports both
+            llm_ok=true
+        fi
+
+        if $db_ok && $llm_ok; then  # db_ok/llm_ok hold the literal commands `true`/`false`
+            echo "✅ Dependencies ready!"
+            break
+        fi
+
+        if [ "$MAX_RETRIES" -ne 0 ] && [ "$attempt" -ge "$MAX_RETRIES" ]; then  # MAX_RETRIES=0 means retry forever
+            echo "❌ Max retries ($MAX_RETRIES) reached. Dependencies not available."
+            exit 1
+        fi
+
+        echo "   Attempt $attempt: DB=$( $db_ok && echo 'ok' || echo 'waiting' ), LLM=$( $llm_ok && echo 'ok' || echo 'waiting' )"
+        sleep "$RETRY_INTERVAL"
+        ((attempt++))  # (( )) returns non-zero only when the pre-increment value is 0; attempt starts at 1, so this is safe under `set -e`
+    done
 fi
 
 # Track PIDs for wait
diff --git a/hindsight-api/hindsight_api/config.py b/hindsight-api/hindsight_api/config.py
@@ -18,6 +18,7 @@
 ENV_LLM_BASE_URL = "HINDSIGHT_API_LLM_BASE_URL"
 ENV_LLM_MAX_CONCURRENT = "HINDSIGHT_API_LLM_MAX_CONCURRENT"
 ENV_LLM_TIMEOUT = "HINDSIGHT_API_LLM_TIMEOUT"
+ENV_LLM_STRIP_THINKING = "HINDSIGHT_API_LLM_STRIP_THINKING"
 
 ENV_EMBEDDINGS_PROVIDER = "HINDSIGHT_API_EMBEDDINGS_PROVIDER"
 ENV_EMBEDDINGS_LOCAL_MODEL = "HINDSIGHT_API_EMBEDDINGS_LOCAL_MODEL"
diff --git a/hindsight-api/hindsight_api/engine/llm_wrapper.py b/hindsight-api/hindsight_api/engine/llm_wrapper.py
@@ -6,6 +6,7 @@
 import json
 import logging
 import os
+import re
 import time
 from typing import Any
 
@@ -19,6 +20,7 @@
     DEFAULT_LLM_MAX_CONCURRENT,
     DEFAULT_LLM_TIMEOUT,
     ENV_LLM_MAX_CONCURRENT,
+    ENV_LLM_STRIP_THINKING,
     ENV_LLM_TIMEOUT,
 )
 
@@ -310,6 +312,20 @@ async def call(
 
                         content = response.choices[0].message.content
 
+                        # Strip reasoning model thinking tags when enabled (opt-in for local LLMs)
+                        # Supports: <think>, <thinking>, <reasoning>, |startthink|/|endthink|
+                        # Enable with HINDSIGHT_API_LLM_STRIP_THINKING=true for reasoning models
+                        # that embed thinking in their output (e.g., Qwen3, DeepSeek on LM Studio)
+                        if content and os.getenv(ENV_LLM_STRIP_THINKING, "false").lower() == "true":
+                            original_len = len(content)
+                            content = re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL)
+                            content = re.sub(r"<thinking>.*?</thinking>", "", content, flags=re.DOTALL)
+                            content = re.sub(r"<reasoning>.*?</reasoning>", "", content, flags=re.DOTALL)
+                            content = re.sub(r"\|startthink\|.*?\|endthink\|", "", content, flags=re.DOTALL)
+                            content = content.strip()
+                            if len(content) < original_len:
+                                logger.debug(f"Stripped {original_len - len(content)} chars of reasoning tokens")
+
                         # For local models, they may wrap JSON in markdown code blocks
                         if self.provider in ("lmstudio", "ollama"):
                             clean_content = content