Skip to content

Commit c658287

Browse files
committed
Fixes benchmark and regression reports.
Refactors benchmark scripts to improve baseline commit SHA resolution and report top regressions. Adds timeout functionality to example scripts, including support for `gtimeout` on macOS, and introduces an `EXAMPLE_TIMEOUT` environment variable. Updates tests and documentation accordingly.
1 parent 73e0585 commit c658287

8 files changed

Lines changed: 47 additions & 35 deletions

File tree

.github/workflows/benchmarks.yml

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -273,14 +273,8 @@ jobs:
273273
bc_sha="$(grep "^Git commit:" baseline-artifact/baseline_results.txt | awk '{print $3}' || true)"
274274
if [[ -z "$bc_sha" || ! "$bc_sha" =~ ^[0-9A-Fa-f]{7,40}$ ]]; then
275275
if [[ -f "baseline-artifact/metadata.json" ]]; then
276-
bc_sha="$(python3 - <<'PY'
277-
import json,sys
278-
try:
279-
print(json.load(open("baseline-artifact/metadata.json"))["commit"])
280-
except Exception:
281-
sys.exit(0)
282-
PY
283-
)"
276+
bc_sha="$(python3 -c 'import json,sys; p="baseline-artifact/metadata.json"; \
277+
d=json.load(open(p)); print(d.get("commit",""))' || true)"
284278
fi
285279
fi
286280
if [[ -n "$bc_sha" && "$bc_sha" =~ ^[0-9A-Fa-f]{7,40}$ ]]; then

WARP.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,10 +134,10 @@ uv run pytest
134134
uv run benchmark-utils generate-baseline
135135

136136
# Compare performance against baseline
137-
uv run benchmark-utils compare --baseline benches/baseline_results.txt
137+
uv run benchmark-utils compare --baseline baseline-artifact/baseline_results.txt
138138

139139
# Development mode (10x faster for iteration)
140-
uv run benchmark-utils compare --baseline benches/baseline_results.txt --dev
140+
uv run benchmark-utils compare --baseline baseline-artifact/baseline_results.txt --dev
141141
```
142142

143143
### Changelog Management
@@ -146,8 +146,9 @@ uv run benchmark-utils compare --baseline benches/baseline_results.txt --dev
146146
# Generate enhanced changelog with AI categorization
147147
uv run changelog-utils generate
148148

149-
# Create git tag with changelog content
150-
uv run changelog-utils tag v0.4.2
149+
# Create git tag with changelog content (user-only; WARP must not execute)
150+
# Run manually from your terminal:
151+
# uv run changelog-utils tag v0.4.2
151152
```
152153

153154
## Project Context

cspell.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@
9090
"getrandom",
9191
"golangci",
9292
"gsub",
93+
"gtimeout",
9394
"Guibas",
9495
"hashset",
9596
"htmlhint",

scripts/benchmark_utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -476,6 +476,10 @@ def _write_performance_comparison(self, f, current_results: list[BenchmarkData],
476476
f.write(f"Total benchmarks compared: {len(time_changes)}\n")
477477
f.write(f"Individual regressions (>{self.regression_threshold}%): {individual_regressions}\n")
478478
f.write(f"Average time change: {average_change:.1f}%\n")
479+
# Optional: top regressions
480+
top = sorted(time_changes, reverse=True)[:5]
481+
if top:
482+
f.write("Top regressions (by time change %): " + ", ".join(f"{t:.1f}%" for t in top) + "\n")
479483

480484
average_regression_found = average_change > self.regression_threshold
481485
if average_regression_found:

scripts/run_all_examples.sh

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ NOTES:
3838
- Examples are discovered automatically from the examples/ directory
3939
- Output is shown in real-time as examples execute
4040
- Script exits with error code if any example fails
41+
- Set EXAMPLE_TIMEOUT (seconds, default 600) to bound per-example runtime
42+
- On macOS, install coreutils and ensure gtimeout is available (auto-detected)
4143
4244
SEE ALSO:
4345
examples/README.md - Detailed documentation for each example
@@ -124,15 +126,22 @@ if [ ${#all_examples[@]} -eq 0 ]; then
124126
fi
125127

126128
# Run all examples
129+
TIMEOUT_CMD=""
130+
if command -v timeout >/dev/null 2>&1; then
131+
TIMEOUT_CMD="timeout"
132+
elif command -v gtimeout >/dev/null 2>&1; then
133+
TIMEOUT_CMD="gtimeout"
134+
fi
135+
127136
for example in "${all_examples[@]}"; do
128-
echo "=== Running $example ==="
129-
if command -v timeout >/dev/null 2>&1; then
130-
timeout "${EXAMPLE_TIMEOUT:-600}" cargo run --release --example "$example" ||
131-
error_exit "Example $example failed!"
132-
else
133-
cargo run --release --example "$example" ||
134-
error_exit "Example $example failed!"
135-
fi
137+
echo "=== Running $example ==="
138+
if [[ -n "$TIMEOUT_CMD" ]]; then
139+
"$TIMEOUT_CMD" "${EXAMPLE_TIMEOUT:-600}" cargo run --release --example "$example" ||
140+
error_exit "Example $example failed!"
141+
else
142+
cargo run --release --example "$example" ||
143+
error_exit "Example $example failed!"
144+
fi
136145
done
137146

138147
echo

scripts/tests/test_benchmark_utils.py

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -311,15 +311,15 @@ def test_write_performance_comparison_no_average_regression(self, comparator):
311311
output = StringIO()
312312
regression_found = comparator._write_performance_comparison(output, current_results, baseline_results)
313313

314-
# Average change should be: (20 + (-2) + (-15)) / 3 = 1%
314+
# Average change using geometric mean: ~0.0%
315315
# This is less than 5% threshold, so no overall regression
316316
assert not regression_found
317317

318318
result = output.getvalue()
319319
assert "SUMMARY" in result
320320
assert "Total benchmarks compared: 3" in result
321321
assert "Individual regressions (>5.0%): 1" in result # Only the +20% one
322-
assert "Average time change: 1.0%" in result
322+
assert "Average time change: -0.0%" in result
323323
assert "✅ OVERALL OK" in result
324324

325325
def test_write_performance_comparison_with_average_regression(self, comparator):
@@ -344,15 +344,15 @@ def test_write_performance_comparison_with_average_regression(self, comparator):
344344
output = StringIO()
345345
regression_found = comparator._write_performance_comparison(output, current_results, baseline_results)
346346

347-
# Average change should be: (10 + 8 + (-1)) / 3 = 5.67%
347+
# Average change using geometric mean: 5.6%
348348
# This exceeds 5% threshold, so overall regression found
349349
assert regression_found
350350

351351
result = output.getvalue()
352352
assert "SUMMARY" in result
353353
assert "Total benchmarks compared: 3" in result
354354
assert "Individual regressions (>5.0%): 2" in result # The +10% and +8% ones
355-
assert "Average time change: 5.7%" in result
355+
assert "Average time change: 5.6%" in result
356356
assert "🚨 OVERALL REGRESSION" in result
357357

358358
def test_write_performance_comparison_with_average_improvement(self, comparator):
@@ -377,15 +377,15 @@ def test_write_performance_comparison_with_average_improvement(self, comparator)
377377
output = StringIO()
378378
regression_found = comparator._write_performance_comparison(output, current_results, baseline_results)
379379

380-
# Average change should be: (-10 + (-8) + 2) / 3 = -5.33%
380+
# Average change using geometric mean: -5.5%
381381
# This is significant improvement, so no regression found
382382
assert not regression_found
383383

384384
result = output.getvalue()
385385
assert "SUMMARY" in result
386386
assert "Total benchmarks compared: 3" in result
387387
assert "Individual regressions (>5.0%): 0" in result
388-
assert "Average time change: -5.3%" in result
388+
assert "Average time change: -5.5%" in result
389389
assert "🎉 OVERALL IMPROVEMENT" in result
390390

391391
def test_write_performance_comparison_missing_baseline(self, comparator):
@@ -501,14 +501,14 @@ def test_realistic_mixed_performance_scenario(self, comparator):
501501
output = StringIO()
502502
regression_found = comparator._write_performance_comparison(output, current_results, baseline_results)
503503

504-
# Average change: (3 + 7 + (-2) + (-12) + 4) / 5 = 0%
504+
# Average change using geometric mean: -0.2%
505505
# No overall regression should be detected
506506
assert not regression_found
507507

508508
result = output.getvalue()
509509
assert "Total benchmarks compared: 5" in result
510510
assert "Individual regressions (>5.0%): 1" in result # Only the 7% one
511-
assert "Average time change: 0.0%" in result
511+
assert "Average time change: -0.2%" in result
512512
assert "✅ OVERALL OK" in result
513513

514514
def test_gradual_performance_degradation_scenario(self, comparator):
@@ -571,15 +571,15 @@ def test_noisy_benchmarks_scenario(self, comparator):
571571
output = StringIO()
572572
regression_found = comparator._write_performance_comparison(output, current_results, baseline_results)
573573

574-
# Average change: (2 + (-4) + 3 + 40 + (-10)) / 5 = 6.2%
575-
# Despite the one big outlier, overall regression should be detected
576-
assert regression_found
574+
# Average change using geometric mean: 4.9%
575+
# Despite the one big outlier, no overall regression should be detected (4.9% < 5.0% threshold)
576+
assert not regression_found
577577

578578
result = output.getvalue()
579579
assert "Total benchmarks compared: 5" in result
580580
assert "Individual regressions (>5.0%): 1" in result # Only the 40% outlier
581-
assert "Average time change: 6.2%" in result
582-
assert "🚨 OVERALL REGRESSION" in result
581+
assert "Average time change: 4.9%" in result
582+
assert " OVERALL OK" in result
583583

584584

585585
class TestEdgeCases:
@@ -652,6 +652,9 @@ def test_mixed_valid_invalid_baselines(self, comparator):
652652
output = StringIO()
653653
regression_found = comparator._write_performance_comparison(output, current_results, baseline_results)
654654

655+
# Should find regression due to the 10% change in the valid comparison
656+
assert regression_found
657+
655658
result = output.getvalue()
656659
assert "Total benchmarks compared: 1" in result # Only one valid comparison
657660
assert "N/A (baseline mean is 0)" in result

scripts/tests/test_hardware_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -539,7 +539,7 @@ def test_extract_memory_value(self):
539539
with self.subTest(memory_str=memory_str):
540540
result = HardwareComparator._extract_memory_value(memory_str) # noqa: SLF001
541541
if expected is None:
542-
assert result == expected
542+
assert result is None
543543
else:
544544
assert result == pytest.approx(expected, abs=1e-9)
545545

tests/circumsphere_debug_tools.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
//! ```bash
1111
//! cargo test --test circumsphere_debug_tools test_2d_circumsphere_debug -- --nocapture
1212
//! cargo test --test circumsphere_debug_tools test_3d_circumsphere_debug -- --nocapture
13-
//! cargo test --test circumsphere_debug_tools test_all_debug -- --nocapture
13+
//! cargo test --test circumsphere_debug_tools test_all_debug -- --ignored --nocapture
1414
//! ```
1515
1616
use delaunay::geometry::util::squared_norm;

0 commit comments

Comments (0)