Skip to content

Commit c658287

Browse files
committed
Fixes benchmark and regression reports.
Refactors benchmark scripts to improve baseline commit SHA resolution and report top regressions. Adds timeout functionality to example scripts, including support for `gtimeout` on macOS, and introduces an `EXAMPLE_TIMEOUT` environment variable. Updates tests and documentation accordingly.
1 parent 73e0585 commit c658287

8 files changed

Lines changed: 47 additions & 35 deletions

File tree

.github/workflows/benchmarks.yml

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -273,14 +273,8 @@ jobs:
273273
bc_sha="$(grep "^Git commit:" baseline-artifact/baseline_results.txt | awk '{print $3}' || true)"
274274
if [[ -z "$bc_sha" || ! "$bc_sha" =~ ^[0-9A-Fa-f]{7,40}$ ]]; then
275275
if [[ -f "baseline-artifact/metadata.json" ]]; then
276-
bc_sha="$(python3 - <<'PY'
277-
import json,sys
278-
try:
279-
print(json.load(open("baseline-artifact/metadata.json"))["commit"])
280-
except Exception:
281-
sys.exit(0)
282-
PY
283-
)"
276+
bc_sha="$(python3 -c 'import json,sys; p="baseline-artifact/metadata.json"; \
277+
d=json.load(open(p)); print(d.get("commit",""))' || true)"
284278
fi
285279
fi
286280
if [[ -n "$bc_sha" && "$bc_sha" =~ ^[0-9A-Fa-f]{7,40}$ ]]; then

WARP.md

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -134,10 +134,10 @@ uv run pytest
134134
uv run benchmark-utils generate-baseline
135135

136136
# Compare performance against baseline
137-
uv run benchmark-utils compare --baseline benches/baseline_results.txt
137+
uv run benchmark-utils compare --baseline baseline-artifact/baseline_results.txt
138138

139139
# Development mode (10x faster for iteration)
140-
uv run benchmark-utils compare --baseline benches/baseline_results.txt --dev
140+
uv run benchmark-utils compare --baseline baseline-artifact/baseline_results.txt --dev
141141
```
142142

143143
### Changelog Management
@@ -146,8 +146,9 @@ uv run benchmark-utils compare --baseline benches/baseline_results.txt --dev
146146
# Generate enhanced changelog with AI categorization
147147
uv run changelog-utils generate
148148

149-
# Create git tag with changelog content
150-
uv run changelog-utils tag v0.4.2
149+
# Create git tag with changelog content (user-only; WARP must not execute)
150+
# Run manually from your terminal:
151+
# uv run changelog-utils tag v0.4.2
151152
```
152153

153154
## Project Context

cspell.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@
9090
"getrandom",
9191
"golangci",
9292
"gsub",
93+
"gtimeout",
9394
"Guibas",
9495
"hashset",
9596
"htmlhint",

scripts/benchmark_utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -476,6 +476,10 @@ def _write_performance_comparison(self, f, current_results: list[BenchmarkData],
476476
f.write(f"Total benchmarks compared: {len(time_changes)}\n")
477477
f.write(f"Individual regressions (>{self.regression_threshold}%): {individual_regressions}\n")
478478
f.write(f"Average time change: {average_change:.1f}%\n")
479+
# Optional: top regressions
480+
top = sorted(time_changes, reverse=True)[:5]
481+
if top:
482+
f.write("Top regressions (by time change %): " + ", ".join(f"{t:.1f}%" for t in top) + "\n")
479483

480484
average_regression_found = average_change > self.regression_threshold
481485
if average_regression_found:

scripts/run_all_examples.sh

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ NOTES:
3838
- Examples are discovered automatically from the examples/ directory
3939
- Output is shown in real-time as examples execute
4040
- Script exits with error code if any example fails
41+
- Set EXAMPLE_TIMEOUT (seconds, default 600) to bound per-example runtime
42+
- On macOS, install coreutils and ensure gtimeout is available (auto-detected)
4143
4244
SEE ALSO:
4345
examples/README.md - Detailed documentation for each example
@@ -124,15 +126,22 @@ if [ ${#all_examples[@]} -eq 0 ]; then
124126
fi
125127

126128
# Run all examples
129+
TIMEOUT_CMD=""
130+
if command -v timeout >/dev/null 2>&1; then
131+
TIMEOUT_CMD="timeout"
132+
elif command -v gtimeout >/dev/null 2>&1; then
133+
TIMEOUT_CMD="gtimeout"
134+
fi
135+
127136
for example in "${all_examples[@]}"; do
128-
echo "=== Running $example ==="
129-
if command -v timeout >/dev/null 2>&1; then
130-
timeout "${EXAMPLE_TIMEOUT:-600}" cargo run --release --example "$example" ||
131-
error_exit "Example $example failed!"
132-
else
133-
cargo run --release --example "$example" ||
134-
error_exit "Example $example failed!"
135-
fi
137+
echo "=== Running $example ==="
138+
if [[ -n "$TIMEOUT_CMD" ]]; then
139+
"$TIMEOUT_CMD" "${EXAMPLE_TIMEOUT:-600}" cargo run --release --example "$example" ||
140+
error_exit "Example $example failed!"
141+
else
142+
cargo run --release --example "$example" ||
143+
error_exit "Example $example failed!"
144+
fi
136145
done
137146

138147
echo

scripts/tests/test_benchmark_utils.py

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -311,15 +311,15 @@ def test_write_performance_comparison_no_average_regression(self, comparator):
311311
output = StringIO()
312312
regression_found = comparator._write_performance_comparison(output, current_results, baseline_results)
313313

314-
# Average change should be: (20 + (-2) + (-15)) / 3 = 1%
314+
# Average change using geometric mean: ~0.0%
315315
# This is less than 5% threshold, so no overall regression
316316
assert not regression_found
317317

318318
result = output.getvalue()
319319
assert "SUMMARY" in result
320320
assert "Total benchmarks compared: 3" in result
321321
assert "Individual regressions (>5.0%): 1" in result # Only the +20% one
322-
assert "Average time change: 1.0%" in result
322+
assert "Average time change: -0.0%" in result
323323
assert "✅ OVERALL OK" in result
324324

325325
def test_write_performance_comparison_with_average_regression(self, comparator):
@@ -344,15 +344,15 @@ def test_write_performance_comparison_with_average_regression(self, comparator):
344344
output = StringIO()
345345
regression_found = comparator._write_performance_comparison(output, current_results, baseline_results)
346346

347-
# Average change should be: (10 + 8 + (-1)) / 3 = 5.67%
347+
# Average change using geometric mean: 5.6%
348348
# This exceeds 5% threshold, so overall regression found
349349
assert regression_found
350350

351351
result = output.getvalue()
352352
assert "SUMMARY" in result
353353
assert "Total benchmarks compared: 3" in result
354354
assert "Individual regressions (>5.0%): 2" in result # The +10% and +8% ones
355-
assert "Average time change: 5.7%" in result
355+
assert "Average time change: 5.6%" in result
356356
assert "🚨 OVERALL REGRESSION" in result
357357

358358
def test_write_performance_comparison_with_average_improvement(self, comparator):
@@ -377,15 +377,15 @@ def test_write_performance_comparison_with_average_improvement(self, comparator)
377377
output = StringIO()
378378
regression_found = comparator._write_performance_comparison(output, current_results, baseline_results)
379379

380-
# Average change should be: (-10 + (-8) + 2) / 3 = -5.33%
380+
# Average change using geometric mean: -5.5%
381381
# This is significant improvement, so no regression found
382382
assert not regression_found
383383

384384
result = output.getvalue()
385385
assert "SUMMARY" in result
386386
assert "Total benchmarks compared: 3" in result
387387
assert "Individual regressions (>5.0%): 0" in result
388-
assert "Average time change: -5.3%" in result
388+
assert "Average time change: -5.5%" in result
389389
assert "🎉 OVERALL IMPROVEMENT" in result
390390

391391
def test_write_performance_comparison_missing_baseline(self, comparator):
@@ -501,14 +501,14 @@ def test_realistic_mixed_performance_scenario(self, comparator):
501501
output = StringIO()
502502
regression_found = comparator._write_performance_comparison(output, current_results, baseline_results)
503503

504-
# Average change: (3 + 7 + (-2) + (-12) + 4) / 5 = 0%
504+
# Average change using geometric mean: -0.2%
505505
# No overall regression should be detected
506506
assert not regression_found
507507

508508
result = output.getvalue()
509509
assert "Total benchmarks compared: 5" in result
510510
assert "Individual regressions (>5.0%): 1" in result # Only the 7% one
511-
assert "Average time change: 0.0%" in result
511+
assert "Average time change: -0.2%" in result
512512
assert "✅ OVERALL OK" in result
513513

514514
def test_gradual_performance_degradation_scenario(self, comparator):
@@ -571,15 +571,15 @@ def test_noisy_benchmarks_scenario(self, comparator):
571571
output = StringIO()
572572
regression_found = comparator._write_performance_comparison(output, current_results, baseline_results)
573573

574-
# Average change: (2 + (-4) + 3 + 40 + (-10)) / 5 = 6.2%
575-
# Despite the one big outlier, overall regression should be detected
576-
assert regression_found
574+
# Average change using geometric mean: 4.9%
575+
# Despite the one big outlier, no overall regression should be detected (4.9% < 5.0% threshold)
576+
assert not regression_found
577577

578578
result = output.getvalue()
579579
assert "Total benchmarks compared: 5" in result
580580
assert "Individual regressions (>5.0%): 1" in result # Only the 40% outlier
581-
assert "Average time change: 6.2%" in result
582-
assert "🚨 OVERALL REGRESSION" in result
581+
assert "Average time change: 4.9%" in result
582+
assert " OVERALL OK" in result
583583

584584

585585
class TestEdgeCases:
@@ -652,6 +652,9 @@ def test_mixed_valid_invalid_baselines(self, comparator):
652652
output = StringIO()
653653
regression_found = comparator._write_performance_comparison(output, current_results, baseline_results)
654654

655+
# Should find regression due to the 10% change in the valid comparison
656+
assert regression_found
657+
655658
result = output.getvalue()
656659
assert "Total benchmarks compared: 1" in result # Only one valid comparison
657660
assert "N/A (baseline mean is 0)" in result

scripts/tests/test_hardware_utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -539,7 +539,7 @@ def test_extract_memory_value(self):
539539
with self.subTest(memory_str=memory_str):
540540
result = HardwareComparator._extract_memory_value(memory_str) # noqa: SLF001
541541
if expected is None:
542-
assert result == expected
542+
assert result is None
543543
else:
544544
assert result == pytest.approx(expected, abs=1e-9)
545545

tests/circumsphere_debug_tools.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
//! ```bash
1111
//! cargo test --test circumsphere_debug_tools test_2d_circumsphere_debug -- --nocapture
1212
//! cargo test --test circumsphere_debug_tools test_3d_circumsphere_debug -- --nocapture
13-
//! cargo test --test circumsphere_debug_tools test_all_debug -- --nocapture
13+
//! cargo test --test circumsphere_debug_tools test_all_debug -- --ignored --nocapture
1414
//! ```
1515
1616
use delaunay::geometry::util::squared_norm;

0 commit comments

Comments (0)