diff --git a/.github/workflows/branch.yaml b/.github/workflows/branch.yaml index 92133de0..e984b0b8 100644 --- a/.github/workflows/branch.yaml +++ b/.github/workflows/branch.yaml @@ -14,6 +14,8 @@ jobs: SLACK_BOT: ${{ secrets.SLACK_BOT }} NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }} GOOGLE_CONTAINER_ID: ${{ secrets.GOOGLE_CONTAINER_ID }} + WEAVIATE_VIBE_EVAL_URL: ${{ secrets.WEAVIATE_VIBE_EVAL_URL }} + WEAVIATE_VIBE_EVAL_KEY: ${{ secrets.WEAVIATE_VIBE_EVAL_KEY }} steps: - uses: actions/checkout@v4 - uses: actions/setup-node@v4 diff --git a/docs/weaviate/benchmarks/index.md b/docs/weaviate/benchmarks/index.md index b9feaad6..5fedeb8c 100644 --- a/docs/weaviate/benchmarks/index.md +++ b/docs/weaviate/benchmarks/index.md @@ -7,12 +7,13 @@ image: og/docs/benchmarks.jpg --- -You can find the following vector database performance benchmarks: +You can find the following benchmarks: 1. [ANN (unfiltered vector search) latencies and throughput](./ann.md) -2. Filtered ANN (benchmark coming soon) -2. Scalar filters / Inverted Index (benchmark coming soon) -3. Large-scale ANN (benchmark coming soon) +2. [LLM Weaviate code generation](./vibe-coding-evaluation.mdx) — how well LLMs generate correct Weaviate v4 Python client code +3. Filtered ANN (benchmark coming soon) +4. Scalar filters / Inverted Index (benchmark coming soon) +5. Large-scale ANN (benchmark coming soon) ## Benchmark code diff --git a/docs/weaviate/benchmarks/vibe-coding-evaluation.mdx b/docs/weaviate/benchmarks/vibe-coding-evaluation.mdx new file mode 100644 index 00000000..2af13159 --- /dev/null +++ b/docs/weaviate/benchmarks/vibe-coding-evaluation.mdx @@ -0,0 +1,73 @@ +--- +title: LLM Weaviate Code Generation Benchmark +sidebar_position: 2 +description: "Benchmark evaluating how well LLMs generate correct Weaviate v4 Python client code across zero-shot and few-shot scenarios." 
+--- + +import VibeEvalDashboard from "@site/src/components/VibeEvalDashboard"; + +This benchmark evaluates how well large language models (LLMs) generate **working Weaviate v4 Python client code** when given natural language task descriptions. It measures whether an LLM can produce code that actually connects to a Weaviate cluster and performs the requested operation without errors. + +## Results + +<VibeEvalDashboard /> + +## What is being tested + +Each LLM is prompted to generate Python code for a specific Weaviate operation. The generated code is then executed inside a Docker container against a real Weaviate Cloud cluster. A task **passes** if the code runs with exit code 0, and **fails** otherwise. + +The benchmark covers these operations: + +| Task | What it tests | +| ------------------------- | ------------------------------------------------------------------- | +| **connect** | Connecting to a Weaviate Cloud instance and verifying readiness | +| **create_collection** | Creating a collection with typed properties (text, number, boolean) | +| **batch_import** | Batch importing 50 objects into a collection | +| **basic_semantic_search** | Running a `near_text` semantic search query | +| **complex_hybrid_query** | Hybrid search with filters, metadata, and multiple conditions | + +### Task variants + +Each task is run in multiple variants to measure the effect of providing examples: + +- **Zero-shot** — The LLM receives only the task description with no code examples +- **Simple example** — The LLM receives one concise code example alongside the task +- **Extensive examples** — The LLM receives full API documentation as in-context examples + +This lets you see how much a model improves when given reference code versus relying purely on its training data. + +## How to interpret the results + +- **Pass rate** is the primary metric — the percentage of tasks where the generated code executed successfully. 
A higher pass rate means the model produces more reliable Weaviate client code. +- **Avg duration** includes both the LLM generation time and the Docker execution time. It's useful for comparing relative speed but not absolute latency, since it depends on API response times. +- **Similarity score** (1–5, when available) is an LLM-judged comparison of the generated code against a canonical implementation, focusing on correct Weaviate API usage rather than general code style. + +### What a failure means + +A failure means the generated code threw a Python exception or returned a non-zero exit code. Common causes include: + +- Using deprecated v3 client syntax instead of the current v4 API +- Incorrect method names, parameter names, or import paths +- Missing authentication setup or wrong connection patterns +- Hallucinated API methods that don't exist in the Weaviate client + +The **Task Breakdown** tab shows per-task results. When LLM judge analysis is enabled, you can expand failed tasks to see the diagnosed root cause and suggested fix. + +### Limitations + +- Results reflect a point in time. LLM providers update their models, and results may change between runs. +- The benchmark uses `temperature=0.1` for near-deterministic output, but some variance is expected. When multiple repetitions are run, the pass rate accounts for this. +- Tasks test the Weaviate Python v4 client specifically. Results don't generalize to other Weaviate clients (TypeScript, Go, Java) or other database APIs. +- Pass/fail is binary based on exit code. A task can pass with suboptimal code or fail due to a minor syntax issue. + +## How the benchmark is generated + +The benchmark is run monthly via a [GitHub Actions workflow](https://github.com/weaviate-tutorials/weaviate-vibe-eval) and can also be triggered manually. The process is: + +1. Each model is prompted with each task variant +2. Python code is extracted from the LLM response +3. 
The code is executed in a sandboxed Docker container with network access to a Weaviate Cloud cluster +4. Results (pass/fail, duration, generated code, stdout/stderr) are stored in a remote Weaviate cluster +5. During the docs build, results are fetched and rendered in the dashboard in the Results section above + +The benchmark source code, task definitions, and full methodology are available at [github.com/weaviate-tutorials/weaviate-vibe-eval](https://github.com/weaviate-tutorials/weaviate-vibe-eval). diff --git a/package.json b/package.json index 5952f11b..d15f583e 100644 --- a/package.json +++ b/package.json @@ -6,7 +6,8 @@ "scripts": { "docusaurus": "docusaurus", "start": "docusaurus start", - "build": "docusaurus build", + "fetch-vibe-eval": "node tools/fetch-vibe-eval-results.js", + "build": "npm run fetch-vibe-eval; docusaurus build", "build-dev": "docusaurus build --config docusaurus.dev.config.js --out-dir build.dev", "validate-links-dev": "node ./_build_scripts/validate-links-pr.js", "swizzle": "docusaurus swizzle", diff --git a/sidebars.js b/sidebars.js index ffaee594..1896142a 100644 --- a/sidebars.js +++ b/sidebars.js @@ -878,7 +878,7 @@ const sidebars = { type: "doc", id: "weaviate/benchmarks/index", }, - items: ["weaviate/benchmarks/ann"], + items: ["weaviate/benchmarks/ann", "weaviate/benchmarks/vibe-coding-evaluation"], }, { type: "category", diff --git a/src/components/VibeEvalDashboard/data.json b/src/components/VibeEvalDashboard/data.json new file mode 100644 index 00000000..f01e0ec4 --- /dev/null +++ b/src/components/VibeEvalDashboard/data.json @@ -0,0 +1,555 @@ +{ + "generated_at": "2026-03-21T06:36:54.675Z", + "latest_run_id": "20260320_193338_b4fa11", + "leaderboard": [ + { + "model_id": "gemini/gemini-2.5-flash", + "provider": "gemini", + "passed": 
8, + "total": 14, + "total_duration": 81.49544954299927, + "pass_rate": 0.5714285714285714, + "avg_duration": 5.82 + }, + { + "model_id": "openai/gpt-5-mini-2025-08-07", + "provider": "openai", + "passed": 4, + "total": 14, + "total_duration": 259.4072422981262, + "pass_rate": 0.2857142857142857, + "avg_duration": 18.53 + } + ], + "task_breakdown": [ + { + "model_id": "anthropic/claude-haiku-4-5-20251001", + "task_id": "extensive_examples_complex_hybrid_query", + "variant": "extensive_examples_complex_hybrid", + "provider": "anthropic", + "success": false, + "duration": 7.983539819717407, + "similarity_score": 2, + "failure_analysis": "The generated code fails because it passes `where=where_filter` as a keyword argument to `collection.query.hybrid()`. In the current Weaviate Python client v4, the correct parameter name for filters in query methods is `filters=`, not `where=`. The `where` parameter was used in older versions of the Weaviate client (v3/legacy), and has been renamed to `filters` in v4. The TypeError message \"got an unexpected keyword argument 'where'\" confirms this mismatch.", + "root_cause": "The generated code uses the deprecated/incorrect parameter name `where=` instead of `filters=` for the hybrid query method in Weaviate client v4.", + "suggested_fix": "Change `where=where_filter` to `filters=where_filter` in the `collection.query.hybrid()` call, and also incorporate the `review_body` substring check into the filter using `Filter.by_property(\"review_body\").contains_any([\"spice\", \"fruit\", \"berry\", \"cherry\", \"honey\"])` combined with `&` into the existing filter chain, rather than doing it as a post-processing Python step. Also add `from weaviate.classes.query import MetadataQuery` and pass `return_metadata=MetadataQuery(score=True)` to the hybrid call." 
+ }, + { + "model_id": "openai/gpt-5-mini-2025-08-07", + "task_id": "zero_shot_basic_semantic_search", + "variant": "zero_shot_basic_semantic", + "provider": "openai", + "success": false, + "duration": 20.757816076278687, + "similarity_score": 1, + "failure_analysis": "The generated code attempts to instantiate `weaviate.Client(url=..., auth_client_secret=..., additional_headers=...)`, which is the v3 client constructor. In the installed version of the weaviate-client package (v4), this constructor signature no longer accepts these keyword arguments, causing a `TypeError: Client.__init__() got an unexpected keyword argument 'url'`. The entire v3 client interface has been removed in v4.", + "root_cause": "The generated code uses the removed Weaviate Python client v3 `weaviate.Client(url=...)` constructor, which no longer exists in the installed v4 client library.", + "suggested_fix": "Replace the client initialization with `weaviate.connect_to_weaviate_cloud(cluster_url=WCD_URL, auth_credentials=Auth.api_key(WCD_KEY), headers={\"X-OpenAI-API-Key\": OPENAI_KEY})` (importing `Auth` from `weaviate.classes.init`), and replace the v3 query chain with `client.collections.get(\"WineReview\").query.near_text(query=\"dessert wine\", limit=2)`, then iterate over `response.objects` accessing `obj.properties` directly, and call `client.close()` at the end." + }, + { + "model_id": "openai/gpt-5-mini-2025-08-07", + "task_id": "zero_shot_complex_hybrid_query", + "variant": "zero_shot_complex_hybrid", + "provider": "openai", + "success": false, + "duration": 23.932209968566895, + "similarity_score": 1, + "failure_analysis": "The generated code is completely empty (no code at all), so there is nothing to execute. The execution failed because there was no Python code to run. 
The canonical implementation uses `weaviate.connect_to_weaviate_cloud()` with proper authentication, performs a hybrid query with filters using `Filter.by_property()` chained with `&` operators, and uses `MetadataQuery(score=True)` to retrieve scores. None of this exists in the generated code.", + "root_cause": "The generated code is entirely empty - no implementation was provided.", + "suggested_fix": "Implement the full solution by adding: client initialization with `weaviate.connect_to_weaviate_cloud()` using `Auth.api_key()` and OpenAI headers, collection access via `client.collections.get(\"WineReview\")`, a hybrid query using `wine_reviews.query.hybrid()` with `query`, `limit`, `alpha`, `filters` (using `Filter.by_property()` with `&` operators), and `return_metadata=MetadataQuery(score=True)`, followed by iterating over `response.objects` to print results, and closing the client with `client.close()`." + }, + { + "model_id": "openai/gpt-5-mini-2025-08-07", + "task_id": "simple_example_basic_semantic_search", + "variant": "simple_example_basic_semantic", + "provider": "openai", + "success": false, + "duration": 18.650628805160522, + "similarity_score": 1, + "failure_analysis": "The generated code attempts to instantiate `weaviate.Client(url=..., ...)` which was the v3 client constructor. In the installed version of the weaviate-client package (v4), this constructor no longer accepts the `url` keyword argument and raises a `TypeError`. 
The v3 client has been fully removed from the v4 package, so none of the v3 patterns (including `weaviate.AuthApiKey`, `client.query.get()`, `.with_near_text()`, `.do()`) are available.", + "root_cause": "The generated code uses the removed Weaviate Python client v3 `weaviate.Client(url=...)` constructor, which no longer exists in the installed v4 client library.", + "suggested_fix": "Replace the client initialization with `weaviate.connect_to_weaviate_cloud(cluster_url=WCD_TEST_URL, auth_credentials=Auth.api_key(WCD_TEST_KEY), headers={\"X-OpenAI-API-Key\": OPENAI_API_KEY})` (importing `Auth` from `weaviate.classes.init`), then replace the query with `client.collections.get(\"WineReview\").query.near_text(query=\"dessert wine\", limit=2)` and iterate over `response.objects`, accessing properties via `obj.properties[\"title\"]` etc." + }, + { + "model_id": "anthropic/claude-haiku-4-5-20251001", + "task_id": "simple_example_basic_semantic_search", + "variant": "simple_example_basic_semantic", + "provider": "anthropic", + "success": true, + "duration": 4.9738218784332275, + "similarity_score": 5, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "anthropic/claude-haiku-4-5-20251001", + "task_id": "extensive_examples_create_collection", + "variant": "extensive_examples_create", + "provider": "anthropic", + "success": true, + "duration": 5.3912341594696045, + "similarity_score": 5, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "openai/gpt-5-mini-2025-08-07", + "task_id": "simple_example_connect", + "variant": "simple_example", + "provider": "openai", + "success": false, + "duration": 17.100103855133057, + "similarity_score": 1, + "failure_analysis": "The generated code attempts to instantiate `weaviate.Client(url=..., auth_client_secret=...)`, which is the v3 client API. In the installed version of the weaviate-client package (v4), the v3 `Client` class has been removed entirely. 
The error `TypeError: Client.__init__() got an unexpected keyword argument 'url'` confirms that the v3-style constructor no longer accepts these parameters. Additionally, importing `AuthApiKey` directly from the `weaviate` root module triggers a deprecation warning, indicating it should be imported from `weaviate.auth`.", + "root_cause": "The generated code uses the removed Weaviate Python client v3 `weaviate.Client(...)` constructor instead of the v4 `weaviate.connect_to_weaviate_cloud(...)` function.", + "suggested_fix": "Replace `weaviate.Client(url=url, auth_client_secret=weaviate.AuthApiKey(api_key=api_key))` with `weaviate.connect_to_weaviate_cloud(cluster_url=url, auth_credentials=Auth.api_key(api_key))`, and add `from weaviate.classes.init import Auth` to the imports." + }, + { + "model_id": "gemini/gemini-2.5-flash", + "task_id": "zero_shot_batch_import", + "variant": "zero_shot_batch", + "provider": "gemini", + "success": false, + "duration": 9.043149709701538, + "similarity_score": 1, + "failure_analysis": "The generated output begins with a plain English explanation (\"Here's the Python code to connect to Weaviate Cloud...\") placed directly in the file as if it were code, outside of any string literal or comment. Python's parser attempts to interpret this as a statement and immediately fails with a `SyntaxError: unterminated string literal` on line 1 (specifically at the apostrophe in \"Here's\"). The actual Python code block was wrapped in markdown triple-backtick fences, which are also not valid Python syntax. As a result, no Weaviate API code was ever executed.", + "root_cause": "The generated response included markdown prose and code fences as literal file content instead of outputting only valid Python code.", + "suggested_fix": "Remove all prose text and markdown fences from the file so that the file starts directly with `import os` and `import weaviate`. 
Then complete the implementation to match the canonical: use `weaviate.connect_to_weaviate_cloud(cluster_url=WCD_URL, auth_credentials=Auth.api_key(WCD_KEY))`, get the collection with `client.collections.get(\"DemoProducts\")`, and batch import using `with products.batch.dynamic() as batch:` with `batch.add_object(properties=data_row)` inside the loop." + }, + { + "model_id": "gemini/gemini-2.5-flash", + "task_id": "zero_shot_connect", + "variant": "zero_shot", + "provider": "gemini", + "success": true, + "duration": 9.942929983139038, + "similarity_score": 2, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "anthropic/claude-haiku-4-5-20251001", + "task_id": "zero_shot_connect", + "variant": "zero_shot", + "provider": "anthropic", + "success": false, + "duration": 3.0586721897125244, + "similarity_score": 1, + "failure_analysis": "The generated code attempts to use `ConnectionParams.from_url(wcd_url)` with only one argument, but this method requires two positional arguments: the URL and a `grpc_port`. This reflects an incorrect understanding of the Weaviate v4 Python client API. The entire approach of using `Client` with `ConnectionParams` is not the correct way to connect to Weaviate Cloud in the v4 client — the canonical approach uses the higher-level `weaviate.connect_to_weaviate_cloud()` helper function which handles gRPC configuration internally.", + "root_cause": "The generated code uses an incorrect low-level API (`Client` + `ConnectionParams.from_url()`) instead of the high-level `weaviate.connect_to_weaviate_cloud()` function, and `ConnectionParams.from_url()` requires a `grpc_port` argument that was not provided.", + "suggested_fix": "Replace the entire client initialization block with `import weaviate` and `from weaviate.classes.init import Auth`, then use `client = weaviate.connect_to_weaviate_cloud(cluster_url=wcd_url, auth_credentials=Auth.api_key(wcd_key))`, and add `client.close()` at the end." 
+ }, + { + "model_id": "openai/gpt-5-mini-2025-08-07", + "task_id": "extensive_examples_batch_import", + "variant": "extensive_examples_batch", + "provider": "openai", + "success": true, + "duration": 17.368285179138184, + "similarity_score": 2, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "anthropic/claude-haiku-4-5-20251001", + "task_id": "simple_example_batch_import", + "variant": "simple_example_batch", + "provider": "anthropic", + "success": false, + "duration": 6.198858022689819, + "similarity_score": 2, + "failure_analysis": "The code fails at line 25 with `AttributeError: module 'weaviate.classes' has no attribute 'Property'`. The generated code tries to use `weaviate.classes.Property` and `weaviate.classes.DataType.TEXT` to define collection properties, but these classes are not directly accessible under `weaviate.classes`. In the modern Weaviate Python client (v4), `Property` and `DataType` live under `weaviate.classes.config`, not `weaviate.classes`. The canonical implementation avoids this entirely by not creating a collection at all — it just calls `client.collections.get(\"DemoProducts\")` assuming the collection already exists.", + "root_cause": "Incorrect module path `weaviate.classes.Property` and `weaviate.classes.DataType` — these attributes do not exist directly on `weaviate.classes`; they should be accessed from `weaviate.classes.config`.", + "suggested_fix": "Remove the collection creation block entirely and replace it with `collection = client.collections.get(\"DemoProducts\")` as done in the canonical implementation. If collection creation is needed, change `weaviate.classes.Property` to `weaviate.classes.config.Property` and `weaviate.classes.DataType.TEXT` to `weaviate.classes.config.DataType.TEXT`, and add `from weaviate.classes.config import Property, DataType` at the top." 
+ }, + { + "model_id": "openai/gpt-5-mini-2025-08-07", + "task_id": "zero_shot_batch_import", + "variant": "zero_shot_batch", + "provider": "openai", + "success": false, + "duration": 18.89422106742859, + "similarity_score": 1, + "failure_analysis": "The generated code attempts to use the Weaviate Python client v3 API (`weaviate.Client(url=..., auth_client_secret=...)`), but the installed package is v4, which has removed the v3 `Client` class constructor. The v4 client no longer accepts `url` as a keyword argument to `weaviate.Client()`, causing a `TypeError` at initialization. The deprecation warning also confirms that `AuthApiKey` should not be imported from the root `weaviate` module.", + "root_cause": "The generated code uses the removed v3 `weaviate.Client(url=..., auth_client_secret=...)` constructor instead of the v4 `weaviate.connect_to_weaviate_cloud(cluster_url=..., auth_credentials=Auth.api_key(...))` function.", + "suggested_fix": "Replace `weaviate.Client(url=WEAVIATE_URL, auth_client_secret=weaviate.AuthApiKey(api_key=WEAVIATE_API_KEY))` with `weaviate.connect_to_weaviate_cloud(cluster_url=WEAVIATE_URL, auth_credentials=weaviate.auth.Auth.api_key(WEAVIATE_API_KEY))`, then replace all schema/batch operations with v4 equivalents: use `client.collections.get(\"DemoProducts\")` to access the collection, and use `products.batch.dynamic()` with `batch.add_object(properties=obj)` for batch imports." 
+ }, + { + "model_id": "gemini/gemini-2.5-flash", + "task_id": "simple_example_create_collection", + "variant": "simple_example_create", + "provider": "gemini", + "success": true, + "duration": 6.690547943115234, + "similarity_score": 5, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "gemini/gemini-2.5-flash", + "task_id": "simple_example_basic_semantic_search", + "variant": "simple_example_basic_semantic", + "provider": "gemini", + "success": true, + "duration": 8.716688871383667, + "similarity_score": 5, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "anthropic/claude-haiku-4-5-20251001", + "task_id": "extensive_examples_batch_import", + "variant": "extensive_examples_batch", + "provider": "anthropic", + "success": false, + "duration": 8.329793930053711, + "similarity_score": 2, + "failure_analysis": "The code fails at line 44 with `AttributeError: 'QueryReturn' object has no attribute 'total_count'`. The generated code calls `collection.query.fetch_objects(limit=1)` and then tries to access `.total_count` on the returned `QueryReturn` object. However, `QueryReturn` does not have a `total_count` attribute. The canonical implementation simply uses `len(collection)` to get the total count of objects in the collection, which is the correct API method. Additionally, the `client.close()` is never reached due to the exception, causing a resource leak warning.", + "root_cause": "`QueryReturn` object does not have a `total_count` attribute; the correct way to count objects is `len(collection)`.", + "suggested_fix": "Replace the lines `response = collection.query.fetch_objects(limit=1)` and `total_count = response.total_count` with `total_count = len(collection)`, and also wrap the entire logic in a try/finally block to ensure `client.close()` is always called. 
Additionally, change `collection.batch.fixed_size(batch_size=50)` to `collection.batch.dynamic()` and add error checking with `batch.number_errors` and `collection.batch.failed_objects` to match the canonical implementation." + }, + { + "model_id": "openai/gpt-5-mini-2025-08-07", + "task_id": "extensive_examples_complex_hybrid_query", + "variant": "extensive_examples_complex_hybrid", + "provider": "openai", + "success": true, + "duration": 17.416374921798706, + "similarity_score": 3, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "gemini/gemini-2.5-flash", + "task_id": "zero_shot_complex_hybrid_query", + "variant": "zero_shot_complex_hybrid", + "provider": "gemini", + "success": true, + "duration": 10.001044034957886, + "similarity_score": 2, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "openai/gpt-5-mini-2025-08-07", + "task_id": "extensive_examples_create_collection", + "variant": "extensive_examples_create", + "provider": "openai", + "success": true, + "duration": 12.641349077224731, + "similarity_score": 5, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "gemini/gemini-2.5-flash", + "task_id": "zero_shot_create_collection", + "variant": "zero_shot_create", + "provider": "gemini", + "success": true, + "duration": 9.172446012496948, + "similarity_score": 2, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "anthropic/claude-haiku-4-5-20251001", + "task_id": "zero_shot_create_collection", + "variant": "zero_shot_create", + "provider": "anthropic", + "success": true, + "duration": 5.154320001602173, + "similarity_score": 4, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "anthropic/claude-haiku-4-5-20251001", + "task_id": "simple_example_complex_hybrid_query", + "variant": "simple_example_complex_hybrid", + "provider": "anthropic", + "success": true, + 
"duration": 4.913505792617798, + "similarity_score": 5, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "gemini/gemini-2.5-flash", + "task_id": "zero_shot_basic_semantic_search", + "variant": "zero_shot_basic_semantic", + "provider": "gemini", + "success": false, + "duration": 9.605401992797852, + "similarity_score": 1, + "failure_analysis": "The generated code failed because the output is not Python code at all. It appears the code generation produced only a natural language description (e.g., \"Here's the Python code to query the 'WineReview' collection...\") without any actual Python implementation following it. Python's parser attempted to interpret this English text as code and encountered a syntax error on line 1 due to an unterminated string literal (the apostrophe in \"Here's\" was interpreted as the start of a string that was never closed).", + "root_cause": "The generated output is plain English prose rather than executable Python code — no actual code was generated.", + "suggested_fix": "Replace the entire output with valid Python code that mirrors the canonical implementation: import weaviate and os, connect using `weaviate.connect_to_weaviate_cloud()` with `Auth.api_key()` credentials and the OpenAI API key header, retrieve the collection with `client.collections.get(\"WineReview\")`, run `wine_reviews.query.near_text(query=\"dessert wine\", limit=2)`, iterate over `response.objects` printing the relevant properties, and call `client.close()`." 
+ }, + { + "model_id": "openai/gpt-5-mini-2025-08-07", + "task_id": "extensive_examples_basic_semantic_search", + "variant": "extensive_examples_basic_semantic", + "provider": "openai", + "success": true, + "duration": 20.570353984832764, + "similarity_score": 4, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "openai/gpt-5-mini-2025-08-07", + "task_id": "zero_shot_create_collection", + "variant": "zero_shot_create", + "provider": "openai", + "success": false, + "duration": 17.935655117034912, + "similarity_score": 1, + "failure_analysis": "The generated code attempts to instantiate `weaviate.Client(url=..., auth_client_secret=...)`, which is the v3 client API. In the installed weaviate-client v4 package, the `Client` class no longer accepts these keyword arguments (`url`, `auth_client_secret`). The v4 package has removed the v3 `weaviate.Client(...)` interface entirely, replacing it with connection helper functions like `weaviate.connect_to_weaviate_cloud(...)`. As a result, Python raises a `TypeError: Client.__init__() got an unexpected keyword argument 'url'` and the script exits with code 1.", + "root_cause": "The generated code uses the removed Weaviate Python client v3 `weaviate.Client(url=..., auth_client_secret=...)` constructor, which no longer exists in the installed v4 client library.", + "suggested_fix": "Replace the entire client initialization and schema management code with the v4 API: use `weaviate.connect_to_weaviate_cloud(cluster_url=WCD_TEST_URL, auth_credentials=Auth.api_key(WCD_TEST_KEY))` for connection, `client.collections.exists(CLASS_NAME)` / `client.collections.delete(CLASS_NAME)` for existence check and deletion, and `client.collections.create(CLASS_NAME, properties=[Property(name=\"name\", data_type=DataType.TEXT), ...])` for collection creation, importing `Auth` from `weaviate.classes.init` and `Property`, `DataType` from `weaviate.classes.config`." 
+ }, + { + "model_id": "gemini/gemini-2.5-flash", + "task_id": "extensive_examples_batch_import", + "variant": "extensive_examples_batch", + "provider": "gemini", + "success": true, + "duration": 10.83474588394165, + "similarity_score": 2, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "gemini/gemini-2.5-flash", + "task_id": "simple_example_connect", + "variant": "simple_example", + "provider": "gemini", + "success": true, + "duration": 4.6224329471588135, + "similarity_score": 5, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "gemini/gemini-2.5-flash", + "task_id": "simple_example_batch_import", + "variant": "simple_example_batch", + "provider": "gemini", + "success": false, + "duration": 9.340939044952393, + "similarity_score": 1, + "failure_analysis": "The generated code file literally starts with the characters \"```python\" (backtick-backtick-backtick python), which is Markdown code fence syntax. When Python tries to parse this as source code, it immediately encounters a SyntaxError because backticks are not valid Python syntax. This means none of the actual code logic ever runs. Additionally, the code is truncated mid-way through the data_rows construction, so even if the fencing were removed, it would still fail.", + "root_cause": "The generated code was wrapped in Markdown code fences (```python ... ```) that were included literally in the Python source file, causing an immediate SyntaxError on line 1.", + "suggested_fix": "Remove the opening ```python and closing ``` markdown fences from the Python file so that the file starts directly with `import weaviate`. Additionally, complete the truncated data_rows loop and add the batch import logic using `products.batch.dynamic()` as shown in the canonical implementation, replacing the collection creation approach with `client.collections.get(\"DemoProducts\")`." 
+ }, + { + "model_id": "gemini/gemini-2.5-flash", + "task_id": "extensive_examples_create_collection", + "variant": "extensive_examples_create", + "provider": "gemini", + "success": true, + "duration": 9.947964906692505, + "similarity_score": 5, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "openai/gpt-5-mini-2025-08-07", + "task_id": "zero_shot_connect", + "variant": "zero_shot", + "provider": "openai", + "success": false, + "duration": 13.802639722824097, + "similarity_score": 1, + "failure_analysis": "The generated code attempts to instantiate `weaviate.Client(url=..., auth_client_secret=...)`, which is the v3 Weaviate Python client API. In the installed version of the weaviate-client package (v4), the `weaviate.Client` class has been completely removed and no longer accepts the `url` keyword argument. This causes a `TypeError: Client.__init__() got an unexpected keyword argument 'url'` at runtime. Additionally, `weaviate.AuthApiKey` is deprecated and should be imported from `weaviate.auth` or replaced with `Auth.api_key` from `weaviate.classes.init`.", + "root_cause": "The generated code uses the removed Weaviate Python client v3 `weaviate.Client(url=..., auth_client_secret=...)` constructor instead of the v4 `weaviate.connect_to_weaviate_cloud(...)` function.", + "suggested_fix": "Replace `weaviate.Client(url=WCD_TEST_URL, auth_client_secret=weaviate.AuthApiKey(api_key=WCD_TEST_KEY))` with `weaviate.connect_to_weaviate_cloud(cluster_url=WCD_TEST_URL, auth_credentials=Auth.api_key(WCD_TEST_KEY))`, and add `from weaviate.classes.init import Auth` to the imports. Also add `client.close()` at the end." 
+ }, + { + "model_id": "gemini/gemini-2.5-flash", + "task_id": "simple_example_complex_hybrid_query", + "variant": "simple_example_complex_hybrid", + "provider": "gemini", + "success": false, + "duration": 8.564034938812256, + "similarity_score": 2, + "failure_analysis": "The generated code is wrapped in markdown code fences (```python at the start and presumably ``` at the end), which are not valid Python syntax. When Python tries to parse the file, it encounters the backtick characters on line 1 and raises a `SyntaxError: invalid syntax`. This is a formatting error where the code generator included the markdown formatting delimiters as part of the actual code file content.", + "root_cause": "The generated code file contains markdown code fence delimiters (```python) as literal text, which Python cannot parse as valid syntax.", + "suggested_fix": "Remove the opening ```python and closing ``` markdown fence lines from the code file, so the file contains only valid Python statements. Additionally, complete the truncated hybrid query by adding `alpha=0.5`, the `filters` parameter with the three chained `Filter.by_property(...)` conditions, `return_metadata=MetadataQuery(score=True)`, and the result iteration loop printing `obj.properties` and `obj.metadata.score`, followed by `client.close()`." + }, + { + "model_id": "anthropic/claude-haiku-4-5-20251001", + "task_id": "simple_example_connect", + "variant": "simple_example", + "provider": "anthropic", + "success": true, + "duration": 2.656920909881592, + "similarity_score": 5, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "openai/gpt-5-mini-2025-08-07", + "task_id": "simple_example_complex_hybrid_query", + "variant": "simple_example_complex_hybrid", + "provider": "openai", + "success": false, + "duration": 22.80874490737915, + "similarity_score": 1, + "failure_analysis": "The generated code is completely empty — there is no Python code present at all. 
Because there is nothing to execute, the script produces no output and no error messages, but it trivially fails to accomplish the task. There is no client initialization, no query, and no result handling.", + "root_cause": "The generated code is an empty string; no implementation was provided.", + "suggested_fix": "Implement the full solution matching the canonical approach: initialize the client with `weaviate.connect_to_weaviate_cloud(cluster_url=..., auth_credentials=Auth.api_key(...), headers={\"X-OpenAI-API-Key\": ...})`, retrieve the collection with `client.collections.get(\"WineReview\")`, run a hybrid query with `wine_reviews.query.hybrid(query=\"pair with steak\", limit=5, alpha=0.5, filters=..., return_metadata=MetadataQuery(score=True))`, iterate over `response.objects` to print properties and metadata, and close the client with `client.close()`." + }, + { + "model_id": "anthropic/claude-haiku-4-5-20251001", + "task_id": "simple_example_create_collection", + "variant": "simple_example_create", + "provider": "anthropic", + "success": true, + "duration": 8.705229759216309, + "similarity_score": 5, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "anthropic/claude-haiku-4-5-20251001", + "task_id": "zero_shot_basic_semantic_search", + "variant": "zero_shot_basic_semantic", + "provider": "anthropic", + "success": true, + "duration": 6.236234188079834, + "similarity_score": 4, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "openai/gpt-5-mini-2025-08-07", + "task_id": "simple_example_batch_import", + "variant": "simple_example_batch", + "provider": "openai", + "success": false, + "duration": 18.014436721801758, + "similarity_score": 2, + "failure_analysis": "The generated code crashes at line 25 with `AttributeError: 'WeaviateClient' object has no attribute 'schema'`. 
This is because the v4 Weaviate Python client (`weaviate-client>=4.x`) removed the `client.schema` attribute that existed in v3. The generated code attempts to call `client.schema.get()` and `client.schema.create_class()` to check and create the collection schema, but these methods no longer exist in the v4 API. The canonical implementation simply calls `client.collections.get(\"DemoProducts\")` without any schema management, assuming the collection already exists. Additionally, the use of `weaviate.AuthApiKey` triggers a deprecation warning (though not a crash), whereas the canonical uses `Auth.api_key` from `weaviate.auth`.", + "root_cause": "Use of v3-style `client.schema.get()` and `client.schema.create_class()` methods that do not exist in the v4 Weaviate Python client.", + "suggested_fix": "Remove the entire schema checking and creation block (lines 14–30 involving `client.schema.get()` and `client.schema.create_class()`), replace `weaviate.AuthApiKey(api_key=WCD_TEST_KEY)` with `Auth.api_key(WCD_TEST_KEY)` (importing `Auth` from `weaviate.auth`), and if collection creation is needed use `client.collections.create(\"DemoProducts\", properties=[...])` from the v4 API instead." + }, + { + "model_id": "anthropic/claude-haiku-4-5-20251001", + "task_id": "extensive_examples_basic_semantic_search", + "variant": "extensive_examples_basic_semantic", + "provider": "anthropic", + "success": true, + "duration": 5.069045066833496, + "similarity_score": 5, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "openai/gpt-5-mini-2025-08-07", + "task_id": "simple_example_create_collection", + "variant": "simple_example_create", + "provider": "openai", + "success": false, + "duration": 19.51442289352417, + "similarity_score": 1, + "failure_analysis": "The generated code is completely empty - there is no code to execute at all. This explains why the execution result shows no stdout, no stderr, and a failure status. 
The code file was either not generated or was generated as an empty string, resulting in a Python script with no statements to run.", + "root_cause": "The generated code is entirely empty, containing no Python statements or logic whatsoever.", + "suggested_fix": "Implement the full solution by adding the necessary imports (`import weaviate`, `from weaviate.classes.init import Auth`, `from weaviate.classes.config import Property, DataType`), connecting via `weaviate.connect_to_weaviate_cloud(cluster_url=..., auth_credentials=Auth.api_key(...))`, and creating the collection using `client.collections.create()` with the appropriate `Property` objects specifying names and `DataType` values, then closing the client with `client.close()`." + }, + { + "model_id": "anthropic/claude-haiku-4-5-20251001", + "task_id": "zero_shot_batch_import", + "variant": "zero_shot_batch", + "provider": "anthropic", + "success": false, + "duration": 6.977207660675049, + "similarity_score": 2, + "failure_analysis": "The generated code fails because `Property(name=\"name\", data_type=\"text\")` passes a plain string `\"text\"` for the `data_type` parameter. The Weaviate v4 Python client requires `data_type` to be an instance of the `DataType` enum (e.g., `DataType.TEXT`), not a raw string. Pydantic validation enforces this type constraint and raises a `ValidationError` because `'text'` (a `str`) is not an instance of `DataType`.", + "root_cause": "`data_type` in `Property` was passed as a plain string `\"text\"` instead of the required `DataType.TEXT` enum value from `weaviate.classes.config`.", + "suggested_fix": "Import `DataType` from `weaviate.classes.config` and change `Property(name=\"name\", data_type=\"text\")` to `Property(name=\"name\", data_type=DataType.TEXT)` and similarly for the description property. Additionally, replace `collection.data.insert_many(objects_to_add)` with the `batch.dynamic()` context manager pattern used in the canonical implementation." 
+ }, + { + "model_id": "gemini/gemini-2.5-flash", + "task_id": "extensive_examples_basic_semantic_search", + "variant": "extensive_examples_basic_semantic", + "provider": "gemini", + "success": true, + "duration": 6.453775882720947, + "similarity_score": 4, + "failure_analysis": "", + "root_cause": "", + "suggested_fix": "" + }, + { + "model_id": "gemini/gemini-2.5-flash", + "task_id": "extensive_examples_complex_hybrid_query", + "variant": "extensive_examples_complex_hybrid", + "provider": "gemini", + "success": false, + "duration": 9.729598999023438, + "similarity_score": 1, + "failure_analysis": "The generated code failed because it includes markdown code fence delimiters (```python at the start) as literal Python code. Python's parser encounters the backtick characters and raises a SyntaxError on the very first line. Additionally, the code is truncated mid-statement (`os.environ` is referenced but never completed), meaning even if the fences were removed, the code would still fail. The code never reaches any Weaviate API calls.", + "root_cause": "Markdown code fence syntax (```python) was included as literal Python code, causing an immediate SyntaxError before any logic could execute.", + "suggested_fix": "Remove the markdown code fence delimiters (```python and closing ```) from the file, complete the implementation by adding `weaviate.connect_to_weaviate_cloud()` with proper auth and headers, import `MetadataQuery`, retrieve the collection with `client.collections.get(\"WineReview\")`, perform a `hybrid()` query with the appropriate filters and `return_metadata=MetadataQuery(score=True)`, iterate over results, and call `client.close()`." 
+ }, + { + "model_id": "anthropic/claude-haiku-4-5-20251001", + "task_id": "zero_shot_complex_hybrid_query", + "variant": "zero_shot_complex_hybrid", + "provider": "anthropic", + "success": false, + "duration": 5.847066164016724, + "similarity_score": 2, + "failure_analysis": "The generated code fails because it attempts to call `Filter.by_and()` as a static method on the `Filter` class, which does not exist in the Weaviate Python client v4 library. The correct way to combine filters in v4 is using Python's bitwise operators (`&` for AND, `|` for OR) directly on filter objects. Additionally, the code uses `Filter.by_or()` which also doesn't exist, and uses multiple `.like()` calls instead of the `.contains_any()` method for matching multiple string values.", + "root_cause": "The generated code uses non-existent `Filter.by_and()` and `Filter.by_or()` static methods instead of the correct `&` and `|` operator syntax for combining filters in Weaviate Python client v4.", + "suggested_fix": "Replace `Filter.by_and(...)` and `Filter.by_or(...)` with Python operator chaining using `&` and `|`, and replace the multiple `.like()` calls with `.contains_any([\"spice\", \"fruit\", \"berry\", \"cherry\", \"honey\"])`. The filter should be written as: `Filter.by_property(\"price\").less_than(50) & Filter.by_property(\"points\").greater_than(90) & Filter.by_property(\"review_body\").contains_any([\"spice\", \"fruit\", \"berry\", \"cherry\", \"honey\"])`. Also change `MetadataQuery(distance=True)` to `MetadataQuery(score=True)` for hybrid search." 
+ } + ], + "run_history": [ + { + "run_id": "20260320_193338_b4fa11", + "timestamp": "2026-03-20T19:47:23.402723", + "total": 42, + "passed": 21, + "pass_rate": 0.5 + }, + { + "run_id": "20260320_185642_b2d217", + "timestamp": "2026-03-20T18:59:30.894997", + "total": 14, + "passed": 10, + "pass_rate": 0.7142857142857143 + } + ] +} \ No newline at end of file diff --git a/src/components/VibeEvalDashboard/index.jsx b/src/components/VibeEvalDashboard/index.jsx new file mode 100644 index 00000000..76490ac0 --- /dev/null +++ b/src/components/VibeEvalDashboard/index.jsx @@ -0,0 +1,297 @@ +import React, { useState, useMemo } from "react"; +import styles from "./styles.module.scss"; +import dashboardData from "./data.json"; + +const TABS = ["Leaderboard", "Task Breakdown", "Run History"]; + +function SortArrow({ active, direction }) { + if (!active) return null; + return ( + {direction === "asc" ? "▲" : "▼"} + ); +} + +function useSortable(data, defaultKey, defaultDir = "desc") { + const [sortKey, setSortKey] = useState(defaultKey); + const [sortDir, setSortDir] = useState(defaultDir); + + const sorted = useMemo(() => { + if (!data || !sortKey) return data; + return [...data].sort((a, b) => { + const av = a[sortKey]; + const bv = b[sortKey]; + if (av == null && bv == null) return 0; + if (av == null) return 1; + if (bv == null) return -1; + if (typeof av === "string") { + return sortDir === "asc" + ? av.localeCompare(bv) + : bv.localeCompare(av); + } + return sortDir === "asc" ? av - bv : bv - av; + }); + }, [data, sortKey, sortDir]); + + const onSort = (key) => { + if (key === sortKey) { + setSortDir((d) => (d === "asc" ? 
"desc" : "asc")); + } else { + setSortKey(key); + setSortDir("desc"); + } + }; + + return { sorted, sortKey, sortDir, onSort }; +} + +function passRateClass(rate) { + if (rate >= 0.8) return `${styles.passRate} ${styles.passRateHigh}`; + if (rate >= 0.5) return `${styles.passRate} ${styles.passRateMedium}`; + return `${styles.passRate} ${styles.passRateLow}`; +} + +function SortableHeader({ label, field, sortKey, sortDir, onSort }) { + return ( + onSort(field)}> + {label} + + + ); +} + +function Leaderboard({ data }) { + const { sorted, sortKey, sortDir, onSort } = useSortable( + data, + "pass_rate", + "desc" + ); + + if (!sorted || sorted.length === 0) { + return
No leaderboard data available.
; + } + + const H = (props) => ( + + ); + + return ( +
+ + + + + + + + + + + + + + {sorted.map((row, i) => ( + + + + + + + + + + ))} + +
#
{i + 1}{row.model_id} + {row.provider} + + {(row.pass_rate * 100).toFixed(0)}% + {row.passed}{row.total}{row.avg_duration.toFixed(1)}s
+
+ ); +} + +function TaskBreakdown({ data }) { + const [expanded, setExpanded] = useState({}); + const { sorted, sortKey, sortDir, onSort } = useSortable( + data, + "task_id", + "asc" + ); + + if (!sorted || sorted.length === 0) { + return ( +
No task breakdown data available.
+ ); + } + + const toggleExpand = (key) => { + setExpanded((prev) => ({ ...prev, [key]: !prev[key] })); + }; + + const H = (props) => ( + + ); + + return ( +
+ + + + + + + + + + + + + + {sorted.map((row) => { + const key = `${row.model_id}||${row.task_id}`; + const hasAnalysis = row.failure_analysis || row.root_cause; + const isExpanded = expanded[key]; + + return ( + + + + + + + + + + + {isExpanded && hasAnalysis && ( + + + + )} + + ); + })} + +
Analysis
{row.model_id}{row.task_id}{row.variant} + {row.success ? "PASS" : "FAIL"} + {(row.duration || 0).toFixed(1)}s{row.similarity_score > 0 ? `${row.similarity_score}/5` : "-"} + {hasAnalysis && ( + + )} +
+ {row.root_cause && ( +
+ Root cause: + {row.root_cause} +
+ )} + {row.failure_analysis && ( +
+ Analysis: + {row.failure_analysis} +
+ )} + {row.suggested_fix && ( +
+ Suggested fix: + {row.suggested_fix} +
+ )} +
+
+ ); +} + +function RunHistory({ data }) { + if (!data || data.length === 0) { + return
No run history available.
; + } + + return ( +
+ + + + + + + + + + + + {data.map((run) => ( + + + + + + + + ))} + +
Run IDDatePassedTotalPass Rate
+ {run.run_id} + + {run.timestamp + ? new Date(run.timestamp).toLocaleDateString() + : "-"} + {run.passed}{run.total} + {(run.pass_rate * 100).toFixed(0)}% +
+
+ ); +} + +export default function VibeEvalDashboard() { + const [activeTab, setActiveTab] = useState(0); + + const hasData = + dashboardData.leaderboard.length > 0 || + dashboardData.task_breakdown.length > 0; + + return ( +
+ {dashboardData.generated_at && ( +
+ Last updated: {new Date(dashboardData.generated_at).toLocaleString()} + {dashboardData.latest_run_id && ( + <> · Run: {dashboardData.latest_run_id} + )} +
+ )} + +
+ {TABS.map((tab, i) => ( + + ))} +
+ + {!hasData && ( +
+ No benchmark data available yet. Results will appear after the first + benchmark run. +
+ )} + + {hasData && activeTab === 0 && ( + + )} + {hasData && activeTab === 1 && ( + + )} + {activeTab === 2 && } +
+ ); +} diff --git a/src/components/VibeEvalDashboard/styles.module.scss b/src/components/VibeEvalDashboard/styles.module.scss new file mode 100644 index 00000000..6c291b84 --- /dev/null +++ b/src/components/VibeEvalDashboard/styles.module.scss @@ -0,0 +1,152 @@ +.dashboard { + margin: 1rem 0; +} + +.metadata { + color: var(--ifm-color-emphasis-600); + font-size: 0.85rem; + margin-bottom: 1.5rem; +} + +.tabs { + display: flex; + gap: 0; + border-bottom: 2px solid var(--ifm-color-emphasis-200); + margin-bottom: 1.5rem; +} + +.tab { + padding: 0.6rem 1.2rem; + cursor: pointer; + border: none; + background: none; + font-size: 0.95rem; + font-weight: 500; + color: var(--ifm-color-emphasis-600); + border-bottom: 2px solid transparent; + margin-bottom: -2px; + transition: color 0.15s, border-color 0.15s; + + &:hover { + color: var(--ifm-color-primary); + } +} + +.tabActive { + color: var(--ifm-color-primary); + border-bottom-color: var(--ifm-color-primary); +} + +.table { + width: 100%; + border-collapse: collapse; + font-size: 0.9rem; + + th, td { + padding: 0.5rem 0.75rem; + text-align: left; + border-bottom: 1px solid var(--ifm-color-emphasis-200); + } + + th { + font-weight: 600; + color: var(--ifm-color-emphasis-700); + background: var(--ifm-color-emphasis-100); + position: sticky; + top: 0; + cursor: pointer; + user-select: none; + white-space: nowrap; + + &:hover { + background: var(--ifm-color-emphasis-200); + } + } + + tr:hover td { + background: var(--ifm-color-emphasis-50); + } +} + +.pass { + color: #16a34a; + font-weight: 600; +} + +.fail { + color: #dc2626; + font-weight: 600; +} + +.passRate { + font-weight: 600; +} + +.passRateHigh { + color: #16a34a; +} + +.passRateMedium { + color: #ca8a04; +} + +.passRateLow { + color: #dc2626; +} + +.rank { + font-weight: 600; + color: var(--ifm-color-emphasis-500); +} + +.provider { + font-size: 0.8rem; + color: var(--ifm-color-emphasis-500); + text-transform: capitalize; +} + +.expandBtn { + background: none; + 
border: none; + cursor: pointer; + color: var(--ifm-color-primary); + font-size: 0.85rem; + padding: 0.2rem 0.5rem; + + &:hover { + text-decoration: underline; + } +} + +.analysisRow td { + background: var(--ifm-color-emphasis-50); + padding: 0.75rem 1rem; + font-size: 0.85rem; + line-height: 1.5; +} + +.analysisLabel { + font-weight: 600; + color: var(--ifm-color-emphasis-700); + margin-right: 0.5rem; +} + +.emptyState { + text-align: center; + padding: 3rem 1rem; + color: var(--ifm-color-emphasis-500); + font-size: 1rem; +} + +.sortArrow { + margin-left: 0.3rem; + font-size: 0.75rem; +} + +.tableWrapper { + overflow-x: auto; + max-height: 600px; + overflow-y: auto; + border: 1px solid var(--ifm-color-emphasis-200); + border-radius: 6px; +} diff --git a/tests/test_docs_indexability.py b/tests/test_docs_indexability.py index 8a774cf4..27b2b2fe 100644 --- a/tests/test_docs_indexability.py +++ b/tests/test_docs_indexability.py @@ -481,7 +481,7 @@ def test_claude_can_fetch_llms_txt(): # --------------------------------------------------------------------------- -# Part 3: ChatGPT Agent Test (requires OPENAI_API_KEY) +# Part 3: ChatGPT Agent Test (requires OPENAI_API_KEY) # --------------------------------------------------------------------------- diff --git a/tools/fetch-vibe-eval-results.js b/tools/fetch-vibe-eval-results.js new file mode 100644 index 00000000..252cf087 --- /dev/null +++ b/tools/fetch-vibe-eval-results.js @@ -0,0 +1,179 @@ +/** + * Fetches benchmark results from the remote Weaviate cluster and writes + * a static JSON file for the VibeEvalDashboard component. + * + * Usage: + * node tools/fetch-vibe-eval-results.js + * + * Required env vars: + * WEAVIATE_VIBE_EVAL_URL - Weaviate cluster URL (e.g. 
/* (tail of the usage header that begins on the previous line)
 *   e.g. https://xxx.weaviate.cloud)
 *   WEAVIATE_VIBE_EVAL_KEY - Weaviate API key
 */

/** Weaviate collection that stores one object per (run, model, task) result. */
const COLLECTION = "BenchmarkRun";

/** Page size used for every request to the objects endpoint. */
const LIMIT = 2000;

/** Number of most-recent runs kept in the "Run History" summary. */
const HISTORY_LENGTH = 10;

/**
 * Serializes `data` to the dashboard's static JSON file, creating the
 * component directory if necessary.
 *
 * The Node built-ins are loaded here rather than at module level so that the
 * pure aggregation helpers in this file can be loaded without performing any
 * filesystem work.
 *
 * @param {object} data - Dashboard payload to write.
 * @returns {string} Absolute path of the file that was written.
 */
function writeDashboardData(data) {
  const fs = require("fs");
  const path = require("path");
  const outputPath = path.join(
    __dirname,
    "..",
    "src",
    "components",
    "VibeEvalDashboard",
    "data.json"
  );
  fs.mkdirSync(path.dirname(outputPath), { recursive: true });
  fs.writeFileSync(outputPath, JSON.stringify(data, null, 2));
  return outputPath;
}

/**
 * Fetches the properties of every object in the benchmark collection.
 *
 * Results are read page by page (LIMIT objects per request). When a page comes
 * back full, the id of its last object is sent as the `after` cursor of the
 * next request, so collections larger than one page are no longer truncated.
 *
 * @param {string} baseUrl - Cluster URL including protocol; trailing slashes are ignored.
 * @param {string} apiKey - API key sent as a Bearer token.
 * @param {Function} [fetchImpl=fetch] - fetch-compatible function (injectable for tests).
 * @returns {Promise<object[]>} The `properties` of each stored object, in server order.
 * @throws {Error} If any request returns a non-2xx status.
 */
async function fetchAllResults(baseUrl, apiKey, fetchImpl = fetch) {
  // Strip trailing slashes so "https://host/" does not produce "//v1/objects".
  const endpoint = `${baseUrl.replace(/\/+$/, "")}/v1/objects`;
  const collected = [];
  let after = null;

  for (;;) {
    const url = new URL(endpoint);
    url.searchParams.set("class", COLLECTION);
    url.searchParams.set("limit", String(LIMIT));
    if (after) url.searchParams.set("after", after);

    const res = await fetchImpl(url.toString(), {
      headers: {
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
      },
    });

    if (!res.ok) {
      throw new Error(`Weaviate responded ${res.status}: ${await res.text()}`);
    }

    const body = await res.json();
    const page = body.objects || [];
    for (const obj of page) {
      collected.push(obj.properties);
    }

    // A short page means the collection is exhausted. The other two checks
    // stop the loop if the server returns no usable cursor or ignores it.
    const lastId = page.length > 0 ? page[page.length - 1].id : null;
    if (page.length < LIMIT || !lastId || lastId === after) break;
    after = lastId;
  }

  return collected;
}

/**
 * Groups raw result records by `run_id` (records without one go to "unknown").
 * Each group's timestamp is the greatest timestamp seen among its records.
 *
 * @param {object[]} results - Raw result records.
 * @returns {object[]} Runs as `{ run_id, timestamp, results }`, newest first.
 */
function groupRunsNewestFirst(results) {
  const runs = new Map();
  for (const r of results) {
    const runId = r.run_id || "unknown";
    let run = runs.get(runId);
    if (!run) {
      run = { run_id: runId, timestamp: r.timestamp, results: [] };
      runs.set(runId, run);
    }
    run.results.push(r);
    // Also adopt the timestamp when the run has none yet: comparing against
    // `undefined` is always false, which used to leave such runs undated.
    if (r.timestamp && (!run.timestamp || r.timestamp > run.timestamp)) {
      run.timestamp = r.timestamp;
    }
  }

  return [...runs.values()].sort((a, b) => {
    const ta = a.timestamp || "";
    const tb = b.timestamp || "";
    if (tb > ta) return 1;
    if (tb < ta) return -1;
    return 0;
  });
}

/**
 * Aggregates one run's records into per-model leaderboard rows, ordered by
 * pass rate (descending) and then by average duration (ascending).
 *
 * @param {object[]} runResults - Records belonging to a single run.
 * @returns {object[]} Leaderboard rows.
 */
function buildLeaderboard(runResults) {
  const stats = new Map();
  for (const r of runResults) {
    const modelId = r.model_id || "unknown";
    let entry = stats.get(modelId);
    if (!entry) {
      entry = {
        model_id: modelId,
        provider: r.provider || "unknown",
        passed: 0,
        total: 0,
        total_duration: 0,
      };
      stats.set(modelId, entry);
    }
    entry.total += 1;
    if (r.success) entry.passed += 1;
    entry.total_duration += r.duration || 0;
  }

  return [...stats.values()]
    .map((s) => ({
      ...s,
      pass_rate: s.total > 0 ? s.passed / s.total : 0,
      // Average rounded to two decimals.
      avg_duration:
        s.total > 0 ? Math.round((s.total_duration / s.total) * 100) / 100 : 0,
    }))
    .sort(
      (a, b) => b.pass_rate - a.pass_rate || a.avg_duration - b.avg_duration
    );
}

/**
 * Builds one row per (model, task) pair for a run. When a pair occurs more
 * than once, the first record encountered is kept.
 *
 * @param {object[]} runResults - Records belonging to a single run.
 * @returns {object[]} Task breakdown rows in first-seen order.
 */
function buildTaskBreakdown(runResults) {
  const rows = new Map();
  for (const r of runResults) {
    const key = `${r.model_id}||${r.task_id}`;
    if (rows.has(key)) continue;
    rows.set(key, {
      model_id: r.model_id,
      task_id: r.task_id,
      variant: r.variant,
      provider: r.provider,
      success: r.success,
      duration: r.duration,
      similarity_score: r.similarity_score || 0,
      failure_analysis: r.failure_analysis || "",
      root_cause: r.root_cause || "",
      suggested_fix: r.suggested_fix || "",
    });
  }
  return [...rows.values()];
}

/**
 * Turns raw benchmark records into the payload consumed by the dashboard:
 * leaderboard and task breakdown for the most recent run, plus a pass-rate
 * summary of the latest HISTORY_LENGTH runs.
 *
 * @param {object[]} results - Raw result records (may be empty).
 * @returns {{generated_at: string, latest_run_id: (string|null),
 *            leaderboard: object[], task_breakdown: object[],
 *            run_history: object[]}} Dashboard payload.
 */
function buildDashboardData(results) {
  const sortedRuns = groupRunsNewestFirst(results);
  const latestRun = sortedRuns[0];

  const runHistory = sortedRuns.slice(0, HISTORY_LENGTH).map((run) => {
    const passed = run.results.filter((r) => r.success).length;
    return {
      run_id: run.run_id,
      timestamp: run.timestamp,
      total: run.results.length,
      passed,
      pass_rate: run.results.length > 0 ? passed / run.results.length : 0,
    };
  });

  return {
    generated_at: new Date().toISOString(),
    latest_run_id: latestRun ? latestRun.run_id : null,
    leaderboard: latestRun ? buildLeaderboard(latestRun.results) : [],
    task_breakdown: latestRun ? buildTaskBreakdown(latestRun.results) : [],
    run_history: runHistory,
  };
}

/**
 * Script entry point: reads the connection settings from the environment,
 * fetches the results and writes the dashboard data file. When either
 * variable is missing, an empty data file is written instead so the site can
 * still build.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const rawUrl = (process.env.WEAVIATE_VIBE_EVAL_URL || "").trim();
  const apiKey = (process.env.WEAVIATE_VIBE_EVAL_KEY || "").trim();

  if (!rawUrl || !apiKey) {
    console.warn(
      "WEAVIATE_VIBE_EVAL_URL or WEAVIATE_VIBE_EVAL_KEY not set. Writing empty data file."
    );
    const emptyPath = writeDashboardData(buildDashboardData([]));
    console.log(`Wrote empty data to ${emptyPath}`);
    return;
  }

  // Ensure the URL has a protocol; match the scheme itself (any case), not
  // just a leading "http" that could be part of a host name.
  const baseUrl = /^https?:\/\//i.test(rawUrl) ? rawUrl : `https://${rawUrl}`;

  console.log(`Fetching vibe eval results from ${baseUrl}...`);
  const results = await fetchAllResults(baseUrl, apiKey);
  console.log(`Fetched ${results.length} results`);

  const data = buildDashboardData(results);
  const outputPath = writeDashboardData(data);
  console.log(
    `Wrote dashboard data to ${outputPath} (${data.leaderboard.length} models, ${data.task_breakdown.length} tasks)`
  );
}

// Run only when executed directly (node tools/fetch-vibe-eval-results.js);
// when loaded as a module, just expose the helpers.
if (typeof require !== "undefined" && typeof module !== "undefined") {
  module.exports = { fetchAllResults, buildDashboardData, main };
  if (require.main === module) {
    main().catch((err) => {
      console.error("Failed to fetch vibe eval results:", err.message);
      process.exit(1);
    });
  }
}