-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathreporting.py
More file actions
96 lines (72 loc) · 3.38 KB
/
reporting.py
File metadata and controls
96 lines (72 loc) · 3.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""Reporting and formatting functions for LLM evaluation results."""
from typing import Any, Dict, List, Tuple
from tabulate import tabulate
from shared import (
calculate_model_summary,
create_progress_bar,
format_accuracy,
format_response_time,
get_unique_prompts_and_models
)
def create_detailed_table_data(results: List[Dict[str, Any]]) -> List[List[str]]:
    """Build one row per result for the detailed results table.

    Each row holds: model name, prompt file, a "correct"/"wrong" label,
    the formatted response time, and an optional note ("" when absent).
    """
    rows: List[List[str]] = []
    for entry in results:
        verdict = "correct" if entry["correct"] else "wrong"
        rows.append([
            entry["model"],
            entry["file"],
            verdict,
            format_response_time(entry["response_time"]),
            entry.get("note", ""),
        ])
    return rows
def create_matrix_table_data(results: List[Dict[str, Any]]) -> Tuple[List[str], List[List[str]]]:
    """Build the matrix table: one row per model, one column per prompt.

    Returns ``(header, rows)`` where header is ``["Model"]`` followed by the
    prompt names, and each cell is "correct"/"wrong" for a recorded result
    or "unavailable" when no result exists for that (model, prompt) pair.
    """
    prompts, models = get_unique_prompts_and_models(results)
    # Index results once so each cell is an O(1) lookup instead of scanning
    # the full results list for every (model, prompt) pair (was O(m*p*r)).
    outcome: Dict[Tuple[str, str], str] = {}
    for r in results:
        key = (r["model"], r["file"])
        # Keep only the first result per pair, matching the original
        # first-match-wins linear scan.
        if key not in outcome:
            outcome[key] = "correct" if r["correct"] else "wrong"
    header = ["Model"] + prompts
    table_data = [
        [model] + [outcome.get((model, prompt), "unavailable") for prompt in prompts]
        for model in models
    ]
    return header, table_data
def create_summary_table_data(results: List[Dict[str, Any]]) -> List[List[str]]:
    """Build one row per model for the performance summary table.

    Each row holds: model name, a combined "correct/total (accuracy) [bar]"
    cell, and the formatted average response time.
    """
    rows: List[List[str]] = []
    for name, stats in calculate_model_summary(results).items():
        attempts = stats["total"]
        hits = stats["correct"]
        elapsed = stats["total_time"]
        # Guard against zero attempts so we never divide by zero.
        if attempts > 0:
            pct = (hits / attempts) * 100
            mean_time = elapsed / attempts
        else:
            pct = 0
            mean_time = 0
        gauge = create_progress_bar(pct)
        score_cell = f"{hits}/{attempts} ({format_accuracy(hits, attempts)}) [{gauge}]"
        rows.append([name, score_cell, format_response_time(mean_time)])
    return rows
def format_detailed_table(results: List[Dict[str, Any]]) -> str:
    """Render the detailed per-result rows as a fancy-grid table string."""
    columns = ["Model", "File", "Correct", "Response Time", "Note"]
    rows = create_detailed_table_data(results)
    return tabulate(rows, headers=columns, tablefmt="fancy_grid")
def format_matrix_table(results: List[Dict[str, Any]]) -> str:
    """Render the model-by-prompt matrix as a fancy-grid table string."""
    columns, rows = create_matrix_table_data(results)
    return tabulate(rows, headers=columns, tablefmt="fancy_grid")
def format_summary_table(results: List[Dict[str, Any]]) -> str:
    """Render the per-model performance summary as a fancy-grid table string."""
    columns = ["Model", "Correct", "Avg Response Time"]
    rows = create_summary_table_data(results)
    return tabulate(rows, headers=columns, tablefmt="fancy_grid")
def calculate_all_summary_data(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Assemble every derived summary structure for *results* into one dict.

    Keys: "detailed_table", "matrix_table", "summary_table",
    "model_summary", and "unique_prompts_and_models".
    """
    summary: Dict[str, Any] = {}
    summary["detailed_table"] = create_detailed_table_data(results)
    summary["matrix_table"] = create_matrix_table_data(results)
    summary["summary_table"] = create_summary_table_data(results)
    summary["model_summary"] = calculate_model_summary(results)
    summary["unique_prompts_and_models"] = get_unique_prompts_and_models(results)
    return summary