-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathreporting.py
More file actions
96 lines (72 loc) · 3.38 KB
/
reporting.py
File metadata and controls
96 lines (72 loc) · 3.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
"""Reporting and formatting functions for LLM evaluation results."""
from typing import Any, Dict, List, Tuple
from tabulate import tabulate
from shared import (
calculate_model_summary,
create_progress_bar,
format_accuracy,
format_response_time,
get_unique_prompts_and_models
)
def create_detailed_table_data(results: List[Dict[str, Any]]) -> List[List[str]]:
    """Build one row per result for the detailed results table.

    Each row holds: model name, prompt file, a "correct"/"wrong" label,
    the formatted response time, and an optional note ("" when absent).
    """
    rows: List[List[str]] = []
    for entry in results:
        verdict = "correct" if entry["correct"] else "wrong"
        rows.append([
            entry["model"],
            entry["file"],
            verdict,
            format_response_time(entry["response_time"]),
            entry.get("note", ""),
        ])
    return rows
def create_matrix_table_data(results: List[Dict[str, Any]]) -> Tuple[List[str], List[List[str]]]:
    """Build the matrix table: one row per model, one column per prompt.

    Returns ``(header, rows)`` where header is ``["Model"]`` followed by the
    prompt names, and each cell is "correct"/"wrong" for a recorded result
    or "unavailable" when no result exists for that (model, prompt) pair.
    """
    prompts, models = get_unique_prompts_and_models(results)
    # Index results once so each cell is an O(1) lookup instead of scanning
    # the full results list for every (model, prompt) pair (was O(m*p*r)).
    outcome: Dict[Tuple[str, str], str] = {}
    for r in results:
        key = (r["model"], r["file"])
        # Keep only the first result per pair, matching the original
        # first-match-wins linear scan.
        if key not in outcome:
            outcome[key] = "correct" if r["correct"] else "wrong"
    header = ["Model"] + prompts
    table_data = [
        [model] + [outcome.get((model, prompt), "unavailable") for prompt in prompts]
        for model in models
    ]
    return header, table_data
def create_summary_table_data(results: List[Dict[str, Any]]) -> List[List[str]]:
    """Build one row per model for the performance summary table.

    Each row holds: model name, a combined "correct/total (accuracy) [bar]"
    cell, and the formatted average response time.
    """
    rows: List[List[str]] = []
    for name, stats in calculate_model_summary(results).items():
        attempts = stats["total"]
        hits = stats["correct"]
        elapsed = stats["total_time"]
        # Guard against zero attempts so we never divide by zero.
        if attempts > 0:
            pct = (hits / attempts) * 100
            mean_time = elapsed / attempts
        else:
            pct = 0
            mean_time = 0
        gauge = create_progress_bar(pct)
        score_cell = f"{hits}/{attempts} ({format_accuracy(hits, attempts)}) [{gauge}]"
        rows.append([name, score_cell, format_response_time(mean_time)])
    return rows
def format_detailed_table(results: List[Dict[str, Any]]) -> str:
    """Render the detailed per-result rows as a fancy-grid table string."""
    columns = ["Model", "File", "Correct", "Response Time", "Note"]
    rows = create_detailed_table_data(results)
    return tabulate(rows, headers=columns, tablefmt="fancy_grid")
def format_matrix_table(results: List[Dict[str, Any]]) -> str:
    """Render the model-by-prompt matrix as a fancy-grid table string."""
    columns, rows = create_matrix_table_data(results)
    return tabulate(rows, headers=columns, tablefmt="fancy_grid")
def format_summary_table(results: List[Dict[str, Any]]) -> str:
    """Render the per-model performance summary as a fancy-grid table string."""
    columns = ["Model", "Correct", "Avg Response Time"]
    rows = create_summary_table_data(results)
    return tabulate(rows, headers=columns, tablefmt="fancy_grid")
def calculate_all_summary_data(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Assemble every derived summary structure for *results* into one dict.

    Keys: "detailed_table", "matrix_table", "summary_table",
    "model_summary", and "unique_prompts_and_models".
    """
    summary: Dict[str, Any] = {}
    summary["detailed_table"] = create_detailed_table_data(results)
    summary["matrix_table"] = create_matrix_table_data(results)
    summary["summary_table"] = create_summary_table_data(results)
    summary["model_summary"] = calculate_model_summary(results)
    summary["unique_prompts_and_models"] = get_unique_prompts_and_models(results)
    return summary