GABRIEL/src/gabriel/utils/openai_utils.py at a5c92f152e2f0712b94cf6e2d51f4bf1e559fd4f · openai/GABRIEL · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""
This module reimplements the original GABRIEL `openai_utils.py` for the
OpenAI Responses API with several improvements:

* Rate limit introspection – a helper fetches the current token/request
  budget from the ``x‑ratelimit-*`` response headers returned by a cheap
  ``GET /v1/models`` call.  These values are used to display how many
  tokens and requests remain per minute.
* User‑friendly summary – before a long job starts, the module prints a
  summary showing the number of prompts, input words, remaining rate‑limit
  capacity, usage tier qualifications, and an estimated cost.
* Improved rate‑limit gating – the token limiter now estimates the worst
  possible output length when the cutoff is unspecified by assuming
  the response could be as long as the input.  This avoids grossly
  underestimating throughput while still honouring the per‑minute token
  budget.
* Exponential backoff with jitter – the retry logic uses a random
  exponential backoff when rate‑limit errors occur, following OpenAI’s
  guidelines for handling 429 responses.

The overall API surface remains compatible with the original file: the
public functions ``get_response`` and ``get_all_responses`` still
exist.
"""

from __future__ import annotations

import asyncio
import contextlib
import csv
import functools
import importlib.util
import inspect
import json
import os
from pathlib import Path
import random
import re
import tempfile
import time
import subprocess
import sys
import textwrap
from typing import (
    Any,
    Awaitable,
    Callable,
    Deque,
    Dict,
    Hashable,
    List,
    Optional,
    Set,
    Tuple,
    Union,
)
from collections import defaultdict, deque
from collections.abc import Iterable
import pickle

from gabriel.utils.logging import get_logger, set_log_level
import logging
import math
import pandas as pd
from aiolimiter import AsyncLimiter
from tqdm.auto import tqdm
import openai
import statistics
import numpy as np
import tiktoken
from dataclasses import dataclass, fields
from pydantic import BaseModel

logger = get_logger(__name__)

# Track whether the verbose usage sheet has been shown to avoid repeating the
# static "info sheet" content on subsequent runs.
_USAGE_SHEET_PRINTED = False
_DEPENDENCIES_VERIFIED = False

# Cap the number of prompts we fully scan when estimating words/tokens.  Large
# datasets are sampled to keep start-up time predictable.
_ESTIMATION_SAMPLE_SIZE = 5000

_TIMEOUT_BURST_RATIO = 1.25

DEFAULT_SYSTEM_INSTRUCTION = (
    "Please provide a helpful response to this inquiry for purposes of academic research."
)

# Try to import requests/httpx for rate‑limit introspection
try:
    import requests  # type: ignore
except Exception:
    requests = None  # type: ignore
try:
    import httpx  # type: ignore
except Exception:
    httpx = None  # type: ignore

# Bring in specific error classes for granular handling
try:
    from openai import (
        APIConnectionError,
        APIError,
        APITimeoutError,
        AuthenticationError,
        BadRequestError,
        InvalidRequestError,
        RateLimitError,
    )  # type: ignore
except Exception:
    APIConnectionError = Exception  # type: ignore
    APIError = Exception  # type: ignore
    APITimeoutError = Exception  # type: ignore
    AuthenticationError = Exception  # type: ignore
    BadRequestError = Exception  # type: ignore
    InvalidRequestError = Exception  # type: ignore
    RateLimitError = Exception  # type: ignore

from gabriel.utils.parsing import parse_json_with_status, safe_json

# single connection pool per process, keyed by base URL and created lazily
_clients_async: Dict[Optional[str], openai.AsyncOpenAI] = {}


def _progress_bar(*args: Any, verbose: bool = True, **kwargs: Any):
    """Construct a tqdm progress bar that degrades gracefully."""

    disable = kwargs.pop("disable", False) or not verbose
    kwargs.setdefault("dynamic_ncols", True)
    return tqdm(*args, disable=disable, **kwargs)


def _display_example_prompt(example_prompt: str, *, verbose: bool = True) -> None:
    """Print the full example prompt in plain text for easy copying."""

    if not verbose or not example_prompt:
        return

    print("\n===== Example prompt =====")
    print(textwrap.indent(example_prompt.strip("\n"), "  "))


def _get_client(
    base_url: Optional[str] = None,
) -> openai.AsyncOpenAI:
    """Return a cached ``AsyncOpenAI`` client for ``base_url``.

    When ``base_url`` is ``None`` the default OpenAI endpoint is used.  A client
    is created on first use and reused for subsequent calls with the same base
    URL to benefit from connection pooling.
    """

    url = base_url or os.getenv("OPENAI_BASE_URL")
    client = _clients_async.get(url)
    if client is None:
        kwargs: Dict[str, Any] = {}
        if url:
            kwargs["base_url"] = url
        if httpx is not None:
            try:
                kwargs.setdefault(
                    "timeout",
                    httpx.Timeout(connect=10.0, read=None, write=None, pool=None),
                )
            except Exception:
                # Fall back to the SDK default if constructing the timeout fails
                pass
        client = openai.AsyncOpenAI(**kwargs)
        _clients_async[url] = client
    return client

# Estimated output tokens per prompt used for cost estimation when no cutoff is specified.
# When a user does not explicitly set ``max_output_tokens``, we assume that each response
# will contain roughly this many tokens.  This value is used solely for estimating cost
# and determining how many parallel requests can safely run under the token budget.
ESTIMATED_OUTPUT_TOKENS_PER_PROMPT = 500
# Extra input tokens to add per prompt when estimating non-text inputs or web search.
NON_TEXT_INPUT_TOKEN_BUFFER = 2000

# Conservative headroom when translating observed rate limits into concurrency and limiter budgets.
# Using less than the reported limit provides a buffer for short spikes and accounting inaccuracies.
RATE_LIMIT_HEADROOM = 0.85
# Additional planning buffer applied when translating reported rate limits into budgets.
PLANNING_RATE_LIMIT_BUFFER = 0.85
# Cushion applied to expected output tokens during initial planning. This headroom is relaxed to
# ``OUTPUT_TOKEN_HEADROOM_STEADY`` after we accumulate real usage samples.
OUTPUT_TOKEN_HEADROOM_INITIAL = 2.0
OUTPUT_TOKEN_HEADROOM_STEADY = 1.0

# ---------------------------------------------------------------------------
# Helper dataclasses and token utilities


@dataclass
class StatusTracker:
    """Simple container for bookkeeping counters."""

    num_tasks_started: int = 0
    num_tasks_in_progress: int = 0
    num_tasks_succeeded: int = 0
    num_tasks_failed: int = 0
    num_rate_limit_errors: int = 0
    num_connection_errors: int = 0
    num_api_errors: int = 0
    num_timeout_errors: int = 0
    num_json_parse_errors: int = 0
    num_other_errors: int = 0
    time_of_last_rate_limit_error: float = 0.0
    time_of_last_connection_error: float = 0.0


@dataclass
class DummyResponseSpec:
    """Configuration object describing synthetic responses for dummy runs."""

    responses: Optional[Any] = None
    duration: Optional[float] = None
    input_tokens: Optional[int] = None
    output_tokens: Optional[int] = None
    reasoning_tokens: Optional[int] = None
    reasoning_summary: Optional[str] = None
    response_id: Optional[str] = None
    successful: Optional[bool] = None
    error_log: Optional[Union[str, List[str]]] = None
    warning: Optional[str] = None


class BackgroundTimeoutError(asyncio.TimeoutError):
    """Timeout raised while polling a background response."""

    def __init__(self, response_id: Optional[str], last_response: Any, message: str):
        super().__init__(message)
        self.response_id = response_id
        self.last_response = last_response


class JSONParseError(ValueError):
    """Raised when JSON parsing fails during JSON-mode requests."""

    def __init__(self, message: str, snippet: Optional[str] = None):
        super().__init__(message)
        self.snippet = snippet


def _extract_retry_after_seconds(error: Exception) -> Optional[float]:
    """Return a retry-after duration in seconds when available."""

    for attr in ("retry_after", "retry_after_s", "retry_after_seconds"):
        retry_value = getattr(error, attr, None)
        if isinstance(retry_value, (int, float)) and retry_value > 0:
            return float(retry_value)
    retry_ms = getattr(error, "retry_after_ms", None)
    if isinstance(retry_ms, (int, float)) and retry_ms > 0:
        return float(retry_ms) / 1000.0
    message = str(error)
    if not message:
        return None
    match = re.search(r"after\s+([0-9]+(?:\.[0-9]+)?)\s*seconds", message)
    if match:
        try:
            parsed = float(match.group(1))
        except ValueError:
            return None
        if parsed > 0:
            return parsed
    return None


def _is_quota_error_message(message: str) -> bool:
    """Return True when the error text indicates an exhausted quota."""

    return bool(message) and "quota" in message.lower()


def _classify_timeout_detail(detail: str) -> Optional[str]:
    """Classify timeout exception detail strings for safer accounting."""

    if not detail:
        return None
    detail_lower = detail.lower()
    rate_markers = ("rate limit", "rate-limit", "too many requests", "http 429", " 429")
    if any(marker in detail_lower for marker in rate_markers) or _is_quota_error_message(
        detail_lower
    ):
        return "rate_limit"
    connection_markers = (
        "connection error",
        "connection reset",
        "connection refused",
        "connection aborted",
        "connection closed",
        "network error",
        "network unreachable",
        "connection lost",
    )
    if any(marker in detail_lower for marker in connection_markers):
        return "connection"
    return None


def _get_tokenizer(model_name: str) -> tiktoken.Encoding:
    """Return a tiktoken encoding for the model or a sensible default."""
    try:
        return tiktoken.encoding_for_model(model_name)
    except Exception:
        class _ApproxEncoder:
            def encode(self, text: str) -> List[int]:
                return [0] * max(1, _approx_tokens(text))

        return _ApproxEncoder()


def _uses_legacy_system_instruction(model_name: str) -> bool:
    """Return True when the model expects legacy system-message prompts."""

    lowered = (model_name or "").lower()
    return lowered.startswith("gpt-3") or lowered.startswith("gpt-4")


def _is_audio_model(model_name: str) -> bool:
    """Return True when the model name indicates audio support."""

    return "audio" in (model_name or "").lower()


def _has_media_payloads(
    prompt_images: Optional[Dict[str, List[str]]],
    prompt_audio: Optional[Dict[str, List[Dict[str, str]]]],
    prompt_pdfs: Optional[Dict[str, List[Dict[str, str]]]],
    identifiers: Iterable[Any],
) -> bool:
    """Return True when any prompt includes image/audio/pdf payloads."""

    if not (prompt_images or prompt_audio or prompt_pdfs):
        return False
    for ident in identifiers:
        key = str(ident)
        if prompt_images and prompt_images.get(key):
            return True
        if prompt_audio and prompt_audio.get(key):
            return True
        if prompt_pdfs and prompt_pdfs.get(key):
            return True
    return False

# Usage tiers with qualifications and monthly limits for printing
TIER_INFO = [
    {
        "tier": "Free",
        "qualification": "User must be in an allowed geography",
        "monthly_quota": "$100 / month",
    },
    {"tier": "Tier 1", "qualification": "$5 added", "monthly_quota": None},
    {"tier": "Tier 2", "qualification": "$50 added and 7+ days since first payment", "monthly_quota": None},
    {"tier": "Tier 3", "qualification": "$100 added and 7+ days since first payment", "monthly_quota": None},
    {"tier": "Tier 4", "qualification": "$250 added and 14+ days since first payment", "monthly_quota": None},
    {"tier": "Tier 5", "qualification": "$1 000 added and 30+ days since first payment", "monthly_quota": None},
]

# Truncated pricing table (USD per million tokens) for a few common models
MODEL_PRICING: Dict[str, Dict[str, float]] = {
    # model family       input   cached_input   output   batch_factor
    "gpt-4.1": {"input": 2.00, "cached_input": 0.50, "output": 8.00, "batch": 0.5},
    "gpt-4.1-mini": {"input": 0.40, "cached_input": 0.10, "output": 1.60, "batch": 0.5},
    "gpt-4.1-nano": {
        "input": 0.10,
        "cached_input": 0.025,
        "output": 0.40,
        "batch": 0.5,
    },
    "gpt-4o": {"input": 2.50, "cached_input": 1.25, "output": 10.00, "batch": 0.5},
    "gpt-4o-mini": {"input": 0.15, "cached_input": 0.075, "output": 0.60, "batch": 0.5},
    "o3": {"input": 2.00, "cached_input": 0.50, "output": 8.00, "batch": 0.5},
    "o4-mini": {"input": 1.10, "cached_input": 0.275, "output": 4.40, "batch": 0.5},
    "gpt-audio-mini": {
        "input": 0.60,
        "cached_input": 0.15,
        "output": 2.40,
        "batch": 0.5,
    },
    "gpt-audio": {
        "input": 2.50,
        "cached_input": 0.625,
        "output": 10.00,
        "batch": 0.5,
    },
    "gpt-5.2": {"input": 1.75, "cached_input": 0.175, "output": 14.00, "batch": 0.5},
    "gpt-5.1": {"input": 1.25, "cached_input": 0.125, "output": 10.00, "batch": 0.5},
    "gpt-5": {"input": 1.25, "cached_input": 0.125, "output": 10.00, "batch": 0.5},
    "gpt-5-mini": {"input": 0.25, "cached_input": 0.025, "output": 2.00, "batch": 0.5},
    "gpt-5-nano": {"input": 0.05, "cached_input": 0.005, "output": 0.40, "batch": 0.5},
    "o3-mini": {"input": 1.10, "cached_input": 0.55, "output": 4.40, "batch": 0.5},
    "o3-deep-research": {
        "input": 10.00,
        "cached_input": 2.50,
        "output": 40.00,
        "batch": 0.5,
    },
    "o4-mini-deep-research": {
        "input": 2.00,
        "cached_input": 0.50,
        "output": 8.00,
        "batch": 0.5,
    },
}


def _print_tier_explainer(verbose: bool = True) -> None:
    """Print a helpful explanation of usage tiers and how to increase them.

    This helper can be called when a user encounters errors that may be
    related to low quotas or tier limitations.  It summarises the
    qualifications for each tier and encourages users to check their
    payment status and billing page.  The message is only printed when
    ``verbose`` is ``True``.
    """
    if not verbose:
        return
    print("\n===== Tier explainer =====")
    print(
        "Your ability to call the OpenAI API is governed by usage tiers. Runs on lower usage tiers will be slower."
    )
    print(
        "As you spend more on the API, you are automatically graduated to higher tiers with larger token and request limits."
    )
    print("Here are the current tiers and how to qualify:")
    for tier in TIER_INFO:
        monthly = tier.get("monthly_quota")
        monthly_text = f"; monthly quota {monthly}" if monthly else ""
        print(f"  • {tier['tier']}: qualify by {tier['qualification']}{monthly_text}")
    print("If you are encountering rate limits or truncated outputs, consider:")
    print(
        "  – Checking your current spend and ensuring you have met the payment criteria for a higher tier."
    )
    print(
        "  – Adding funds at https://platform.openai.com/settings/organization/billing/."
    )


def _approx_tokens(text: str) -> int:
    """Roughly estimate the token count from a string by assuming ~1.5 tokens per word."""
    return int(len(str(text).split()) * 1.5)


def _decide_default_max_output_tokens(
    max_output_tokens: Optional[int],
    rate_headers: Optional[Dict[str, str]],
) -> Optional[int]:
    """Choose a default max output token cap when the user leaves it unset."""

    if max_output_tokens is not None:
        return max_output_tokens
    return None


def _lookup_model_pricing(model: str) -> Optional[Dict[str, float]]:
    """Find a pricing entry for ``model`` by prefix match (case‑insensitive)."""
    key = model.lower()
    # Find the most specific prefix match by selecting the longest matching prefix.
    best_match: Optional[Dict[str, float]] = None
    best_len = -1
    for prefix, pricing in MODEL_PRICING.items():
        if key.startswith(prefix) and len(prefix) > best_len:
            best_match = pricing
            best_len = len(prefix)
    return best_match


def _estimate_cost(
    prompts: List[str],
    n: int,
    max_output_tokens: Optional[int],
    model: str,
    use_batch: bool,
    *,
    sample_size: int = _ESTIMATION_SAMPLE_SIZE,
    estimated_output_tokens_per_prompt: int = ESTIMATED_OUTPUT_TOKENS_PER_PROMPT,
    extra_input_tokens_per_prompt: int = 0,
) -> Optional[Dict[str, float]]:
    """Estimate input/output tokens and cost for a set of prompts.

    Returns a dict with keys ``input_tokens``, ``output_tokens``, ``input_cost``, ``output_cost``, and ``total_cost``.
    If the model pricing is unavailable, returns ``None``.
    ``estimated_output_tokens_per_prompt`` controls the assumed output length when no
    explicit ``max_output_tokens`` is supplied.
    """
    pricing = _lookup_model_pricing(model)
    if pricing is None:
        return None
    # Estimate tokens: sample large datasets to avoid long start-up times.
    total_prompts = len(prompts)
    if total_prompts == 0:
        return None
    if sample_size and total_prompts > sample_size:
        # Deterministic sampling keeps estimates stable across runs.
        rng = random.Random(total_prompts)
        sampled = rng.sample(prompts, sample_size)
        avg_tokens = sum(_approx_tokens(p) for p in sampled) / float(sample_size)
        input_tokens = int(avg_tokens * total_prompts * max(1, n))
    else:
        input_tokens = sum(_approx_tokens(p) for p in prompts) * max(1, n)
    if extra_input_tokens_per_prompt > 0:
        input_tokens += extra_input_tokens_per_prompt * total_prompts * max(1, n)
    # Estimate output tokens: when no cutoff is provided we assume a reasonable default
    # number of output tokens per prompt.  This prevents the cost estimate from
    # ballooning for long inputs, which previously assumed the output could be as long
    # as the input.
    if max_output_tokens is None:
        # Use the per‑prompt estimate for each response
        output_tokens = estimated_output_tokens_per_prompt * max(1, n) * len(prompts)
    else:
        output_tokens = max_output_tokens * max(1, n) * len(prompts)
    cost_in = (input_tokens / 1_000_000) * pricing["input"]
    cost_out = (output_tokens / 1_000_000) * pricing["output"]
    if use_batch:
        cost_in *= pricing["batch"]
        cost_out *= pricing["batch"]
    return {
        "input_tokens": input_tokens,
        "output_tokens": output_tokens,
        "input_cost": cost_in,
        "output_cost": cost_out,
        "total_cost": cost_in + cost_out,
    }


def _estimate_tokens_per_call(
    avg_input_tokens: float,
    expected_output_tokens: Optional[int],
    n: int,
    *,
    estimated_output_tokens_per_prompt: int = ESTIMATED_OUTPUT_TOKENS_PER_PROMPT,
    output_headroom: float = OUTPUT_TOKEN_HEADROOM_INITIAL,
) -> float:
    """Return a conservative token estimate per call for planning throughput."""

    gating_output = (
        expected_output_tokens
        if expected_output_tokens is not None
        else estimated_output_tokens_per_prompt
    )
    gating_output *= max(1.0, float(output_headroom))
    tokens_per_call = max(1.0, (avg_input_tokens + gating_output) * max(1, n))
    return tokens_per_call


def _estimate_prompts_per_minute(
    allowed_req_pm: Optional[float],
    allowed_tok_pm: Optional[float],
    tokens_per_call: Optional[float],
) -> Tuple[Optional[float], Dict[str, float]]:
    """Estimate achievable prompts/min given budgets and per-call token needs."""

    details: Dict[str, float] = {}
    if tokens_per_call is None or tokens_per_call <= 0:
        return None, details
    if allowed_req_pm is not None and allowed_req_pm > 0:
        details["request_bound"] = float(allowed_req_pm)
    if allowed_tok_pm is not None and allowed_tok_pm > 0:
        details["token_bound"] = float(allowed_tok_pm) / float(tokens_per_call)
    candidates = [v for v in details.values() if v is not None]
    if not candidates:
        return None, details
    throughput = min(candidates)
    details["throughput"] = throughput
    return throughput, details


def _planned_ppm_and_details(
    allowed_req_pm: Optional[float],
    allowed_tok_pm: Optional[float],
    tokens_per_call: Optional[float],
) -> Tuple[Optional[int], Dict[str, float]]:
    """Return rounded prompts/minute with the underlying limiter details."""

    throughput, details = _estimate_prompts_per_minute(
        allowed_req_pm, allowed_tok_pm, tokens_per_call
    )
    if throughput is None or throughput <= 0:
        return None, details
    return max(1, int(round(throughput))), details


def _safe_planned_ppm_and_details(
    allowed_req_pm: Optional[float],
    allowed_tok_pm: Optional[float],
    tokens_per_call: Optional[float],
    *,
    context: str,
) -> Tuple[Optional[int], Dict[str, float]]:
    """Wrapper around `_planned_ppm_and_details` that never raises."""

    try:
        return _planned_ppm_and_details(
            allowed_req_pm, allowed_tok_pm, tokens_per_call
        )
    except Exception as exc:
        logger.error("Error while %s: %s", context, exc)
        if context == "tuning concurrency":
            print(f"Error while updating concurrency cap dynamically: {exc}")
        return None, {}


def _resolve_limiting_factor(
    throughput_details: Dict[str, float],
    *,
    allowed_req_pm: Optional[float],
    allowed_tok_pm: Optional[float],
) -> Tuple[Optional[str], Optional[float]]:
    """Return the primary limiter (requests/tokens per minute) and its value."""

    req_bound = throughput_details.get("request_bound")
    tok_bound = throughput_details.get("token_bound")
    if req_bound is None and tok_bound is None:
        return None, None
    if tok_bound is None or (req_bound is not None and req_bound <= tok_bound):
        return "requests per minute", allowed_req_pm if allowed_req_pm is not None else req_bound
    return "token budget per minute", allowed_tok_pm if allowed_tok_pm is not None else tok_bound


def _format_throughput_plan(
    *,
    planned_ppm: Optional[int],
    throughput_details: Dict[str, float],
    remaining_prompts: int,
    allowed_req_pm: Optional[float],
    allowed_tok_pm: Optional[float],
    include_upgrade_hint: bool = True,
    tokens_per_call: Optional[float] = None,
    parallel_ceiling: Optional[int] = None,
    n_parallels: Optional[int] = None,
    ultimate_parallel_cap: Optional[int] = None,
) -> List[str]:
    """Build human-friendly throughput summary lines."""

    del include_upgrade_hint, tokens_per_call
    parallel_cap = parallel_ceiling if parallel_ceiling is not None else n_parallels
    ultimate_parallel_cap = (
        max(1, ultimate_parallel_cap) if ultimate_parallel_cap is not None else n_parallels
    )
    fallback_line = "If running into API or timeout errors, try reducing n_parallels."

    if planned_ppm is None or planned_ppm <= 0:
        return [
            "Expected prompts per minute: unknown (rate-limit data unavailable; running with conservative defaults).",
            fallback_line,
        ]
    estimated_minutes = math.ceil(remaining_prompts / planned_ppm) if remaining_prompts > 0 else 0
    minimum_minutes = max(1, estimated_minutes)
    limiter, limiter_value = _resolve_limiting_factor(
        throughput_details,
        allowed_req_pm=allowed_req_pm,
        allowed_tok_pm=allowed_tok_pm,
    )
    lines = [
        f"Expected prompts per minute: maximum of {planned_ppm:,}",
        f"Estimated total mins: minimum of {minimum_minutes} minute{'s' if minimum_minutes != 1 else ''}",
    ]
    rate_label = limiter or "current rate limits"
    rate_val = f"~{int(limiter_value):,}/min" if limiter_value is not None else "rate limits"
    meets_parallel_cap = (
        parallel_cap is not None
        and planned_ppm >= parallel_cap
        and (limiter_value is None or limiter_value >= parallel_cap)
    )
    at_ultimate_parallel_cap = (
        ultimate_parallel_cap is not None
        and meets_parallel_cap
        and parallel_cap == ultimate_parallel_cap
    )
    if at_ultimate_parallel_cap:
        lines.append(
            f"Rate currently limited by n_parallels = {ultimate_parallel_cap}. Increase n_parallels for faster runs, if your machine allows."
        )
    elif limiter:
        lines.append(
            f"Rate currently limited by {limiter} ({rate_val}). Moving to a higher usage tier can raise these limits and allow faster runs."
        )
    else:
        lines.append(
            f"Rate currently limited by {rate_label} ({rate_val}). Moving to a higher usage tier can raise these limits and allow faster runs."
        )
    lines.append(fallback_line)
    return lines


def _estimate_dataset_stats(
    prompts: List[str],
    *,
    sample_size: int = _ESTIMATION_SAMPLE_SIZE,
    extra_input_tokens_per_prompt: int = 0,
) -> Dict[str, Any]:
    """Return rough totals for words and tokens without scanning massive datasets.

    The helper samples up to ``sample_size`` prompts and scales the totals to the
    full dataset.  This keeps initial reporting fast even for hundreds of
    thousands of prompts.
    """

    total_prompts = len(prompts)
    if total_prompts == 0:
        return {"word_count": 0, "token_count": 0, "sampled": False, "sample_size": 0}
    if sample_size and total_prompts > sample_size:
        rng = random.Random(total_prompts)
        sample = rng.sample(prompts, sample_size)
        avg_words = sum(len(str(p).split()) for p in sample) / float(sample_size)
        avg_tokens = sum(_approx_tokens(p) for p in sample) / float(sample_size)
        if extra_input_tokens_per_prompt > 0:
            avg_tokens += float(extra_input_tokens_per_prompt)
        return {
            "word_count": int(avg_words * total_prompts),
            "token_count": int(avg_tokens * total_prompts),
            "sampled": True,
            "sample_size": sample_size,
        }
    extra_tokens = extra_input_tokens_per_prompt * total_prompts
    return {
        "word_count": sum(len(str(p).split()) for p in prompts),
        "token_count": sum(_approx_tokens(p) for p in prompts) + extra_tokens,
        "sampled": False,
        "sample_size": total_prompts,
    }


def _ensure_runtime_dependencies(packages: Optional[List[str]] = None, *, verbose: bool = True) -> None:
    """Install missing runtime dependencies in a best-effort manner.

    The function is intentionally lightweight: it checks for a small set of
    packages and silently returns when everything is already present.  When a
    package is missing, ``pip`` is invoked to install only the missing items so
    the helper works in local, Colab, Databricks, and CI environments without
    user intervention.
    """

    global _DEPENDENCIES_VERIFIED
    if _DEPENDENCIES_VERIFIED:
        return
    pkgs = packages or ["wheel", "tiktoken", "aiolimiter", "httpx", "requests"]
    missing = [pkg for pkg in pkgs if importlib.util.find_spec(pkg) is None]
    if not missing:
        _DEPENDENCIES_VERIFIED = True
        return
    if verbose:
        print(
            "Installing missing dependencies for GABRIEL (once per session): "
            + ", ".join(sorted(missing))
        )
    try:
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "--quiet", "--upgrade", *missing],
            check=True,
        )
        _DEPENDENCIES_VERIFIED = True
    except Exception as exc:
        logger.warning("Automatic dependency installation failed: %s", exc)


def _print_run_banner(
    *,
    prompts: List[str],
    model: str,
    n: int,
    use_batch: bool,
    modality: Optional[str],
    web_search: bool,
    reasoning_effort: Optional[str],
    estimated_cost: Optional[Dict[str, float]],
    max_output_tokens: Optional[int],
    stats: Dict[str, Any],
    estimated_output_tokens_per_prompt: int = ESTIMATED_OUTPUT_TOKENS_PER_PROMPT,
    verbose: bool = True,
) -> None:
    """Print an immediate run overview so users see progress right away."""

    if not verbose:
        return
    print("\n===== Run kickoff =====")
    total_words = stats.get("word_count", 0) or 0
    words_per_prompt = (
        int(round(total_words / max(len(prompts), 1))) if prompts else 0
    )
    print(
        f"Prompts: {len(prompts):,} | Words: ~{total_words:,} | Words per prompt: ~{words_per_prompt:,}"
    )
    modality_segment = f" | modality: {modality}" if modality else ""
    reasoning_segment = f"Reasoning effort: {reasoning_effort or 'default'}"
    print(
        f"Model: {model} | {reasoning_segment} | Mode: {'batch' if use_batch else 'streaming'}{modality_segment}"
    )
    pricing = _lookup_model_pricing(model)
    if pricing:
        print(
            f"Pricing for model '{model}': input ${pricing['input']}/1M, output ${pricing['output']}/1M"
        )
    tokens_per_call = None
    if stats and prompts:
        avg_input_tokens = (stats.get("token_count") or 0) / max(1, len(prompts))
        tokens_per_call = _estimate_tokens_per_call(
            avg_input_tokens,
            max_output_tokens,
            n,
            estimated_output_tokens_per_prompt=estimated_output_tokens_per_prompt,
            output_headroom=OUTPUT_TOKEN_HEADROOM_INITIAL,
        )
    if estimated_cost:
        token_usage = (
            f"Estimated token usage: input {estimated_cost['input_tokens']:,}, output {estimated_cost['output_tokens']:,}"
        )
        if tokens_per_call:
            token_usage += f" | ~{int(round(tokens_per_call)):,} tokens per call"
        print(token_usage)
        print(
            f"Estimated {'batch' if use_batch else 'synchronous'} cost: ${estimated_cost['total_cost']:.2f} "
            f"(input: ${estimated_cost['input_cost']:.2f}, output: ${estimated_cost['output_cost']:.2f})"
        )
        if _is_multimodal_estimate(modality=modality, web_search=web_search):
            print(
                "Note: multimedia/web inputs can make cost estimates unreliable. Monitor usage in the OpenAI dashboard."
            )
    else:
        if pricing:
            print("Estimated token usage unavailable for this model.")
        print("Estimated cost unavailable for this model.")


def _infer_modality_from_inputs(
    prompt_images: Optional[Dict[str, List[str]]],
    prompt_audio: Optional[Dict[str, List[Dict[str, str]]]],
    prompt_pdfs: Optional[Dict[str, List[Dict[str, str]]]],
) -> str:
    present = []
    if prompt_images:
        present.append("image")
    if prompt_audio:
        present.append("audio")
    if prompt_pdfs:
        present.append("pdf")
    if not present:
        return "text"
    if len(present) > 1:
        return "mixed"
    return present[0]


def _estimate_extra_input_tokens_per_prompt(
    *,
    modality: Optional[str],
    web_search: bool,
    has_media: bool,
) -> int:
    """Return the extra input tokens to add per prompt for non-text inputs."""

    mode = (modality or "").lower()
    if mode and mode not in {"text", "entity"}:
        return NON_TEXT_INPUT_TOKEN_BUFFER
    if web_search or has_media:
        return NON_TEXT_INPUT_TOKEN_BUFFER
    return 0


def _is_multimodal_estimate(
    *,
    modality: Optional[str],
    web_search: bool,
    has_media: bool = False,
) -> bool:
    mode = (modality or "").lower()
    if web_search:
        return True
    if mode and mode not in {"text", "entity"}:
        return True
    if has_media:
        return True
    return False


def _require_api_key() -> str:
    """Return the API key or raise a runtime error if missing."""
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OPENAI_API_KEY environment variable must be set or passed via OpenAIClient(api_key)."
        )
    return api_key


def _get_rate_limit_headers(
    model: str = "gpt-5-mini", base_url: Optional[str] = None
) -> Optional[Dict[str, str]]:
    """Retrieve rate‑limit headers via a cheap API request.

    The OpenAI platform does not yet expose a dedicated endpoint for
    checking how many requests or tokens remain in your minute quota.  In
    practice, these values are only communicated via ``x‑ratelimit-*``
    headers on API responses.  The newer *Responses* API does not
    consistently include these headers as of mid‑2025【360365694688557†L209-L243】, but it
    may in the future.  To accommodate current and future behaviour, this
    helper first tries a minimal call against the Responses endpoint and
    falls back to a tiny call against the Chat completions endpoint when
    the headers are absent.  Both calls cap generation at one token to
    minimise usage.

    :param model: The model to use for the dummy request.  Matching the
      model you intend to use yields the most accurate limits.
    :returns: A dictionary containing limit and remaining values for
      requests and tokens if successful, otherwise ``None``.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if not api_key:
        return None
    base = base_url or os.getenv("OPENAI_BASE_URL") or "https://api.openai.com/v1"
    base = base.rstrip("/")
    # Define two candidate endpoints: the Responses API and the Chat
    # completions API.  In mid‑2025 the Responses API often omits rate‑limit
    # headers【360365694688557†L209-L243】, but OpenAI may add them in the future.  We try
    # the Responses endpoint first to see if headers are now included; if
    # missing, we fall back to a minimal call to the chat completions
    # endpoint.  Both calls cap generation at one token to minimise usage.
    candidates: List[Tuple[str, Dict[str, Any]]] = []
    # Responses API payload (first attempt)
    candidates.append(
        (
            f"{base}/responses",
            {
                "model": model,
                "input": [
                    {"role": "user", "content": "Hello"},
                ],
                "truncation": "auto",
                "max_output_tokens": 1,
            },
        )
    )
    # Chat completions API payload (fallback)
    candidates.append(
        (
            f"{base}/chat/completions",
            {
                "model": model,
                "messages": [
                    {"role": "user", "content": "Hello"},
                ],
                "max_tokens": 1,
            },
        )
    )
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    for url, payload in candidates:
        for client in (requests, httpx):
            if client is None:
                continue
            try:
                resp = client.post(url, headers=headers, json=payload, timeout=10)  # type: ignore
                h = getattr(resp, "headers", {})  # type: ignore
                new_h = {k.lower(): v for k, v in h.items()}
                # Collect both standard and usage‑based headers.  If the
                # responses API is missing them, continue to the next
                # candidate.
                limit_requests = new_h.get("x-ratelimit-limit-requests")
                remaining_requests = new_h.get("x-ratelimit-remaining-requests")
                reset_requests = new_h.get("x-ratelimit-reset-requests")
                limit_tokens = new_h.get("x-ratelimit-limit-tokens") or new_h.get(
                    "x-ratelimit-limit-tokens_usage_based"
                )
                remaining_tokens = new_h.get("x-ratelimit-remaining-tokens") or new_h.get(
                    "x-ratelimit-remaining-tokens_usage_based"
                )
                reset_tokens = new_h.get("x-ratelimit-reset-tokens") or new_h.get(
                    "x-ratelimit-reset-tokens_usage_based"
                )
                # If any of the primary values are present, return them.  Some
                # providers may omit remaining values until you are close to
                # the limit, so we treat the presence of a limit value as
                # success.
                if limit_requests or limit_tokens:
                    return {
                        "limit_requests": limit_requests,
                        "remaining_requests": remaining_requests,
                        "reset_requests": reset_requests,
                        "limit_tokens": limit_tokens,
                        "remaining_tokens": remaining_tokens,
                        "reset_tokens": reset_tokens,
                    }
            except Exception:
                # Ignore any errors and try the next client or candidate
                continue
    return None


def _print_usage_overview(
    prompts: List[str],
    n: int,
    max_output_tokens: Optional[int],
    model: str,