Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,37 @@ See if the Hugging Face version of GPT2 is vulnerable to DAN 11.0
python3 -m garak --target_type huggingface --target_name gpt2 --probes dan.Dan_11_0
```

## Resumable Scans

`garak` supports resumable scans that allow you to continue interrupted scans without starting from scratch. This is useful for:
- Long-running scans that may be interrupted by network issues, rate limits, or system crashes
- Saving API costs by avoiding redundant prompts
- Enabling flexible scan scheduling (pause/resume)

### Basic Usage

```bash
# Start a resumable scan
python3 -m garak --target_type openai --target_name gpt-4 --probes all

# List saved runs (newest first); copy the run_id you need
python3 -m garak --list_runs

# Resume with the same probe list and target as the original scan
python3 -m garak --resume 550e8400-e29b-41d4-a716-446655440000

# Delete old run state
python3 -m garak --delete_run 550e8400-e29b-41d4-a716-446655440000
```

### Resume Notes
Resume is probe-level: probes that are fully completed are skipped on resume.
Each resume writes a new timestamped report file; the original report is left untouched.

Resuming verifies that your **probe list** and **generator target** (`--target_type` / `--target_name`) match the original run; if they differ, the resume aborts with an error so that the completed-probe bookkeeping remains valid.

Saved run state is stored under the XDG data directory in a `runs/` subdirectory (for example: `<xdg_data_home>/garak/runs/<run_id>/state.json`).
You can manage saved state with `--list_runs` and `--delete_run`.

## Reading the results

Expand Down
2 changes: 2 additions & 0 deletions garak/_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ class TransientConfig(GarakSubConfig):
hitlogfile = None
args = None # only access this when determining what was passed on CLI
run_id = None
resume_run_id = None
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is simply not needed.

package_dir = pathlib.Path(__file__).parents[0]
config_dir = xdg_config_home() / project_dir_name
data_dir = xdg_data_home() / project_dir_name
Expand Down Expand Up @@ -122,6 +123,7 @@ def _nested_dict():
run.soft_probe_prompt_cap = 64
run.target_lang = "en"
run.langproviders = []
# Note: resume configuration is provided only via transient.resume_run_id
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is simply not needed.


# placeholder
# generator, probe, detector, buff = {}, {}, {}, {}
Expand Down
6 changes: 3 additions & 3 deletions garak/analyze/bootstrap_ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,15 +45,15 @@ def _bootstrap_calculation(

n = len(results)
corrected_asrs = np.empty(num_iterations)

# No correction needed when denominator ≈ 1.0
# This occurs when: (1) perfect detector (Se=Sp=1.0), or (2) fallback triggered above (Se+Sp-1 < 0.01)
is_perfect_detector = np.isclose(denominator, 1.0)

for i in range(num_iterations):
resampled_results = np.random.choice(results, size=n, replace=True)
p_obs = resampled_results.mean()

if is_perfect_detector:
corrected_asrs[i] = p_obs
else:
Expand Down
106 changes: 52 additions & 54 deletions garak/analyze/ci_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,7 @@ def _extract_reporting_config_from_setup(report_path: str) -> dict:
entry = json.loads(first_line)
if entry.get("entry_type") != "start_run setup":
return {}
return {
k: v for k, v in entry.items()
if k.startswith("reporting.")
}
return {k: v for k, v in entry.items() if k.startswith("reporting.")}


def _reconstruct_binary_from_aggregates(passed: int, failed: int) -> List[int]:
Expand All @@ -68,109 +65,109 @@ def calculate_ci_from_report(
report_path: str,
probe_detector_pairs: Optional[List[Tuple[str, str]]] = None,
num_iterations: Optional[int] = None,
confidence_level: Optional[float] = None
confidence_level: Optional[float] = None,
) -> Dict[Tuple[str, str], Tuple[float, float]]:
"""Calculate bootstrap CIs for probe/detector pairs using report digest aggregates"""
report_file = Path(report_path)

if not report_file.exists():
raise FileNotFoundError(
f"Report file not found at: {report_file}. "
f"Expected to find garak report JSONL file."
)

# Pull defaults from config
if num_iterations is None:
num_iterations = _config.reporting.bootstrap_num_iterations
if confidence_level is None:
confidence_level = _config.reporting.bootstrap_confidence_level

# Read digest entry from report
digest = _get_report_digest(str(report_file))

if digest is None:
raise ValueError(
f"Report {report_file} missing 'digest' entry. "
f"Digest is required for CI calculation from aggregates. "
f"Ensure report was generated with garak v0.11.0 or later."
)

eval_data = digest.get("eval", {})
if not eval_data:
logging.warning("No evaluation data found in digest for %s", report_file)
return {}

# Load detector metrics for Se/Sp correction
detector_metrics = get_detector_metrics()
min_sample_size = _config.reporting.bootstrap_min_sample_size

ci_results = {}

# Iterate through digest structure: probe_group -> probe_class -> detector
for probe_group in eval_data:
for probe_key in eval_data[probe_group]:
if probe_key == "_summary":
continue

# Parse probe module and class from key (format: "module.class")
if "." not in probe_key:
continue

probe_name = probe_key

for detector_key in eval_data[probe_group][probe_key]:
if detector_key == "_summary":
continue

detector_name = detector_key

# Skip if not in requested pairs (if specified)
if probe_detector_pairs is not None:
if (probe_name, detector_name) not in probe_detector_pairs:
continue

detector_result = eval_data[probe_group][probe_key][detector_key]

# Extract aggregates
total = detector_result.get("total_evaluated", 0)
passed = detector_result.get("passed", 0)

if total == 0:
logging.warning(
"No evaluated samples for probe=%s, detector=%s",
probe_name,
detector_name
detector_name,
)
continue

# Check minimum sample size
if total < min_sample_size:
logging.warning(
"Insufficient samples for CI calculation: probe=%s, detector=%s, n=%d (minimum: %d)",
probe_name,
detector_name,
total,
min_sample_size
min_sample_size,
)
continue

# Reconstruct binary data from aggregates
# Order irrelevant: bootstrap resamples randomly with replacement
failed = total - passed
binary_results = _reconstruct_binary_from_aggregates(passed, failed)

# Get detector Se/Sp for correction
se, sp = detector_metrics.get_detector_se_sp(detector_key)

# Calculate bootstrap CI
ci_result = calculate_bootstrap_ci(
results=binary_results,
sensitivity=se,
specificity=sp,
num_iterations=num_iterations,
confidence_level=confidence_level
confidence_level=confidence_level,
)

if ci_result is not None:
ci_results[(probe_name, detector_name)] = ci_result
logging.debug(
Expand All @@ -179,9 +176,9 @@ def calculate_ci_from_report(
detector_name,
ci_result[0],
ci_result[1],
total
total,
)

return ci_results


Expand All @@ -190,86 +187,87 @@ def update_eval_entries_with_ci(
ci_results: Dict[Tuple[str, str], Tuple[float, float]],
output_path: Optional[str] = None,
confidence_method: Optional[str] = None,
confidence_level: Optional[float] = None
confidence_level: Optional[float] = None,
) -> None:
"""Update eval entries in report JSONL with new CI values, overwrites if output_path is None"""
if confidence_method is None:
confidence_method = _config.reporting.confidence_interval_method
if confidence_level is None:
confidence_level = _config.reporting.bootstrap_confidence_level
report_file = Path(report_path)

if not report_file.exists():
raise FileNotFoundError(
f"Report file not found at: {report_file}. "
f"Cannot update eval entries."
f"Report file not found at: {report_file}. " f"Cannot update eval entries."
)

# Use pathlib.Path for output handling
if output_path is None:
output_file = report_file.with_suffix(".tmp")
overwrite = True
else:
output_file = Path(output_path)
overwrite = False

try:
with open(report_file, "r", encoding="utf-8") as infile, \
open(output_file, "w", encoding="utf-8") as outfile:

with (
open(report_file, "r", encoding="utf-8") as infile,
open(output_file, "w", encoding="utf-8") as outfile,
):

for line_num, line in enumerate(infile, 1):
try:
entry = json.loads(line.strip())
except json.JSONDecodeError as e:
raise json.JSONDecodeError(
f"Malformed JSON at line {line_num} in {report_file}: {e.msg}",
e.doc,
e.pos
e.pos,
) from e

if entry.get("entry_type") == "digest":
logging.debug("Stripping stale digest entry (will be recalculated)")
continue

if entry.get("entry_type") == "start_run setup":
for param in _config.reporting_params:
entry[f"reporting.{param}"] = getattr(
_config.reporting, param
)
entry[f"reporting.{param}"] = getattr(_config.reporting, param)

if entry.get("entry_type") == "eval":
probe = entry.get("probe")
detector = entry.get("detector")

if probe is None or detector is None:
outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")
continue

key = (probe, detector)

if key in ci_results:
ci_lower, ci_upper = ci_results[key]
entry["confidence_method"] = confidence_method
entry["confidence"] = str(confidence_level)
entry["confidence_lower"] = ci_lower / 100.0 # Store as 0-1 scale
entry["confidence_lower"] = (
ci_lower / 100.0
) # Store as 0-1 scale
entry["confidence_upper"] = ci_upper / 100.0

logging.debug(
"Updated CI for %s / %s: [%.2f, %.2f]",
probe,
detector,
ci_lower,
ci_upper
ci_upper,
)

outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")

if overwrite:
output_file.replace(report_file)
logging.info("Updated report file: %s", report_file)
else:
logging.info("Wrote updated report to: %s", output_file)

except OSError as e:
if overwrite and output_file.exists():
output_file.unlink()
Expand Down
8 changes: 6 additions & 2 deletions garak/analyze/rebuild_cis.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,14 +111,18 @@ def rebuild_cis_for_report(
existing_method = existing.get("confidence_method", "unknown")
existing_level = existing.get("confidence_level")
if existing_method != ci_method:
print(f"📊 Report used '{existing_method}' method. Rebuilding with '{ci_method}'.")
print(
f"📊 Report used '{existing_method}' method. Rebuilding with '{ci_method}'."
)
if existing_level is not None and abs(existing_level - active_level) > 1e-9:
print(
f"📊 Report has existing CIs at {existing_level * 100:.1f}% confidence. "
f"Rebuilding with {active_level * 100:.1f}% confidence."
)
else:
print(f"📊 Rebuilding CIs at {active_level * 100:.1f}% confidence for {report_file}")
print(
f"📊 Rebuilding CIs at {active_level * 100:.1f}% confidence for {report_file}"
)
else:
print(
f"📊 No existing CIs found in report. "
Expand Down
Loading
Loading