Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/strands_evals/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ class Case(BaseModel, Generic[InputT, OutputT]):
name: The name of the test case. This will be used to identify the test in the summary report.
session_id: The session ID for the test case. Automatically generates a UUID4 if not provided.
expected_output: The expected response given the input. eg. the agent's response
expected_assertion: Human-authored success assertions describing expected agent actions,
responses, or behaviors. Used by assertion-based evaluators (e.g., GoalSuccessRateEvaluator)
to judge whether the agent satisfied explicit criteria rather than inferring goals
from the conversation.
expected_trajectory: The expected trajectory of a task given the input. eg. sequence of tools
expected_interactions: The expected interaction sequence given the input (ideal for multi-agent systems).
metadata: Additional information about the test case.
Expand All @@ -45,6 +49,7 @@ class Case(BaseModel, Generic[InputT, OutputT]):
session_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
input: InputT
expected_output: OutputT | None = None
expected_assertion: str | None = None
expected_trajectory: list[Any] | None = None
expected_interactions: list[Interaction] | None = None
expected_environment_state: list[EnvironmentState] | None = None
Expand Down
90 changes: 80 additions & 10 deletions src/strands_evals/evaluators/goal_success_rate_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
from ..types.trace import EvaluationLevel, SessionLevelInput
from .evaluator import Evaluator
from .prompt_templates.goal_success_rate import get_template
from .prompt_templates.goal_success_rate import get_assertion_template, get_template


class GoalSuccessScore(str, Enum):
Expand All @@ -26,8 +26,34 @@ class GoalSuccessRating(BaseModel):
score: GoalSuccessScore = Field(description="Score should be one of 'Yes' or 'No'")


class GoalSuccessAssertionScore(str, Enum):
    """Binary assertion-based goal success ratings.

    Verdict labels produced by the judge LLM in assertion mode; the evaluator
    maps them to numeric scores (SUCCESS=1.0, FAILURE=0.0).
    """

    SUCCESS = "SUCCESS"  # the agent's behavior satisfies the success assertions
    FAILURE = "FAILURE"  # the assertions were not satisfied


class GoalSuccessAssertionRating(BaseModel):
    """Structured output for assertion-based goal success evaluation."""

    # Free-text justification from the judge; surfaced as EvaluationOutput.reason.
    reasoning: str = Field(description="Brief explanation of the evaluation")
    # Binary verdict; surfaced as EvaluationOutput.label and mapped to the numeric score.
    verdict: GoalSuccessAssertionScore = Field(description="Verdict should be one of 'SUCCESS' or 'FAILURE'")


class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):
"""Evaluates whether all user goals were successfully achieved in a conversation."""
"""Evaluates whether all user goals were successfully achieved in a conversation.

Supports two modes:
- **Basic mode**: Evaluates goal success based on conversation analysis alone.
The judge LLM infers user goals from the conversation and checks whether they were met.
Uses a Yes/No scoring rubric (Yes=1.0, No=0.0).
- **Assertion mode**: When ``expected_assertion`` is provided on the evaluation case,
evaluates whether the agent's behavior satisfies the specified success assertions —
human-authored statements describing expected agent actions, responses, or behaviors.
Unlike basic mode, assertion mode judges against explicit criteria defined upfront
rather than inferring goals from the conversation.
Uses a SUCCESS/FAILURE scoring rubric (SUCCESS=1.0, FAILURE=0.0).
"""

evaluation_level = EvaluationLevel.SESSION_LEVEL

Expand All @@ -36,19 +62,42 @@ class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):
GoalSuccessScore.NO: 0.0,
}

_assertion_score_mapping = {
GoalSuccessAssertionScore.SUCCESS: 1.0,
GoalSuccessAssertionScore.FAILURE: 0.0,
}

def __init__(
    self,
    version: str = "v0",
    model: Union[Model, str, None] = None,
    system_prompt: str | None = None,
    assertion_system_prompt: str | None = None,
):
    """Configure the evaluator.

    Args:
        version: Prompt-template version tag, used when a prompt is not overridden.
        model: Judge model (model instance, model id string, or None for the default).
        system_prompt: Optional override for the basic-mode system prompt.
        assertion_system_prompt: Optional override for the assertion-mode system prompt.
    """
    super().__init__()
    # Fall back to the versioned templates only for prompts the caller did not supply.
    if system_prompt is None:
        system_prompt = get_template(version).SYSTEM_PROMPT
    if assertion_system_prompt is None:
        assertion_system_prompt = get_assertion_template(version).SYSTEM_PROMPT
    self.system_prompt = system_prompt
    self.assertion_system_prompt = assertion_system_prompt
    self.version = version
    self.model = model

def _has_assertion(self, evaluation_case: EvaluationData[InputT, OutputT]) -> bool:
"""Check if the evaluation case contains expected_assertion for assertion mode."""
return bool(evaluation_case.expected_assertion)

def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
    """Score the case, dispatching to assertion mode when the case defines assertions."""
    session_input = self._parse_trajectory(evaluation_case)

    # Basic mode unless the case supplies explicit success assertions.
    if not self._has_assertion(evaluation_case):
        return self._evaluate_basic(session_input)
    return self._evaluate_with_assertion(session_input, evaluation_case)

def _evaluate_basic(self, session_input: SessionLevelInput) -> list[EvaluationOutput]:
"""Evaluate goal success using the basic prompt (no criteria)."""
prompt = self._format_prompt(session_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = evaluator_agent(prompt, structured_output_model=GoalSuccessRating)
Expand All @@ -63,19 +112,23 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
)
]

async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
session_input = self._parse_trajectory(evaluation_case)
prompt = self._format_prompt(session_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = await evaluator_agent.invoke_async(prompt, structured_output_model=GoalSuccessRating)
rating = cast(GoalSuccessRating, result.structured_output)
normalized_score = self._score_mapping[rating.score]
def _evaluate_with_assertion(
    self,
    session_input: SessionLevelInput,
    evaluation_case: EvaluationData[InputT, OutputT],
) -> list[EvaluationOutput]:
    """Judge the session against the case's explicit success assertions.

    Uses the assertion-mode system prompt and a SUCCESS/FAILURE rubric
    (SUCCESS=1.0, FAILURE=0.0).
    """
    judge = Agent(
        model=self.model,
        system_prompt=self.assertion_system_prompt,
        callback_handler=None,
    )
    response = judge(
        self._format_assertion_prompt(session_input, evaluation_case),
        structured_output_model=GoalSuccessAssertionRating,
    )
    rating = cast(GoalSuccessAssertionRating, response.structured_output)
    score = self._assertion_score_mapping[rating.verdict]
    output = EvaluationOutput(
        score=score,
        test_pass=score >= 1.0,  # pass only on a SUCCESS verdict
        reason=rating.reasoning,
        label=rating.verdict,
    )
    return [output]

Expand All @@ -90,3 +143,20 @@ def _format_prompt(self, session_input: SessionLevelInput) -> str:
parts.append(f"# Conversation record\n{self._format_session_history(session_input.session_history)}")

return "\n\n".join(parts)

def _format_assertion_prompt(
    self,
    session_input: SessionLevelInput,
    evaluation_case: EvaluationData[InputT, OutputT],
) -> str:
    """Build the judge prompt: the conversation record (if any) plus the assertions."""
    sections: list[str] = []

    history = session_input.session_history
    if history:
        sections.append(f"CONVERSATION RECORD:\n{self._format_session_history(history)}")

    # expected_assertion is non-empty whenever this mode is selected; `or ""`
    # keeps the prompt well-formed if called directly with a bare case.
    sections.append(f"SUCCESS ASSERTIONS:\n{evaluation_case.expected_assertion or ''}")

    return "\n\n".join(sections)
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
from . import goal_success_rate_v0
from . import goal_success_rate_v0, goal_success_rate_with_assertions_v0

# Registry of basic (goal-inference) prompt template modules, keyed by version tag.
VERSIONS = {
    "v0": goal_success_rate_v0,
}

# Registry of assertion-based prompt template modules, keyed by version tag.
ASSERTION_VERSIONS = {
    "v0": goal_success_rate_with_assertions_v0,
}

# Version served when callers do not request one explicitly.
DEFAULT_VERSION = "v0"


def get_template(version: str = DEFAULT_VERSION):
    """Return the basic goal-success prompt template module for *version*.

    Raises KeyError if the version is not registered in VERSIONS.
    """
    template = VERSIONS[version]
    return template


def get_assertion_template(version: str = DEFAULT_VERSION):
    """Return the assertion-based prompt template module for *version*.

    Raises KeyError if the version is not registered in ASSERTION_VERSIONS.
    """
    template = ASSERTION_VERSIONS[version]
    return template
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# System prompt for the assertion-based GoalSuccessRateEvaluator judge (v0).
# The judge receives a conversation record plus human-authored success
# assertions and must return a SUCCESS/FAILURE verdict with brief reasoning.
# NOTE: this string is sent to the judge LLM verbatim — edits change behavior.
SYSTEM_PROMPT = """You are an evaluator for an LLM-based agent.

You will be provided with:
1. A conversation record between a user and an AI assistant.
2. A set of success assertions that define what the agent must accomplish.

TASK:
Decide whether the agent successfully completed the task.

INSTRUCTIONS:
- Judge only based on whether the agent behavior satisfies the success assertions.
- Evaluate assertions by their intent, not by exact text matching. Minor differences in wording, parameter ordering, or formatting should not cause a failure.
- If an assertion describes a specific action or tool call to achieve a particular outcome, and the agent achieved the same outcome through an alternative approach clearly evidenced in the conversation, consider the assertion satisfied.
- Do not rationalize or make assumptions beyond what the conversation shows.
- Ignore style and verbosity.
- Keep your reasoning concise — under 200 words."""
1 change: 1 addition & 0 deletions src/strands_evals/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ async def _run_task_async(
name=case.name,
input=case.input,
expected_output=case.expected_output,
expected_assertion=case.expected_assertion,
expected_trajectory=case.expected_trajectory,
expected_interactions=case.expected_interactions,
expected_environment_state=case.expected_environment_state,
Expand Down
6 changes: 6 additions & 0 deletions src/strands_evals/types/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@ class EvaluationData(BaseModel, Generic[InputT, OutputT]):
input: The input to the task. eg. the query to the agent
actual_output: The actual response given the input.
expected_output: The expected response given the input.
expected_assertion: Human-authored success assertions describing expected agent actions,
responses, or behaviors. Used by assertion-based evaluators (e.g., GoalSuccessRateEvaluator)
to judge whether the agent satisfied explicit criteria rather than inferring goals
from the conversation. Example: 'find_user_id_by_name_zip is called with
{"first_name": "Yusuf", "last_name": "Rossi", "zip": "19122"}'
actual_trajectory: The actual trajectory of a task given the input.
expected_trajectory: The expected trajectory of a task given the input.
name: The name of the test case. This will be used to identify the test in the summary report.
Expand All @@ -94,6 +99,7 @@ class EvaluationData(BaseModel, Generic[InputT, OutputT]):
actual_output: OutputT | None = None
name: str | None = None
expected_output: OutputT | None = None
expected_assertion: str | None = None
expected_trajectory: Union[list[Any], Session, None] = None
actual_trajectory: Union[list[Any], Session, None] = None
metadata: dict[str, Any] | None = None
Expand Down
115 changes: 101 additions & 14 deletions tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@
import pytest

from strands_evals.evaluators import GoalSuccessRateEvaluator
from strands_evals.evaluators.goal_success_rate_evaluator import GoalSuccessRating, GoalSuccessScore
from strands_evals.evaluators.goal_success_rate_evaluator import (
GoalSuccessAssertionRating,
GoalSuccessAssertionScore,
GoalSuccessRating,
GoalSuccessScore,
)
from strands_evals.types import EvaluationData
from strands_evals.types.trace import (
AgentInvocationSpan,
Expand Down Expand Up @@ -53,15 +58,20 @@ def test_init_with_defaults():
assert evaluator.version == "v0"
assert evaluator.model is None
assert evaluator.system_prompt is not None
assert evaluator.assertion_system_prompt is not None
assert evaluator.assertion_system_prompt != evaluator.system_prompt
assert evaluator.evaluation_level == EvaluationLevel.SESSION_LEVEL


def test_init_with_custom_values():
    """Constructor overrides (version, model, both system prompts) are stored as-is."""
    # Note: this span as rendered interleaved the pre-change call with its
    # replacement; only the new-side (assertion-aware) call is kept.
    evaluator = GoalSuccessRateEvaluator(
        version="v1", model="gpt-4", system_prompt="Custom", assertion_system_prompt="Custom assertion"
    )

    assert evaluator.version == "v1"
    assert evaluator.model == "gpt-4"
    assert evaluator.system_prompt == "Custom"
    assert evaluator.assertion_system_prompt == "Custom assertion"


@patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
Expand Down Expand Up @@ -106,24 +116,101 @@ def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value,
assert result[0].label == score


@pytest.mark.asyncio
@patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
async def test_evaluate_async(mock_agent_class, evaluation_data):
mock_agent = Mock()
@pytest.fixture
def evaluation_data_with_assertion():
    """EvaluationData with a session trajectory and an ``expected_assertion`` set.

    Cases carrying an assertion route GoalSuccessRateEvaluator into assertion
    mode; the trajectory contains one agent invocation and one calculator
    tool call so the judge prompt has a conversation record to format.
    """
    now = datetime.now()
    # Single shared span window; timing values are irrelevant to these tests.
    span_info = SpanInfo(session_id="test-session", start_time=now, end_time=now)

    tool_config = ToolConfig(name="calculator", description="Evaluate mathematical expressions")

    agent_span = AgentInvocationSpan(
        span_info=span_info,
        user_prompt="What is 2 + 2?",
        agent_response="The answer is 4.",
        available_tools=[tool_config],
    )

    tool_span = ToolExecutionSpan(
        span_info=span_info,
        tool_call=ToolCall(name="calculator", arguments={"expression": "2+2"}, tool_call_id="1"),
        tool_result=ToolResult(content="4", tool_call_id="1"),
    )

    trace = Trace(spans=[agent_span, tool_span], trace_id="trace1", session_id="test-session")
    session = Session(traces=[trace], session_id="test-session")

    return EvaluationData(
        input="What is 2 + 2?",
        actual_output="The answer is 4.",
        actual_trajectory=session,
        name="test-criteria",
        expected_assertion="The agent should use the calculator tool and return the correct answer of 4.",
    )


def test_has_assertion_true(evaluation_data_with_assertion):
    # A populated expected_assertion selects assertion mode.
    assert GoalSuccessRateEvaluator()._has_assertion(evaluation_data_with_assertion) is True


def test_has_assertion_false(evaluation_data):
    # The plain fixture has no expected_assertion, so basic mode applies.
    assert GoalSuccessRateEvaluator()._has_assertion(evaluation_data) is False


def test_has_assertion_none():
    # Default (None) expected_assertion must not trigger assertion mode.
    case = EvaluationData(input="test")
    assert GoalSuccessRateEvaluator()._has_assertion(case) is False


def test_has_assertion_empty_string():
    # An empty string counts as "no assertion provided".
    case = EvaluationData(input="test", expected_assertion="")
    assert GoalSuccessRateEvaluator()._has_assertion(case) is False

async def mock_invoke_async(*args, **kwargs):
mock_result = Mock()
mock_result.structured_output = GoalSuccessRating(reasoning="All goals achieved", score=GoalSuccessScore.YES)
return mock_result

mock_agent.invoke_async = mock_invoke_async
@patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
def test_evaluate_with_assertion(mock_agent_class, evaluation_data_with_assertion):
    """A SUCCESS verdict in assertion mode maps to score 1.0 and test_pass=True.

    Note: this span as rendered interleaved lines of the deleted async test
    (an ``await`` inside a sync def and stale asserts); only the new-side
    synchronous assertion-mode test is kept.
    """
    mock_agent = Mock()
    mock_result = Mock()
    mock_result.structured_output = GoalSuccessAssertionRating(
        reasoning="Agent used calculator and returned 4", verdict=GoalSuccessAssertionScore.SUCCESS
    )
    mock_agent.return_value = mock_result
    mock_agent_class.return_value = mock_agent
    evaluator = GoalSuccessRateEvaluator()

    result = evaluator.evaluate(evaluation_data_with_assertion)

    assert len(result) == 1
    assert result[0].score == 1.0
    assert result[0].test_pass is True
    assert result[0].reason == "Agent used calculator and returned 4"
    assert result[0].label == GoalSuccessAssertionScore.SUCCESS


@pytest.mark.parametrize(
    "verdict,expected_value,expected_pass",
    [
        (GoalSuccessAssertionScore.SUCCESS, 1.0, True),
        (GoalSuccessAssertionScore.FAILURE, 0.0, False),
    ],
)
@patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
def test_assertion_score_mapping(
    mock_agent_class, evaluation_data_with_assertion, verdict, expected_value, expected_pass
):
    """Each assertion verdict maps to its numeric score, pass flag, and label."""
    mock_agent = Mock()
    mock_result = Mock()
    mock_result.structured_output = GoalSuccessAssertionRating(reasoning="Test", verdict=verdict)
    mock_agent.return_value = mock_result
    mock_agent_class.return_value = mock_agent
    evaluator = GoalSuccessRateEvaluator()

    result = evaluator.evaluate(evaluation_data_with_assertion)

    assert len(result) == 1
    assert result[0].score == expected_value
    assert result[0].test_pass == expected_pass
    assert result[0].label == verdict
Loading