diff --git a/src/strands_evals/case.py b/src/strands_evals/case.py
index aa438604..d344380e 100644
--- a/src/strands_evals/case.py
+++ b/src/strands_evals/case.py
@@ -19,6 +19,10 @@ class Case(BaseModel, Generic[InputT, OutputT]):
         name: The name of the test case. This will be used to identify the test in the summary report.
         session_id: The session ID for the test case. Automatically generates a UUID4 if not provided.
         expected_output: The expected response given the input. eg. the agent's response
+        expected_assertion: Human-authored success assertions describing expected agent actions,
+            responses, or behaviors. Used by assertion-based evaluators (e.g., GoalSuccessRateEvaluator)
+            to judge whether the agent satisfied explicit criteria rather than inferring goals
+            from the conversation.
         expected_trajectory: The expected trajectory of a task given the input. eg. sequence of tools
         expected_interactions: The expected interaction sequence given the input (ideal for multi-agent systems).
         metadata: Additional information about the test case.
@@ -45,6 +49,7 @@ class Case(BaseModel, Generic[InputT, OutputT]):
     session_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
     input: InputT
     expected_output: OutputT | None = None
+    expected_assertion: str | None = None
     expected_trajectory: list[Any] | None = None
     expected_interactions: list[Interaction] | None = None
     expected_environment_state: list[EnvironmentState] | None = None
diff --git a/src/strands_evals/evaluators/goal_success_rate_evaluator.py b/src/strands_evals/evaluators/goal_success_rate_evaluator.py
--- a/src/strands_evals/evaluators/goal_success_rate_evaluator.py
+++ b/src/strands_evals/evaluators/goal_success_rate_evaluator.py
@@ -9,7 +9,7 @@
 from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
 from ..types.trace import EvaluationLevel, SessionLevelInput
 from .evaluator import Evaluator
-from .prompt_templates.goal_success_rate import get_template
+from .prompt_templates.goal_success_rate import get_assertion_template, get_template
 
 
 class GoalSuccessScore(str, Enum):
@@ -26,8 +26,34 @@ class GoalSuccessRating(BaseModel):
     score: GoalSuccessScore = Field(description="Score should be one of 'Yes' or 'No'")
 
 
+class GoalSuccessAssertionScore(str, Enum):
+    """Binary assertion-based goal success ratings."""
+
+    SUCCESS = "SUCCESS"
+    FAILURE = "FAILURE"
+
+
+class GoalSuccessAssertionRating(BaseModel):
+    """Structured output for assertion-based goal success evaluation."""
+
+    reasoning: str = Field(description="Brief explanation of the evaluation")
+    verdict: GoalSuccessAssertionScore = Field(description="Verdict should be one of 'SUCCESS' or 'FAILURE'")
+
+
 class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):
-    """Evaluates whether all user goals were successfully achieved in a conversation."""
+    """Evaluates whether all user goals were successfully achieved in a conversation.
+
+    Supports two modes:
+    - **Basic mode**: Evaluates goal success based on conversation analysis alone.
+      The judge LLM infers user goals from the conversation and checks whether they were met.
+      Uses a Yes/No scoring rubric (Yes=1.0, No=0.0).
+    - **Assertion mode**: When ``expected_assertion`` is provided on the evaluation case,
+      evaluates whether the agent's behavior satisfies the specified success assertions —
+      human-authored statements describing expected agent actions, responses, or behaviors.
+      Unlike basic mode, assertion mode judges against explicit criteria defined upfront
+      rather than inferring goals from the conversation.
+      Uses a SUCCESS/FAILURE scoring rubric (SUCCESS=1.0, FAILURE=0.0).
+    """
 
     evaluation_level = EvaluationLevel.SESSION_LEVEL
 
@@ -36,46 +62,124 @@ class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):
         GoalSuccessScore.NO: 0.0,
     }
 
+    _assertion_score_mapping = {
+        GoalSuccessAssertionScore.SUCCESS: 1.0,
+        GoalSuccessAssertionScore.FAILURE: 0.0,
+    }
+
     def __init__(
         self,
         version: str = "v0",
         model: Union[Model, str, None] = None,
         system_prompt: str | None = None,
+        assertion_system_prompt: str | None = None,
     ):
         super().__init__()
         self.system_prompt = system_prompt if system_prompt is not None else get_template(version).SYSTEM_PROMPT
+        if assertion_system_prompt is None:
+            try:
+                assertion_system_prompt = get_assertion_template(version).SYSTEM_PROMPT
+            except KeyError:
+                # A prompt version may ship without an assertion template. Keep pre-existing calls such as
+                # GoalSuccessRateEvaluator(version=..., system_prompt=...) working and defer the error
+                # until assertion mode is actually used.
+                assertion_system_prompt = None
+        self.assertion_system_prompt = assertion_system_prompt
         self.version = version
         self.model = model
 
+    def _has_assertion(self, evaluation_case: EvaluationData[InputT, OutputT]) -> bool:
+        """Check if the evaluation case contains a non-blank expected_assertion for assertion mode."""
+        assertion = evaluation_case.expected_assertion
+        return bool(assertion and assertion.strip())
+
     def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
         session_input = self._parse_trajectory(evaluation_case)
+
+        if self._has_assertion(evaluation_case):
+            return self._evaluate_with_assertion(session_input, evaluation_case)
+
+        return self._evaluate_basic(session_input)
+
+    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
+        session_input = self._parse_trajectory(evaluation_case)
+
+        if self._has_assertion(evaluation_case):
+            return await self._evaluate_with_assertion_async(session_input, evaluation_case)
+
+        return await self._evaluate_basic_async(session_input)
+
+    def _evaluate_basic(self, session_input: SessionLevelInput) -> list[EvaluationOutput]:
+        """Evaluate goal success using the basic prompt (no assertions)."""
         prompt = self._format_prompt(session_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
         result = evaluator_agent(prompt, structured_output_model=GoalSuccessRating)
         rating = cast(GoalSuccessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
         return [
             EvaluationOutput(
                 score=normalized_score,
                 test_pass=normalized_score >= 1.0,
                 reason=rating.reasoning,
                 label=rating.score,
             )
         ]
 
-    async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
-        session_input = self._parse_trajectory(evaluation_case)
+    async def _evaluate_basic_async(self, session_input: SessionLevelInput) -> list[EvaluationOutput]:
+        """Async counterpart of _evaluate_basic."""
         prompt = self._format_prompt(session_input)
         evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
         result = await evaluator_agent.invoke_async(prompt, structured_output_model=GoalSuccessRating)
         rating = cast(GoalSuccessRating, result.structured_output)
         normalized_score = self._score_mapping[rating.score]
         return [
             EvaluationOutput(
                 score=normalized_score,
                 test_pass=normalized_score >= 1.0,
                 reason=rating.reasoning,
                 label=rating.score,
             )
         ]
 
+    def _evaluate_with_assertion(
+        self,
+        session_input: SessionLevelInput,
+        evaluation_case: EvaluationData[InputT, OutputT],
+    ) -> list[EvaluationOutput]:
+        """Evaluate goal success using assertion-based prompt."""
+        prompt = self._format_assertion_prompt(session_input, evaluation_case)
+        evaluator_agent = self._create_assertion_agent()
+        result = evaluator_agent(prompt, structured_output_model=GoalSuccessAssertionRating)
+        return self._build_assertion_output(cast(GoalSuccessAssertionRating, result.structured_output))
+
+    async def _evaluate_with_assertion_async(
+        self,
+        session_input: SessionLevelInput,
+        evaluation_case: EvaluationData[InputT, OutputT],
+    ) -> list[EvaluationOutput]:
+        """Async counterpart of _evaluate_with_assertion."""
+        prompt = self._format_assertion_prompt(session_input, evaluation_case)
+        evaluator_agent = self._create_assertion_agent()
+        result = await evaluator_agent.invoke_async(prompt, structured_output_model=GoalSuccessAssertionRating)
+        return self._build_assertion_output(cast(GoalSuccessAssertionRating, result.structured_output))
+
+    def _create_assertion_agent(self) -> Agent:
+        """Build the judge agent for assertion mode, failing loudly when no assertion prompt is configured."""
+        if self.assertion_system_prompt is None:
+            raise ValueError(
+                f"No assertion prompt template is available for version '{self.version}'; "
+                "pass assertion_system_prompt explicitly to use expected_assertion."
+            )
+        return Agent(model=self.model, system_prompt=self.assertion_system_prompt, callback_handler=None)
+
+    def _build_assertion_output(self, rating: GoalSuccessAssertionRating) -> list[EvaluationOutput]:
+        """Convert the judge's SUCCESS/FAILURE verdict into a normalized evaluation output."""
+        normalized_score = self._assertion_score_mapping[rating.verdict]
+        return [
+            EvaluationOutput(
+                score=normalized_score,
+                test_pass=normalized_score >= 1.0,
+                reason=rating.reasoning,
+                label=rating.verdict,
+            )
+        ]
+
@@ -90,3 +194,20 @@ def _format_prompt(self, session_input: SessionLevelInput) -> str:
         parts.append(f"# Conversation record\n{self._format_session_history(session_input.session_history)}")
 
         return "\n\n".join(parts)
+
+    def _format_assertion_prompt(
+        self,
+        session_input: SessionLevelInput,
+        evaluation_case: EvaluationData[InputT, OutputT],
+    ) -> str:
+        """Format evaluation prompt for assertion-based evaluation."""
+        assertions = evaluation_case.expected_assertion or ""
+
+        parts = []
+
+        if session_input.session_history:
+            parts.append(f"CONVERSATION RECORD:\n{self._format_session_history(session_input.session_history)}")
+
+        parts.append(f"SUCCESS ASSERTIONS:\n{assertions}")
+
+        return "\n\n".join(parts)
diff --git a/src/strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py b/src/strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py
index 8c8229ed..785c3985 100644
--- a/src/strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py
+++ b/src/strands_evals/evaluators/prompt_templates/goal_success_rate/__init__.py
@@ -1,11 +1,19 @@
-from . import goal_success_rate_v0
+from . import goal_success_rate_v0, goal_success_rate_with_assertions_v0
 
 VERSIONS = {
     "v0": goal_success_rate_v0,
 }
 
+ASSERTION_VERSIONS = {
+    "v0": goal_success_rate_with_assertions_v0,
+}
+
 DEFAULT_VERSION = "v0"
 
 
 def get_template(version: str = DEFAULT_VERSION):
     return VERSIONS[version]
+
+
+def get_assertion_template(version: str = DEFAULT_VERSION):
+    return ASSERTION_VERSIONS[version]
diff --git a/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_with_assertions_v0.py b/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_with_assertions_v0.py
new file mode 100644
index 00000000..babf36c6
--- /dev/null
+++ b/src/strands_evals/evaluators/prompt_templates/goal_success_rate/goal_success_rate_with_assertions_v0.py
@@ -0,0 +1,16 @@
+SYSTEM_PROMPT = """You are an evaluator for an LLM-based agent.
+
+You will be provided with:
+1. A conversation record between a user and an AI assistant.
+2. A set of success assertions that define what the agent must accomplish.
+
+TASK:
+Decide whether the agent successfully completed the task.
+
+INSTRUCTIONS:
+- Judge only based on whether the agent behavior satisfies the success assertions.
+- Evaluate assertions by their intent, not by exact text matching. Minor differences in wording, parameter ordering, or formatting should not cause a failure.
+- If an assertion describes a specific action or tool call to achieve a particular outcome, and the agent achieved the same outcome through an alternative approach clearly evidenced in the conversation, consider the assertion satisfied.
+- Do not rationalize or make assumptions beyond what the conversation shows.
+- Ignore style and verbosity.
+- Keep your reasoning concise — under 200 words."""
diff --git a/src/strands_evals/experiment.py b/src/strands_evals/experiment.py
index 8afeb37e..78a96461 100644
--- a/src/strands_evals/experiment.py
+++ b/src/strands_evals/experiment.py
@@ -188,6 +188,7 @@ async def _run_task_async(
                 name=case.name,
                 input=case.input,
                 expected_output=case.expected_output,
+                expected_assertion=case.expected_assertion,
                 expected_trajectory=case.expected_trajectory,
                 expected_interactions=case.expected_interactions,
                 expected_environment_state=case.expected_environment_state,
diff --git a/src/strands_evals/types/evaluation.py b/src/strands_evals/types/evaluation.py
index 1c4d627a..0167ff7c 100644
--- a/src/strands_evals/types/evaluation.py
+++ b/src/strands_evals/types/evaluation.py
@@ -82,6 +82,11 @@ class EvaluationData(BaseModel, Generic[InputT, OutputT]):
         input: The input to the task. eg. the query to the agent
         actual_output: The actual response given the input.
         expected_output: The expected response given the input.
+        expected_assertion: Human-authored success assertions describing expected agent actions,
+            responses, or behaviors. Used by assertion-based evaluators (e.g., GoalSuccessRateEvaluator)
+            to judge whether the agent satisfied explicit criteria rather than inferring goals
+            from the conversation. Example: 'find_user_id_by_name_zip is called with
+            {"first_name": "Yusuf", "last_name": "Rossi", "zip": "19122"}'
         actual_trajectory: The actual trajectory of a task given the input.
         expected_trajectory: The expected trajectory of a task given the input.
         name: The name of the test case. This will be used to identify the test in the summary report.
@@ -94,6 +99,7 @@ class EvaluationData(BaseModel, Generic[InputT, OutputT]):
     actual_output: OutputT | None = None
     name: str | None = None
     expected_output: OutputT | None = None
+    expected_assertion: str | None = None
     expected_trajectory: Union[list[Any], Session, None] = None
     actual_trajectory: Union[list[Any], Session, None] = None
     metadata: dict[str, Any] | None = None
diff --git a/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py b/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py
--- a/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py
+++ b/tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py
@@ -4,7 +4,12 @@
 import pytest
 
 from strands_evals.evaluators import GoalSuccessRateEvaluator
-from strands_evals.evaluators.goal_success_rate_evaluator import GoalSuccessRating, GoalSuccessScore
+from strands_evals.evaluators.goal_success_rate_evaluator import (
+    GoalSuccessAssertionRating,
+    GoalSuccessAssertionScore,
+    GoalSuccessRating,
+    GoalSuccessScore,
+)
 from strands_evals.types import EvaluationData
 from strands_evals.types.trace import (
     AgentInvocationSpan,
@@ -53,15 +58,28 @@ def test_init_with_defaults():
     assert evaluator.version == "v0"
     assert evaluator.model is None
     assert evaluator.system_prompt is not None
+    assert evaluator.assertion_system_prompt is not None
+    assert evaluator.assertion_system_prompt != evaluator.system_prompt
     assert evaluator.evaluation_level == EvaluationLevel.SESSION_LEVEL
 
 
 def test_init_with_custom_values():
-    evaluator = GoalSuccessRateEvaluator(version="v1", model="gpt-4", system_prompt="Custom")
+    evaluator = GoalSuccessRateEvaluator(
+        version="v1", model="gpt-4", system_prompt="Custom", assertion_system_prompt="Custom assertion"
+    )
 
     assert evaluator.version == "v1"
     assert evaluator.model == "gpt-4"
     assert evaluator.system_prompt == "Custom"
+    assert evaluator.assertion_system_prompt == "Custom assertion"
+
+
+def test_init_custom_system_prompt_without_assertion_template():
+    evaluator = GoalSuccessRateEvaluator(version="v1", model="gpt-4", system_prompt="Custom")
+
+    assert evaluator.version == "v1"
+    assert evaluator.system_prompt == "Custom"
+    assert evaluator.assertion_system_prompt is None
 
 
 @patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
@@ -127,3 +145,141 @@ async def test_evaluate_async(mock_agent_class, evaluation_data):
     assert result[0].test_pass is True
     assert result[0].reason == "All goals achieved"
     assert result[0].label == GoalSuccessScore.YES
+
+
+@pytest.fixture
+def evaluation_data_with_assertion():
+    now = datetime.now()
+    span_info = SpanInfo(session_id="test-session", start_time=now, end_time=now)
+
+    tool_config = ToolConfig(name="calculator", description="Evaluate mathematical expressions")
+
+    agent_span = AgentInvocationSpan(
+        span_info=span_info,
+        user_prompt="What is 2 + 2?",
+        agent_response="The answer is 4.",
+        available_tools=[tool_config],
+    )
+
+    tool_span = ToolExecutionSpan(
+        span_info=span_info,
+        tool_call=ToolCall(name="calculator", arguments={"expression": "2+2"}, tool_call_id="1"),
+        tool_result=ToolResult(content="4", tool_call_id="1"),
+    )
+
+    trace = Trace(spans=[agent_span, tool_span], trace_id="trace1", session_id="test-session")
+    session = Session(traces=[trace], session_id="test-session")
+
+    return EvaluationData(
+        input="What is 2 + 2?",
+        actual_output="The answer is 4.",
+        actual_trajectory=session,
+        name="test-criteria",
+        expected_assertion="The agent should use the calculator tool and return the correct answer of 4.",
+    )
+
+
+def test_has_assertion_true(evaluation_data_with_assertion):
+    evaluator = GoalSuccessRateEvaluator()
+    assert evaluator._has_assertion(evaluation_data_with_assertion) is True
+
+
+def test_has_assertion_false(evaluation_data):
+    evaluator = GoalSuccessRateEvaluator()
+    assert evaluator._has_assertion(evaluation_data) is False
+
+
+def test_has_assertion_none():
+    data = EvaluationData(input="test")
+    evaluator = GoalSuccessRateEvaluator()
+    assert evaluator._has_assertion(data) is False
+
+
+def test_has_assertion_empty_string():
+    data = EvaluationData(input="test", expected_assertion="")
+    evaluator = GoalSuccessRateEvaluator()
+    assert evaluator._has_assertion(data) is False
+
+
+def test_has_assertion_whitespace_only():
+    data = EvaluationData(input="test", expected_assertion="   \n")
+    evaluator = GoalSuccessRateEvaluator()
+    assert evaluator._has_assertion(data) is False
+
+
+@patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
+def test_evaluate_with_assertion(mock_agent_class, evaluation_data_with_assertion):
+    mock_agent = Mock()
+    mock_result = Mock()
+    mock_result.structured_output = GoalSuccessAssertionRating(
+        reasoning="Agent used calculator and returned 4", verdict=GoalSuccessAssertionScore.SUCCESS
+    )
+    mock_agent.return_value = mock_result
+    mock_agent_class.return_value = mock_agent
+    evaluator = GoalSuccessRateEvaluator()
+
+    result = evaluator.evaluate(evaluation_data_with_assertion)
+
+    assert len(result) == 1
+    assert result[0].score == 1.0
+    assert result[0].test_pass is True
+    assert result[0].reason == "Agent used calculator and returned 4"
+    assert result[0].label == GoalSuccessAssertionScore.SUCCESS
+
+
+@pytest.mark.parametrize(
+    "verdict,expected_value,expected_pass",
+    [
+        (GoalSuccessAssertionScore.SUCCESS, 1.0, True),
+        (GoalSuccessAssertionScore.FAILURE, 0.0, False),
+    ],
+)
+@patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
+def test_assertion_score_mapping(
+    mock_agent_class, evaluation_data_with_assertion, verdict, expected_value, expected_pass
+):
+    mock_agent = Mock()
+    mock_result = Mock()
+    mock_result.structured_output = GoalSuccessAssertionRating(reasoning="Test", verdict=verdict)
+    mock_agent.return_value = mock_result
+    mock_agent_class.return_value = mock_agent
+    evaluator = GoalSuccessRateEvaluator()
+
+    result = evaluator.evaluate(evaluation_data_with_assertion)
+
+    assert len(result) == 1
+    assert result[0].score == expected_value
+    assert result[0].test_pass == expected_pass
+    assert result[0].label == verdict
+
+
+@pytest.mark.asyncio
+@patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
+async def test_evaluate_async_with_assertion(mock_agent_class, evaluation_data_with_assertion):
+    mock_agent = Mock()
+
+    async def mock_invoke_async(*args, **kwargs):
+        mock_result = Mock()
+        mock_result.structured_output = GoalSuccessAssertionRating(
+            reasoning="Agent used calculator and returned 4", verdict=GoalSuccessAssertionScore.SUCCESS
+        )
+        return mock_result
+
+    mock_agent.invoke_async = mock_invoke_async
+    mock_agent_class.return_value = mock_agent
+    evaluator = GoalSuccessRateEvaluator()
+
+    result = await evaluator.evaluate_async(evaluation_data_with_assertion)
+
+    assert len(result) == 1
+    assert result[0].score == 1.0
+    assert result[0].test_pass is True
+    assert result[0].reason == "Agent used calculator and returned 4"
+    assert result[0].label == GoalSuccessAssertionScore.SUCCESS
+
+
+def test_evaluate_with_assertion_requires_assertion_prompt(evaluation_data_with_assertion):
+    evaluator = GoalSuccessRateEvaluator(version="v1", system_prompt="Custom")
+
+    with pytest.raises(ValueError, match="assertion_system_prompt"):
+        evaluator.evaluate(evaluation_data_with_assertion)
diff --git a/tests/strands_evals/test_experiment.py b/tests/strands_evals/test_experiment.py
index a806b5c9..9285a326 100644
--- a/tests/strands_evals/test_experiment.py
+++ b/tests/strands_evals/test_experiment.py
@@ -279,6 +279,7 @@ def test_experiment_to_dict_non_empty(mock_evaluator):
                 "session_id": session_id,
                 "input": "hello",
                 "expected_output": "world",
+                "expected_assertion": None,
                 "expected_trajectory": None,
                 "expected_interactions": None,
                 "expected_environment_state": None,
@@ -310,6 +311,7 @@ def test_experiment_to_dict_OutputEvaluator_full():
                 "session_id": session_id,
                 "input": "hello",
                 "expected_output": "world",
+                "expected_assertion": None,
                 "expected_trajectory": None,
                 "expected_interactions": None,
                 "expected_environment_state": None,
@@ -343,6 +345,7 @@ def test_experiment_to_dict_OutputEvaluator_default():
                 "session_id": session_id,
                 "input": "hello",
                 "expected_output": "world",
+                "expected_assertion": None,
                 "expected_trajectory": None,
                 "expected_interactions": None,
                 "expected_environment_state": None,
@@ -367,6 +370,7 @@ def test_experiment_to_dict_TrajectoryEvaluator_default():
                 "session_id": session_id,
                 "input": "hello",
                 "expected_output": "world",
+                "expected_assertion": None,
                 "expected_trajectory": ["step1", "step2"],
                 "expected_interactions": None,
                 "expected_environment_state": None,
@@ -396,6 +400,7 @@ def test_experiment_to_dict_TrajectoryEvaluator_full():
                 "session_id": session_id,
                 "input": "hello",
                 "expected_output": "world",
+                "expected_assertion": None,
                 "expected_trajectory": ["step1", "step2"],
                 "expected_interactions": None,
                 "expected_environment_state": None,
@@ -428,6 +433,7 @@ def test_experiment_to_dict_InteractionsEvaluator_default():
                 "session_id": session_id,
                 "input": "hello",
                 "expected_output": "world",
+                "expected_assertion": None,
                 "expected_trajectory": None,
                 "expected_interactions": interactions,
                 "expected_environment_state": None,
@@ -460,6 +466,7 @@ def test_experiment_to_dict_InteractionsEvaluator_full():
                 "session_id": session_id,
                 "input": "hello",
                 "expected_output": "world",
+                "expected_assertion": None,
                 "expected_trajectory": None,
                 "expected_interactions": interactions,
                 "expected_environment_state": None,
@@ -491,6 +498,7 @@ def test_experiment_to_dict_case_dict():
                 "session_id": session_id,
                 "input": {"field1": "hello"},
                 "expected_output": {"field2": "world"},
+                "expected_assertion": None,
                 "expected_trajectory": None,
                 "expected_interactions": None,
                 "expected_environment_state": None,
@@ -518,6 +526,7 @@ def simple_echo(query):
                 "session_id": session_id,
                 "input": simple_echo,
                 "expected_output": None,
+                "expected_assertion": None,
                 "expected_trajectory": None,
                 "expected_interactions": None,
                 "expected_environment_state": None,