Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/strands_evals/case.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ class Case(BaseModel, Generic[InputT, OutputT]):
name: The name of the test case. This will be used to identify the test in the summary report.
session_id: The session ID for the test case. Automatically generates a UUID4 if not provided.
expected_output: The expected response given the input. eg. the agent's response
expected_assertion: Human-authored success assertions describing expected agent actions,
responses, or behaviors. Used by assertion-based evaluators (e.g., GoalSuccessRateEvaluator)
to judge whether the agent satisfied explicit criteria rather than inferring goals
from the conversation.
expected_trajectory: The expected trajectory of a task given the input. eg. sequence of tools
expected_interactions: The expected interaction sequence given the input (ideal for multi-agent systems).
metadata: Additional information about the test case.
Expand All @@ -45,6 +49,7 @@ class Case(BaseModel, Generic[InputT, OutputT]):
session_id: str = Field(default_factory=lambda: str(uuid.uuid4()))
input: InputT
expected_output: OutputT | None = None
expected_assertion: str | None = None
expected_trajectory: list[Any] | None = None
expected_interactions: list[Interaction] | None = None
expected_environment_state: list[EnvironmentState] | None = None
Expand Down
90 changes: 80 additions & 10 deletions src/strands_evals/evaluators/goal_success_rate_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
from ..types.trace import EvaluationLevel, SessionLevelInput
from .evaluator import Evaluator
from .prompt_templates.goal_success_rate import get_template
from .prompt_templates.goal_success_rate import get_assertion_template, get_template


class GoalSuccessScore(str, Enum):
Expand All @@ -26,8 +26,34 @@ class GoalSuccessRating(BaseModel):
score: GoalSuccessScore = Field(description="Score should be one of 'Yes' or 'No'")


class GoalSuccessAssertionScore(str, Enum):
    """Binary assertion-based goal success ratings.

    Verdict labels produced by the judge LLM in assertion mode; the evaluator
    maps them to numeric scores (SUCCESS=1.0, FAILURE=0.0).
    """

    SUCCESS = "SUCCESS"  # the agent's behavior satisfies the success assertions
    FAILURE = "FAILURE"  # the assertions were not satisfied


class GoalSuccessAssertionRating(BaseModel):
    """Structured output for assertion-based goal success evaluation."""

    # Free-text justification from the judge; surfaced as EvaluationOutput.reason.
    reasoning: str = Field(description="Brief explanation of the evaluation")
    # Binary verdict; surfaced as EvaluationOutput.label and mapped to the numeric score.
    verdict: GoalSuccessAssertionScore = Field(description="Verdict should be one of 'SUCCESS' or 'FAILURE'")


class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):
"""Evaluates whether all user goals were successfully achieved in a conversation."""
"""Evaluates whether all user goals were successfully achieved in a conversation.

Supports two modes:
- **Basic mode**: Evaluates goal success based on conversation analysis alone.
The judge LLM infers user goals from the conversation and checks whether they were met.
Uses a Yes/No scoring rubric (Yes=1.0, No=0.0).
- **Assertion mode**: When ``expected_assertion`` is provided on the evaluation case,
evaluates whether the agent's behavior satisfies the specified success assertions —
human-authored statements describing expected agent actions, responses, or behaviors.
Unlike basic mode, assertion mode judges against explicit criteria defined upfront
rather than inferring goals from the conversation.
Uses a SUCCESS/FAILURE scoring rubric (SUCCESS=1.0, FAILURE=0.0).
"""

evaluation_level = EvaluationLevel.SESSION_LEVEL

Expand All @@ -36,19 +62,42 @@ class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):
GoalSuccessScore.NO: 0.0,
}

_assertion_score_mapping = {
GoalSuccessAssertionScore.SUCCESS: 1.0,
GoalSuccessAssertionScore.FAILURE: 0.0,
}

def __init__(
    self,
    version: str = "v0",
    model: Union[Model, str, None] = None,
    system_prompt: str | None = None,
    assertion_system_prompt: str | None = None,
):
    """Configure the evaluator.

    Args:
        version: Prompt-template version tag, used when a prompt is not overridden.
        model: Judge model (model instance, model id string, or None for the default).
        system_prompt: Optional override for the basic-mode system prompt.
        assertion_system_prompt: Optional override for the assertion-mode system prompt.
    """
    super().__init__()
    # Fall back to the versioned templates only for prompts the caller did not supply.
    if system_prompt is None:
        system_prompt = get_template(version).SYSTEM_PROMPT
    if assertion_system_prompt is None:
        assertion_system_prompt = get_assertion_template(version).SYSTEM_PROMPT
    self.system_prompt = system_prompt
    self.assertion_system_prompt = assertion_system_prompt
    self.version = version
    self.model = model

def _has_assertion(self, evaluation_case: EvaluationData[InputT, OutputT]) -> bool:
"""Check if the evaluation case contains expected_assertion for assertion mode."""
return bool(evaluation_case.expected_assertion)

def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
    """Score the case, dispatching to assertion mode when the case defines assertions."""
    session_input = self._parse_trajectory(evaluation_case)

    # Basic mode unless the case supplies explicit success assertions.
    if not self._has_assertion(evaluation_case):
        return self._evaluate_basic(session_input)
    return self._evaluate_with_assertion(session_input, evaluation_case)

def _evaluate_basic(self, session_input: SessionLevelInput) -> list[EvaluationOutput]:
"""Evaluate goal success using the basic prompt (no criteria)."""
prompt = self._format_prompt(session_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = evaluator_agent(prompt, structured_output_model=GoalSuccessRating)
Expand All @@ -63,19 +112,23 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
)
]

async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
session_input = self._parse_trajectory(evaluation_case)
prompt = self._format_prompt(session_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = await evaluator_agent.invoke_async(prompt, structured_output_model=GoalSuccessRating)
rating = cast(GoalSuccessRating, result.structured_output)
normalized_score = self._score_mapping[rating.score]
def _evaluate_with_assertion(
    self,
    session_input: SessionLevelInput,
    evaluation_case: EvaluationData[InputT, OutputT],
) -> list[EvaluationOutput]:
    """Judge the session against the case's explicit success assertions.

    Uses the assertion-mode system prompt and a SUCCESS/FAILURE rubric
    (SUCCESS=1.0, FAILURE=0.0).
    """
    judge = Agent(
        model=self.model,
        system_prompt=self.assertion_system_prompt,
        callback_handler=None,
    )
    response = judge(
        self._format_assertion_prompt(session_input, evaluation_case),
        structured_output_model=GoalSuccessAssertionRating,
    )
    rating = cast(GoalSuccessAssertionRating, response.structured_output)
    score = self._assertion_score_mapping[rating.verdict]
    output = EvaluationOutput(
        score=score,
        test_pass=score >= 1.0,  # pass only on a SUCCESS verdict
        reason=rating.reasoning,
        label=rating.verdict,
    )
    return [output]

Expand All @@ -90,3 +143,20 @@ def _format_prompt(self, session_input: SessionLevelInput) -> str:
parts.append(f"# Conversation record\n{self._format_session_history(session_input.session_history)}")

return "\n\n".join(parts)

def _format_assertion_prompt(
    self,
    session_input: SessionLevelInput,
    evaluation_case: EvaluationData[InputT, OutputT],
) -> str:
    """Build the judge prompt: the conversation record (if any) plus the assertions."""
    sections: list[str] = []

    history = session_input.session_history
    if history:
        sections.append(f"CONVERSATION RECORD:\n{self._format_session_history(history)}")

    # expected_assertion is non-empty whenever this mode is selected; `or ""`
    # keeps the prompt well-formed if called directly with a bare case.
    sections.append(f"SUCCESS ASSERTIONS:\n{evaluation_case.expected_assertion or ''}")

    return "\n\n".join(sections)
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
from . import goal_success_rate_v0
from . import goal_success_rate_v0, goal_success_rate_with_assertions_v0

# Registry of basic (goal-inference) prompt template modules, keyed by version tag.
VERSIONS = {
    "v0": goal_success_rate_v0,
}

# Registry of assertion-based prompt template modules, keyed by version tag.
ASSERTION_VERSIONS = {
    "v0": goal_success_rate_with_assertions_v0,
}

# Version served when callers do not request one explicitly.
DEFAULT_VERSION = "v0"


def get_template(version: str = DEFAULT_VERSION):
    """Return the basic goal-success prompt template module for *version*.

    Raises KeyError if the version is not registered in VERSIONS.
    """
    template = VERSIONS[version]
    return template


def get_assertion_template(version: str = DEFAULT_VERSION):
    """Return the assertion-based prompt template module for *version*.

    Raises KeyError if the version is not registered in ASSERTION_VERSIONS.
    """
    template = ASSERTION_VERSIONS[version]
    return template
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# System prompt for the assertion-based GoalSuccessRateEvaluator judge (v0).
# The judge receives a conversation record plus human-authored success
# assertions and must return a SUCCESS/FAILURE verdict with brief reasoning.
# NOTE: this string is sent to the judge LLM verbatim — edits change behavior.
SYSTEM_PROMPT = """You are an evaluator for an LLM-based agent.

You will be provided with:
1. A conversation record between a user and an AI assistant.
2. A set of success assertions that define what the agent must accomplish.

TASK:
Decide whether the agent successfully completed the task.

INSTRUCTIONS:
- Judge only based on whether the agent behavior satisfies the success assertions.
- Evaluate assertions by their intent, not by exact text matching. Minor differences in wording, parameter ordering, or formatting should not cause a failure.
- If an assertion describes a specific action or tool call to achieve a particular outcome, and the agent achieved the same outcome through an alternative approach clearly evidenced in the conversation, consider the assertion satisfied.
- Do not rationalize or make assumptions beyond what the conversation shows.
- Ignore style and verbosity.
- Keep your reasoning concise — under 200 words."""
1 change: 1 addition & 0 deletions src/strands_evals/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,7 @@ async def _run_task_async(
name=case.name,
input=case.input,
expected_output=case.expected_output,
expected_assertion=case.expected_assertion,
expected_trajectory=case.expected_trajectory,
expected_interactions=case.expected_interactions,
expected_environment_state=case.expected_environment_state,
Expand Down
6 changes: 6 additions & 0 deletions src/strands_evals/types/evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@ class EvaluationData(BaseModel, Generic[InputT, OutputT]):
input: The input to the task. eg. the query to the agent
actual_output: The actual response given the input.
expected_output: The expected response given the input.
expected_assertion: Human-authored success assertions describing expected agent actions,
responses, or behaviors. Used by assertion-based evaluators (e.g., GoalSuccessRateEvaluator)
to judge whether the agent satisfied explicit criteria rather than inferring goals
from the conversation. Example: 'find_user_id_by_name_zip is called with
{"first_name": "Yusuf", "last_name": "Rossi", "zip": "19122"}'
actual_trajectory: The actual trajectory of a task given the input.
expected_trajectory: The expected trajectory of a task given the input.
name: The name of the test case. This will be used to identify the test in the summary report.
Expand All @@ -94,6 +99,7 @@ class EvaluationData(BaseModel, Generic[InputT, OutputT]):
actual_output: OutputT | None = None
name: str | None = None
expected_output: OutputT | None = None
expected_assertion: str | None = None
expected_trajectory: Union[list[Any], Session, None] = None
actual_trajectory: Union[list[Any], Session, None] = None
metadata: dict[str, Any] | None = None
Expand Down
115 changes: 101 additions & 14 deletions tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@
import pytest

from strands_evals.evaluators import GoalSuccessRateEvaluator
from strands_evals.evaluators.goal_success_rate_evaluator import GoalSuccessRating, GoalSuccessScore
from strands_evals.evaluators.goal_success_rate_evaluator import (
GoalSuccessAssertionRating,
GoalSuccessAssertionScore,
GoalSuccessRating,
GoalSuccessScore,
)
from strands_evals.types import EvaluationData
from strands_evals.types.trace import (
AgentInvocationSpan,
Expand Down Expand Up @@ -53,15 +58,20 @@ def test_init_with_defaults():
assert evaluator.version == "v0"
assert evaluator.model is None
assert evaluator.system_prompt is not None
assert evaluator.assertion_system_prompt is not None
assert evaluator.assertion_system_prompt != evaluator.system_prompt
assert evaluator.evaluation_level == EvaluationLevel.SESSION_LEVEL


def test_init_with_custom_values():
    """Constructor overrides (version, model, both system prompts) are stored as-is."""
    # Note: this span as rendered interleaved the pre-change call with its
    # replacement; only the new-side (assertion-aware) call is kept.
    evaluator = GoalSuccessRateEvaluator(
        version="v1", model="gpt-4", system_prompt="Custom", assertion_system_prompt="Custom assertion"
    )

    assert evaluator.version == "v1"
    assert evaluator.model == "gpt-4"
    assert evaluator.system_prompt == "Custom"
    assert evaluator.assertion_system_prompt == "Custom assertion"


@patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
Expand Down Expand Up @@ -106,24 +116,101 @@ def test_score_mapping(mock_agent_class, evaluation_data, score, expected_value,
assert result[0].label == score


@pytest.mark.asyncio
@patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
async def test_evaluate_async(mock_agent_class, evaluation_data):
mock_agent = Mock()
@pytest.fixture
def evaluation_data_with_assertion():
    """EvaluationData with a session trajectory and an ``expected_assertion`` set.

    Cases carrying an assertion route GoalSuccessRateEvaluator into assertion
    mode; the trajectory contains one agent invocation and one calculator
    tool call so the judge prompt has a conversation record to format.
    """
    now = datetime.now()
    # Single shared span window; timing values are irrelevant to these tests.
    span_info = SpanInfo(session_id="test-session", start_time=now, end_time=now)

    tool_config = ToolConfig(name="calculator", description="Evaluate mathematical expressions")

    agent_span = AgentInvocationSpan(
        span_info=span_info,
        user_prompt="What is 2 + 2?",
        agent_response="The answer is 4.",
        available_tools=[tool_config],
    )

    tool_span = ToolExecutionSpan(
        span_info=span_info,
        tool_call=ToolCall(name="calculator", arguments={"expression": "2+2"}, tool_call_id="1"),
        tool_result=ToolResult(content="4", tool_call_id="1"),
    )

    trace = Trace(spans=[agent_span, tool_span], trace_id="trace1", session_id="test-session")
    session = Session(traces=[trace], session_id="test-session")

    return EvaluationData(
        input="What is 2 + 2?",
        actual_output="The answer is 4.",
        actual_trajectory=session,
        name="test-criteria",
        expected_assertion="The agent should use the calculator tool and return the correct answer of 4.",
    )


def test_has_assertion_true(evaluation_data_with_assertion):
    # A populated expected_assertion selects assertion mode.
    assert GoalSuccessRateEvaluator()._has_assertion(evaluation_data_with_assertion) is True


def test_has_assertion_false(evaluation_data):
    # The plain fixture has no expected_assertion, so basic mode applies.
    assert GoalSuccessRateEvaluator()._has_assertion(evaluation_data) is False


def test_has_assertion_none():
    # Default (None) expected_assertion must not trigger assertion mode.
    case = EvaluationData(input="test")
    assert GoalSuccessRateEvaluator()._has_assertion(case) is False


def test_has_assertion_empty_string():
    # An empty string counts as "no assertion provided".
    case = EvaluationData(input="test", expected_assertion="")
    assert GoalSuccessRateEvaluator()._has_assertion(case) is False

async def mock_invoke_async(*args, **kwargs):
mock_result = Mock()
mock_result.structured_output = GoalSuccessRating(reasoning="All goals achieved", score=GoalSuccessScore.YES)
return mock_result

mock_agent.invoke_async = mock_invoke_async
@patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
def test_evaluate_with_assertion(mock_agent_class, evaluation_data_with_assertion):
    """A SUCCESS verdict in assertion mode maps to score 1.0 and test_pass=True.

    Note: this span as rendered interleaved lines of the deleted async test
    (an ``await`` inside a sync def and stale asserts); only the new-side
    synchronous assertion-mode test is kept.
    """
    mock_agent = Mock()
    mock_result = Mock()
    mock_result.structured_output = GoalSuccessAssertionRating(
        reasoning="Agent used calculator and returned 4", verdict=GoalSuccessAssertionScore.SUCCESS
    )
    mock_agent.return_value = mock_result
    mock_agent_class.return_value = mock_agent
    evaluator = GoalSuccessRateEvaluator()

    result = evaluator.evaluate(evaluation_data_with_assertion)

    assert len(result) == 1
    assert result[0].score == 1.0
    assert result[0].test_pass is True
    assert result[0].reason == "Agent used calculator and returned 4"
    assert result[0].label == GoalSuccessAssertionScore.SUCCESS


@pytest.mark.parametrize(
    "verdict,expected_value,expected_pass",
    [
        (GoalSuccessAssertionScore.SUCCESS, 1.0, True),
        (GoalSuccessAssertionScore.FAILURE, 0.0, False),
    ],
)
@patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
def test_assertion_score_mapping(
    mock_agent_class, evaluation_data_with_assertion, verdict, expected_value, expected_pass
):
    """Each assertion verdict maps to its numeric score, pass flag, and label."""
    mock_agent = Mock()
    mock_result = Mock()
    mock_result.structured_output = GoalSuccessAssertionRating(reasoning="Test", verdict=verdict)
    mock_agent.return_value = mock_result
    mock_agent_class.return_value = mock_agent
    evaluator = GoalSuccessRateEvaluator()

    result = evaluator.evaluate(evaluation_data_with_assertion)

    assert len(result) == 1
    assert result[0].score == expected_value
    assert result[0].test_pass == expected_pass
    assert result[0].label == verdict
Loading