124 changes: 120 additions & 4 deletions src/strands_evals/evaluators/goal_success_rate_evaluator.py
@@ -9,7 +9,7 @@
from ..types.evaluation import EvaluationData, EvaluationOutput, InputT, OutputT
from ..types.trace import EvaluationLevel, SessionLevelInput
from .evaluator import Evaluator
from .prompt_templates.goal_success_rate import get_template
from .prompt_templates.goal_success_rate import get_assertion_template, get_template


class GoalSuccessScore(str, Enum):
@@ -26,8 +26,31 @@ class GoalSuccessRating(BaseModel):
score: GoalSuccessScore = Field(description="Score should be one of 'Yes' or 'No'")


class GoalSuccessAssertionScore(str, Enum):
"""Binary assertion-based goal success ratings."""

SUCCESS = "SUCCESS"
FAILURE = "FAILURE"


class GoalSuccessAssertionRating(BaseModel):
"""Structured output for assertion-based goal success evaluation."""

reasoning: str = Field(description="Brief explanation of the evaluation")
verdict: GoalSuccessAssertionScore = Field(description="Verdict should be one of 'SUCCESS' or 'FAILURE'")


class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):
"""Evaluates whether all user goals were successfully achieved in a conversation."""
"""Evaluates whether all user goals were successfully achieved in a conversation.

Supports two modes:
- **Basic mode**: Evaluates goal success based on conversation analysis alone.
Uses a Yes/No scoring rubric (Yes=1.0, No=0.0).
- **Assertion mode**: When assertions are provided via ``metadata["assertions"]``,
evaluates whether the agent's behavior satisfies the specified success assertions.
Uses a SUCCESS/FAILURE scoring rubric (SUCCESS=1.0, FAILURE=0.0).
Optionally accepts ``metadata["additional_context"]`` for extra evaluation context.
"""

evaluation_level = EvaluationLevel.SESSION_LEVEL

@@ -36,19 +59,52 @@ class GoalSuccessRateEvaluator(Evaluator[InputT, OutputT]):
GoalSuccessScore.NO: 0.0,
}

_assertion_score_mapping = {
GoalSuccessAssertionScore.SUCCESS: 1.0,
GoalSuccessAssertionScore.FAILURE: 0.0,
}

def __init__(
self,
version: str = "v0",
model: Union[Model, str, None] = None,
system_prompt: str | None = None,
assertion_system_prompt: str | None = None,
):
super().__init__()
self.system_prompt = system_prompt if system_prompt is not None else get_template(version).SYSTEM_PROMPT
self.assertion_system_prompt = (
assertion_system_prompt
if assertion_system_prompt is not None
else get_assertion_template(version).SYSTEM_PROMPT
)
self.version = version
self.model = model

def _has_assertions(self, evaluation_case: EvaluationData[InputT, OutputT]) -> bool:
"""Check if the evaluation case contains assertions in metadata."""
if evaluation_case.metadata and evaluation_case.metadata.get("assertions"):
return True
return False

def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
session_input = self._parse_trajectory(evaluation_case)

if self._has_assertions(evaluation_case):
return self._evaluate_with_assertions(session_input, evaluation_case)

return self._evaluate_basic(session_input)

async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
session_input = self._parse_trajectory(evaluation_case)

if self._has_assertions(evaluation_case):
return await self._evaluate_with_assertions_async(session_input, evaluation_case)

return await self._evaluate_basic_async(session_input)

def _evaluate_basic(self, session_input: SessionLevelInput) -> list[EvaluationOutput]:
"""Evaluate goal success using the basic prompt (no assertions)."""
prompt = self._format_prompt(session_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = evaluator_agent(prompt, structured_output_model=GoalSuccessRating)
@@ -63,8 +119,8 @@ def evaluate(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[Eva
)
]

async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT]) -> list[EvaluationOutput]:
session_input = self._parse_trajectory(evaluation_case)
async def _evaluate_basic_async(self, session_input: SessionLevelInput) -> list[EvaluationOutput]:
"""Evaluate goal success using the basic prompt asynchronously."""
prompt = self._format_prompt(session_input)
evaluator_agent = Agent(model=self.model, system_prompt=self.system_prompt, callback_handler=None)
result = await evaluator_agent.invoke_async(prompt, structured_output_model=GoalSuccessRating)
@@ -79,6 +135,46 @@ async def evaluate_async(self, evaluation_case: EvaluationData[InputT, OutputT])
)
]

def _evaluate_with_assertions(
self,
session_input: SessionLevelInput,
evaluation_case: EvaluationData[InputT, OutputT],
) -> list[EvaluationOutput]:
"""Evaluate goal success using assertion-based prompt."""
prompt = self._format_assertion_prompt(session_input, evaluation_case)
evaluator_agent = Agent(model=self.model, system_prompt=self.assertion_system_prompt, callback_handler=None)
result = evaluator_agent(prompt, structured_output_model=GoalSuccessAssertionRating)
rating = cast(GoalSuccessAssertionRating, result.structured_output)
normalized_score = self._assertion_score_mapping[rating.verdict]
return [
EvaluationOutput(
score=normalized_score,
test_pass=normalized_score >= 1.0,
reason=rating.reasoning,
label=rating.verdict,
)
]

async def _evaluate_with_assertions_async(
self,
session_input: SessionLevelInput,
evaluation_case: EvaluationData[InputT, OutputT],
) -> list[EvaluationOutput]:
"""Evaluate goal success using assertion-based prompt asynchronously."""
prompt = self._format_assertion_prompt(session_input, evaluation_case)
evaluator_agent = Agent(model=self.model, system_prompt=self.assertion_system_prompt, callback_handler=None)
result = await evaluator_agent.invoke_async(prompt, structured_output_model=GoalSuccessAssertionRating)
rating = cast(GoalSuccessAssertionRating, result.structured_output)
normalized_score = self._assertion_score_mapping[rating.verdict]
return [
EvaluationOutput(
score=normalized_score,
test_pass=normalized_score >= 1.0,
reason=rating.reasoning,
label=rating.verdict,
)
]

def _format_prompt(self, session_input: SessionLevelInput) -> str:
"""Format evaluation prompt from session-level input."""
parts = []
@@ -90,3 +186,23 @@ def _format_prompt(self, session_input: SessionLevelInput) -> str:
parts.append(f"# Conversation record\n{self._format_session_history(session_input.session_history)}")

return "\n\n".join(parts)

def _format_assertion_prompt(
self,
session_input: SessionLevelInput,
evaluation_case: EvaluationData[InputT, OutputT],
) -> str:
"""Format evaluation prompt for assertion-based evaluation."""
metadata = evaluation_case.metadata or {}
assertions = metadata.get("assertions", "")
additional_context = metadata.get("additional_context", "N/A")

parts = []

if session_input.session_history:
parts.append(f"CONVERSATION RECORD:\n{self._format_session_history(session_input.session_history)}")

parts.append(f"SUCCESS ASSERTIONS:\n{assertions}")
parts.append(f"ADDITIONAL CONTEXT:\n{additional_context}")

return "\n\n".join(parts)
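For reviewers, here is a minimal usage sketch of the new assertion mode. It mirrors the test fixture further down in this PR; the session/trace construction, the assertion text, and leaving `model=None` are illustrative assumptions, and in practice a real model has to back the judge `Agent` for the call to return anything useful.

```python
from datetime import datetime

from strands_evals.evaluators import GoalSuccessRateEvaluator
from strands_evals.types import EvaluationData
from strands_evals.types.trace import AgentInvocationSpan, Session, SpanInfo, ToolConfig, Trace

# Build a tiny one-span session, as the test fixture does (illustrative values).
now = datetime.now()
span_info = SpanInfo(session_id="demo-session", start_time=now, end_time=now)
tool_config = ToolConfig(name="calculator", description="Evaluate mathematical expressions")
agent_span = AgentInvocationSpan(
    span_info=span_info,
    user_prompt="What is 2 + 2?",
    agent_response="The answer is 4.",
    available_tools=[tool_config],
)
session = Session(
    traces=[Trace(spans=[agent_span], trace_id="trace-1", session_id="demo-session")],
    session_id="demo-session",
)

# Presence of metadata["assertions"] switches the evaluator to the
# SUCCESS/FAILURE rubric and the assertion system prompt.
case = EvaluationData(
    input="What is 2 + 2?",
    actual_output="The answer is 4.",
    actual_trajectory=session,
    name="assertion-demo",
    metadata={
        "assertions": "The agent should return the correct answer of 4.",
        "additional_context": "Simple arithmetic check.",  # optional
    },
)

evaluator = GoalSuccessRateEvaluator(model=None)  # pass a Model or model id string in practice
results = evaluator.evaluate(case)
print(results[0].score, results[0].test_pass, results[0].label, results[0].reason)
```

Omitting `metadata["assertions"]` (or leaving metadata unset) keeps the existing basic Yes/No behavior, so current callers are unaffected.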
@@ -1,11 +1,19 @@
from . import goal_success_rate_v0
from . import goal_success_rate_v0, goal_success_rate_with_assertions_v0

VERSIONS = {
"v0": goal_success_rate_v0,
}

ASSERTION_VERSIONS = {
"v0": goal_success_rate_with_assertions_v0,
}

DEFAULT_VERSION = "v0"


def get_template(version: str = DEFAULT_VERSION):
return VERSIONS[version]


def get_assertion_template(version: str = DEFAULT_VERSION):
return ASSERTION_VERSIONS[version]
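A quick sanity sketch of how versions resolve to templates, assuming this module lives at the package path implied by the evaluator's relative import (`strands_evals.evaluators.prompt_templates.goal_success_rate`):

```python
from strands_evals.evaluators.prompt_templates.goal_success_rate import (
    get_assertion_template,
    get_template,
)

# Both helpers key into separate version registries; "v0" is the shared default.
basic_prompt = get_template().SYSTEM_PROMPT
assertion_prompt = get_assertion_template("v0").SYSTEM_PROMPT
assert basic_prompt != assertion_prompt  # distinct template modules
```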
@@ -0,0 +1,17 @@
SYSTEM_PROMPT = """You are an evaluator for an LLM-based agent.

You will be provided with:
1. A conversation record between a user and an AI assistant.
2. A set of success assertions that define what the agent must accomplish.
3. Optional additional context for evaluation.

TASK:
Decide whether the agent successfully completed the task.

INSTRUCTIONS:
- Judge only on whether the agent's behavior satisfies the success assertions.
- Evaluate assertions by their intent, not by exact text matching. Minor differences in wording, parameter ordering, or formatting should not cause a failure.
- If an assertion describes a specific action or tool call to achieve a particular outcome, and the agent achieved the same outcome through an alternative approach clearly evidenced in the conversation, consider the assertion satisfied.
- Do not rationalize or make assumptions beyond what the conversation shows.
- Ignore style and verbosity.
- Keep your reasoning concise — under 200 words."""
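For context, the user message paired with this system prompt is assembled by `_format_assertion_prompt` in the evaluator above; roughly (the section headers come from that code, the content here is made up):

```python
# Shape of the assembled user prompt (illustrative content only).
example_prompt = (
    "CONVERSATION RECORD:\n<formatted session history>\n\n"
    "SUCCESS ASSERTIONS:\nThe agent should use the calculator tool and return 4.\n\n"
    "ADDITIONAL CONTEXT:\nThis is a simple math test."
)
```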
142 changes: 140 additions & 2 deletions tests/strands_evals/evaluators/test_goal_success_rate_evaluator.py
@@ -4,7 +4,12 @@
import pytest

from strands_evals.evaluators import GoalSuccessRateEvaluator
from strands_evals.evaluators.goal_success_rate_evaluator import GoalSuccessRating, GoalSuccessScore
from strands_evals.evaluators.goal_success_rate_evaluator import (
GoalSuccessAssertionRating,
GoalSuccessAssertionScore,
GoalSuccessRating,
GoalSuccessScore,
)
from strands_evals.types import EvaluationData
from strands_evals.types.trace import (
AgentInvocationSpan,
@@ -53,15 +58,20 @@ def test_init_with_defaults():
assert evaluator.version == "v0"
assert evaluator.model is None
assert evaluator.system_prompt is not None
assert evaluator.assertion_system_prompt is not None
assert evaluator.assertion_system_prompt != evaluator.system_prompt
assert evaluator.evaluation_level == EvaluationLevel.SESSION_LEVEL


def test_init_with_custom_values():
evaluator = GoalSuccessRateEvaluator(version="v1", model="gpt-4", system_prompt="Custom")
evaluator = GoalSuccessRateEvaluator(
version="v1", model="gpt-4", system_prompt="Custom", assertion_system_prompt="Custom assertion"
)

assert evaluator.version == "v1"
assert evaluator.model == "gpt-4"
assert evaluator.system_prompt == "Custom"
assert evaluator.assertion_system_prompt == "Custom assertion"


@patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
@@ -127,3 +137,131 @@ async def mock_invoke_async(*args, **kwargs):
assert result[0].test_pass is True
assert result[0].reason == "All goals achieved"
assert result[0].label == GoalSuccessScore.YES


@pytest.fixture
def evaluation_data_with_assertions():
now = datetime.now()
span_info = SpanInfo(session_id="test-session", start_time=now, end_time=now)

tool_config = ToolConfig(name="calculator", description="Evaluate mathematical expressions")

agent_span = AgentInvocationSpan(
span_info=span_info,
user_prompt="What is 2 + 2?",
agent_response="The answer is 4.",
available_tools=[tool_config],
)

tool_span = ToolExecutionSpan(
span_info=span_info,
tool_call=ToolCall(name="calculator", arguments={"expression": "2+2"}, tool_call_id="1"),
tool_result=ToolResult(content="4", tool_call_id="1"),
)

trace = Trace(spans=[agent_span, tool_span], trace_id="trace1", session_id="test-session")
session = Session(traces=[trace], session_id="test-session")

return EvaluationData(
input="What is 2 + 2?",
actual_output="The answer is 4.",
actual_trajectory=session,
name="test-assertions",
metadata={
"assertions": "The agent should use the calculator tool and return the correct answer of 4.",
"additional_context": "This is a simple math test.",
},
)


def test_has_assertions_true(evaluation_data_with_assertions):
evaluator = GoalSuccessRateEvaluator()
assert evaluator._has_assertions(evaluation_data_with_assertions) is True


def test_has_assertions_false(evaluation_data):
evaluator = GoalSuccessRateEvaluator()
assert evaluator._has_assertions(evaluation_data) is False


def test_has_assertions_empty_metadata():
data = EvaluationData(input="test", metadata={})
evaluator = GoalSuccessRateEvaluator()
assert evaluator._has_assertions(data) is False


def test_has_assertions_no_metadata():
data = EvaluationData(input="test")
evaluator = GoalSuccessRateEvaluator()
assert evaluator._has_assertions(data) is False


@patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
def test_evaluate_with_assertions(mock_agent_class, evaluation_data_with_assertions):
mock_agent = Mock()
mock_result = Mock()
mock_result.structured_output = GoalSuccessAssertionRating(
reasoning="Agent used calculator and returned 4", verdict=GoalSuccessAssertionScore.SUCCESS
)
mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = GoalSuccessRateEvaluator()

result = evaluator.evaluate(evaluation_data_with_assertions)

assert len(result) == 1
assert result[0].score == 1.0
assert result[0].test_pass is True
assert result[0].reason == "Agent used calculator and returned 4"
assert result[0].label == GoalSuccessAssertionScore.SUCCESS


@pytest.mark.parametrize(
"verdict,expected_value,expected_pass",
[
(GoalSuccessAssertionScore.SUCCESS, 1.0, True),
(GoalSuccessAssertionScore.FAILURE, 0.0, False),
],
)
@patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
def test_assertion_score_mapping(
mock_agent_class, evaluation_data_with_assertions, verdict, expected_value, expected_pass
):
mock_agent = Mock()
mock_result = Mock()
mock_result.structured_output = GoalSuccessAssertionRating(reasoning="Test", verdict=verdict)
mock_agent.return_value = mock_result
mock_agent_class.return_value = mock_agent
evaluator = GoalSuccessRateEvaluator()

result = evaluator.evaluate(evaluation_data_with_assertions)

assert len(result) == 1
assert result[0].score == expected_value
assert result[0].test_pass == expected_pass
assert result[0].label == verdict


@pytest.mark.asyncio
@patch("strands_evals.evaluators.goal_success_rate_evaluator.Agent")
async def test_evaluate_async_with_assertions(mock_agent_class, evaluation_data_with_assertions):
mock_agent = Mock()

async def mock_invoke_async(*args, **kwargs):
mock_result = Mock()
mock_result.structured_output = GoalSuccessAssertionRating(
reasoning="Agent satisfied all assertions", verdict=GoalSuccessAssertionScore.SUCCESS
)
return mock_result

mock_agent.invoke_async = mock_invoke_async
mock_agent_class.return_value = mock_agent
evaluator = GoalSuccessRateEvaluator()

result = await evaluator.evaluate_async(evaluation_data_with_assertions)

assert len(result) == 1
assert result[0].score == 1.0
assert result[0].test_pass is True
assert result[0].reason == "Agent satisfied all assertions"
assert result[0].label == GoalSuccessAssertionScore.SUCCESS