Skip to content

Commit 255492d

Browse files
committed
refactor(multimodal): unify structured response parsing and error handling
- Add parse_structured_chat_response utility for streaming/non-streaming responses - Return GraderError instead of score=0 on exceptions in multimodal graders - Update tests to verify GraderError behavior - Move exception handling to aevaluate level for cleaner code
1 parent 2ed52ba commit 255492d

7 files changed

Lines changed: 75 additions & 63 deletions

File tree

openjudge/graders/multimodal/image_coherence.py

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from openjudge.models.base_chat_model import BaseChatModel
2424
from openjudge.models.schema.oai.message import ChatMessage
2525
from openjudge.models.schema.prompt_template import LanguageEnum, PromptTemplate
26+
from openjudge.utils.utils import parse_structured_chat_response
2627

2728
# pylint: disable=line-too-long
2829

@@ -228,24 +229,15 @@ async def _aevaluate_single_image(
228229
data_url = f"data:image/{image_format};base64,{image.base64}"
229230
content.append({"type": "image_url", "image_url": {"url": data_url}})
230231

231-
# Call model without structured output
232232
chat_response = await self.model.achat(
233233
messages=[{"role": "user", "content": content}],
234234
structured_model=GraderScoreCallback,
235235
)
236236

237-
# Handle both streaming and non-streaming responses
238-
if hasattr(chat_response, "__aiter__"):
239-
parsed = {}
240-
async for chunk in chat_response:
241-
if chunk.parsed:
242-
parsed.update(chunk.parsed)
243-
# Default to 5.0 (neutral score on 0-10 scale) for missing fields
244-
score = parsed.get("score", 5.0)
245-
reason = parsed.get("reason", "")
246-
else:
247-
score = chat_response.parsed["score"]
248-
reason = chat_response.parsed["reason"]
237+
# Default to 5.0 (neutral score on 0-10 scale) for missing fields
238+
parsed = await parse_structured_chat_response(chat_response)
239+
score = parsed.get("score", 5.0)
240+
reason = parsed.get("reason", "")
249241
return score, reason
250242

251243
async def _acompute(

openjudge/graders/multimodal/image_helpfulness.py

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
from openjudge.models.base_chat_model import BaseChatModel
2525
from openjudge.models.schema.oai.message import ChatMessage
2626
from openjudge.models.schema.prompt_template import LanguageEnum, PromptTemplate
27+
from openjudge.utils.utils import parse_structured_chat_response
2728

2829
# pylint: disable=line-too-long
2930

@@ -229,18 +230,10 @@ async def _aevaluate_single_image(
229230
structured_model=GraderScoreCallback,
230231
)
231232

232-
# Handle both streaming and non-streaming responses
233-
if hasattr(chat_response, "__aiter__"):
234-
parsed = {}
235-
async for chunk in chat_response:
236-
if chunk.parsed:
237-
parsed.update(chunk.parsed)
238-
# Default to 5.0 (neutral score on 0-10 scale) for missing fields
239-
score = parsed.get("score", 5.0)
240-
reason = parsed.get("reason", "")
241-
else:
242-
score = chat_response.parsed["score"]
243-
reason = chat_response.parsed["reason"]
233+
# Default to 5.0 (neutral score on 0-10 scale) for missing fields
234+
parsed = await parse_structured_chat_response(chat_response)
235+
score = parsed.get("score", 5.0)
236+
reason = parsed.get("reason", "")
244237
return score, reason
245238

246239
async def _acompute(

openjudge/graders/multimodal/text_to_image.py

Lines changed: 10 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
from openjudge.models.openai_chat_model import OpenAIChatModel
2121
from openjudge.models.schema.oai.message import ChatMessage
2222
from openjudge.models.schema.prompt_template import LanguageEnum, PromptTemplate
23+
from openjudge.utils.utils import parse_structured_chat_response
2324

2425
# pylint: disable=line-too-long
2526

@@ -265,20 +266,11 @@ async def _aevaluate_semantic_consistency(
265266
structured_model=GraderScoreCallback,
266267
)
267268

268-
# Handle both streaming and non-streaming responses
269-
if hasattr(chat_response, "__aiter__"):
270-
parsed = {}
271-
async for chunk in chat_response:
272-
if chunk.parsed:
273-
parsed.update(chunk.parsed)
274-
# Default to 5.0 (neutral score on 0-10 scale) for missing fields
275-
score = parsed.get("score", 5.0)
276-
score = score if isinstance(score, list) else [score]
277-
reason = parsed.get("reason", "")
278-
else:
279-
score = chat_response.parsed["score"]
280-
score = score if isinstance(score, list) else [score]
281-
reason = chat_response.parsed["reason"]
269+
# Default to 5.0 (neutral score on 0-10 scale) for missing fields
270+
parsed = await parse_structured_chat_response(chat_response)
271+
score = parsed.get("score", 5.0)
272+
score = score if isinstance(score, list) else [score]
273+
reason = parsed.get("reason", "")
282274
return score, reason
283275

284276
async def _aevaluate_perceptual_quality(
@@ -295,20 +287,11 @@ async def _aevaluate_perceptual_quality(
295287
structured_model=GraderScoreCallback,
296288
)
297289

298-
# Handle both streaming and non-streaming responses
299-
if hasattr(chat_response, "__aiter__"):
300-
parsed = {}
301-
async for chunk in chat_response:
302-
if chunk.parsed:
303-
parsed.update(chunk.parsed)
304-
# Default to 5.0 (neutral score on 0-10 scale) for missing fields
305-
score = parsed.get("score", [5.0, 5.0])
306-
reason = parsed.get("reason", "")
307-
else:
308-
score = chat_response.parsed["score"]
309-
reason = chat_response.parsed["reason"]
310-
290+
# Default to [5.0, 5.0] (neutral scores on 0-10 scale) for missing fields
291+
parsed = await parse_structured_chat_response(chat_response)
292+
score = parsed.get("score", [5.0, 5.0])
311293
score = score[:2] if isinstance(score, list) else [score, score]
294+
reason = parsed.get("reason", "")
312295
return score, reason
313296

314297
async def _a_compute(

openjudge/utils/utils.py

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
"""
77

88
import json
9-
from typing import Any, Dict, Type
9+
from typing import Any, Dict, Optional, Type
1010

1111
from json_repair import repair_json
1212
from loguru import logger
@@ -203,3 +203,42 @@ def trim_and_load_json(response: str, metric: Any = None) -> Dict[str, Any]:
203203
metric_name = getattr(metric, "name", "unknown_metric")
204204
logger.error(f"{metric_name}: {error_msg}")
205205
raise ValueError(error_msg) from e
206+
207+
208+
async def parse_structured_chat_response(
    chat_response: Any,
    default: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """Extract the structured payload from a chat response.

    Works with both streaming responses (async iterators of chunks) and
    plain responses. For a stream, the final chunk carrying a parsed
    payload is taken as the complete result; for a plain response the
    ``parsed`` attribute is returned directly.

    Args:
        chat_response: Result of ``model.achat(...)`` invoked with a
            ``structured_model``; either an async iterator of chunks or a
            single response object, each exposing a ``parsed`` attribute.
        default: Fallback dict returned when no parsed payload is
            available. Treated as an empty dict when omitted.

    Returns:
        Dict[str, Any]: The parsed structured fields, e.g. ``score`` and
        ``reason``.

    Example:
        >>> response = await model.achat(messages, structured_model=GraderScoreCallback)
        >>> parsed = await parse_structured_chat_response(response)
        >>> score = parsed.get("score", 5.0)
        >>> reason = parsed.get("reason", "")
    """
    fallback: Dict[str, Any] = {} if default is None else default

    if not hasattr(chat_response, "__aiter__"):
        # Non-streaming: the response object carries the parse directly.
        return chat_response.parsed if chat_response.parsed else fallback

    # Streaming: remember the most recent chunk that produced a parse —
    # the last such chunk holds the complete structured result.
    latest: Optional[Dict[str, Any]] = None
    async for chunk in chat_response:
        if chunk.parsed:
            latest = chunk.parsed
    return latest if latest is not None else fallback

tests/graders/multimodal/test_image_coherence.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ def __init__(self):
9292
@pytest.mark.asyncio
9393
async def test_error_handling(self):
9494
"""Test graceful error handling"""
95+
from openjudge.graders.base_grader import GraderError
96+
9597
# Create mock model that raises exception
9698
mock_model = AsyncMock()
9799
mock_model.achat = AsyncMock(side_effect=Exception("API Error"))
@@ -105,9 +107,9 @@ async def test_error_handling(self):
105107
response=["Text before", mock_image, "Text after"],
106108
)
107109

108-
# Assertions
109-
assert result.score == 0.0
110-
assert "Evaluation error: API Error" in result.reason
110+
# Assertions - grader returns GraderError on exception
111+
assert isinstance(result, GraderError)
112+
assert "Evaluation error: API Error" in result.error
111113

112114

113115
# ==================== QUALITY TESTS ====================

tests/graders/multimodal/test_image_helpfulness.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ def __init__(self):
9292
@pytest.mark.asyncio
9393
async def test_error_handling(self):
9494
"""Test graceful error handling"""
95+
from openjudge.graders.base_grader import GraderError
96+
9597
# Create mock model that raises exception
9698
mock_model = AsyncMock()
9799
mock_model.achat = AsyncMock(side_effect=Exception("API Error"))
@@ -105,9 +107,9 @@ async def test_error_handling(self):
105107
response=["Text before", mock_image, "Text after"],
106108
)
107109

108-
# Assertions
109-
assert result.score == 0.0
110-
assert "Evaluation error: API Error" in result.reason
110+
# Assertions - grader returns GraderError on exception
111+
assert isinstance(result, GraderError)
112+
assert "Evaluation error: API Error" in result.error
111113

112114

113115
# ==================== QUALITY TESTS ====================

tests/graders/multimodal/test_text_to_image.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,8 @@ def __init__(self, score, reason):
9595
@pytest.mark.asyncio
9696
async def test_error_handling(self):
9797
"""Test graceful error handling"""
98+
from openjudge.graders.base_grader import GraderError
99+
98100
# Create mock model that raises exception
99101
mock_model = AsyncMock(spec=BaseChatModel)
100102
mock_model.achat = AsyncMock(side_effect=Exception("API Error"))
@@ -109,10 +111,9 @@ async def test_error_handling(self):
109111
response=mock_image,
110112
)
111113

112-
# Assertions
113-
# TextToImageGrader returns 0.5 (default) on error, not 0.0
114-
assert result.score == 0.5
115-
assert "error" in result.reason.lower()
114+
# Assertions - grader returns GraderError on exception
115+
assert isinstance(result, GraderError)
116+
assert "Evaluation error: API Error" in result.error
116117

117118

118119
# ==================== QUALITY TESTS ====================

0 commit comments

Comments
 (0)