Skip to content

Commit a3c6200

Browse files
committed
fix: Address Cursor Bugbot review feedback
- Fix multiple embedding types getting wrong indices by tracking used_batch_indices per embedding type instead of a shared set
- Fix fallback parser to use batch_texts when the API doesn't return texts
- Remove unused variables (current_path, in_embeddings) and dead code
- Remove unused stream_embed_response convenience function
1 parent d8bb1e7 commit a3c6200

3 files changed

Lines changed: 27 additions & 34 deletions

File tree

src/cohere/base_client.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1221,20 +1221,28 @@ def embed_stream(
12211221

12221222
# Parse embeddings from response incrementally
12231223
parser = StreamingEmbedParser(response._response, batch_texts)
1224-
# Track used indices to handle duplicate texts correctly
1225-
used_batch_indices = set()
1224+
# Track used indices per embedding type to handle:
1225+
# 1. Duplicate texts within a batch
1226+
# 2. Multiple embedding types (float, int8, etc.) for the same texts
1227+
used_batch_indices_by_type: dict[str, set[int]] = {}
12261228

12271229
for embedding in parser.iter_embeddings():
12281230
# The parser sets embedding.text correctly for multiple embedding types
12291231
# Adjust the global index based on text position in batch
12301232
if embedding.text and embedding.text in batch_texts:
1233+
# Get or create the set of used indices for this embedding type
1234+
emb_type = embedding.embedding_type
1235+
if emb_type not in used_batch_indices_by_type:
1236+
used_batch_indices_by_type[emb_type] = set()
1237+
used_indices = used_batch_indices_by_type[emb_type]
1238+
12311239
# Find the next unused occurrence of this text in the batch
12321240
# This handles duplicate texts correctly
12331241
text_idx_in_batch = None
12341242
for idx, text in enumerate(batch_texts):
1235-
if text == embedding.text and idx not in used_batch_indices:
1243+
if text == embedding.text and idx not in used_indices:
12361244
text_idx_in_batch = idx
1237-
used_batch_indices.add(idx)
1245+
used_indices.add(idx)
12381246
break
12391247

12401248
if text_idx_in_batch is not None:

src/cohere/streaming_utils.py

Lines changed: 2 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -80,22 +80,13 @@ def iter_embeddings(self) -> Iterator[StreamedEmbedding]:
8080

8181
def _parse_with_ijson(self, parser) -> Iterator[StreamedEmbedding]:
8282
"""Parse embeddings using ijson incremental parser."""
83-
current_path: List[str] = []
8483
current_embedding = []
8584
# Track text index separately per embedding type
8685
# When multiple types requested, each text gets multiple embeddings
8786
type_text_indices: dict = {}
88-
embedding_type = "float"
8987
response_type = None
90-
in_embeddings = False
9188

9289
for prefix, event, value in parser:
93-
# Track current path
94-
if event == 'map_key':
95-
if current_path and current_path[-1] == 'embeddings':
96-
# This is an embedding type key (float_, int8, etc.)
97-
embedding_type = value.rstrip('_')
98-
9990
# Detect response type
10091
if prefix == 'response_type':
10192
response_type = value
@@ -170,10 +161,11 @@ def _iter_embeddings_fallback(self) -> Iterator[StreamedEmbedding]:
170161
def _iter_embeddings_fallback_from_dict(self, data: dict) -> Iterator[StreamedEmbedding]:
171162
"""Parse embeddings from a dictionary (used by fallback methods)."""
172163
response_type = data.get('response_type', '')
164+
# Use batch_texts from constructor as fallback if API doesn't return texts
165+
texts = data.get('texts') or self.batch_texts
173166

174167
if response_type == 'embeddings_floats':
175168
embeddings = data.get('embeddings', [])
176-
texts = data.get('texts', [])
177169
for i, embedding in enumerate(embeddings):
178170
yield StreamedEmbedding(
179171
index=self.embeddings_yielded + i,
@@ -184,7 +176,6 @@ def _iter_embeddings_fallback_from_dict(self, data: dict) -> Iterator[StreamedEm
184176

185177
elif response_type == 'embeddings_by_type':
186178
embeddings_obj = data.get('embeddings', {})
187-
texts = data.get('texts', [])
188179

189180
# Iterate through each embedding type
190181
for emb_type, embeddings_list in embeddings_obj.items():
@@ -198,18 +189,3 @@ def _iter_embeddings_fallback_from_dict(self, data: dict) -> Iterator[StreamedEm
198189
text=texts[i] if i < len(texts) else None
199190
)
200191
self.embeddings_yielded += 1
201-
202-
203-
def stream_embed_response(response: httpx.Response, texts: List[str]) -> Iterator[StreamedEmbedding]:
204-
"""
205-
Convenience function to stream embeddings from a response.
206-
207-
Args:
208-
response: The httpx response containing embeddings
209-
texts: The original texts that were embedded
210-
211-
Yields:
212-
StreamedEmbedding objects
213-
"""
214-
parser = StreamingEmbedParser(response, texts)
215-
yield from parser.iter_embeddings()

src/cohere/v2/client.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -602,21 +602,30 @@ def embed_stream(
602602

603603
# Parse embeddings from response incrementally
604604
parser = StreamingEmbedParser(response._response, batch_texts)
605-
# Track used indices to handle duplicate texts correctly
606-
used_batch_indices: set[int] = set()
605+
# Track used indices per embedding type to handle:
606+
# 1. Duplicate texts within a batch
607+
# 2. Multiple embedding types (float, int8, etc.) for the same texts
608+
used_batch_indices_by_type: dict[str, set[int]] = {}
607609

608610
for embedding in parser.iter_embeddings():
609611
# The parser sets embedding.text correctly for multiple embedding types
610612
# Adjust the global index based on text position in batch
611613
if embedding.text and embedding.text in batch_texts:
614+
# Get or create the set of used indices for this embedding type
615+
emb_type = embedding.embedding_type
616+
if emb_type not in used_batch_indices_by_type:
617+
used_batch_indices_by_type[emb_type] = set()
618+
used_indices = used_batch_indices_by_type[emb_type]
619+
612620
# Find the next unused occurrence of this text in the batch
613621
# This handles duplicate texts correctly
614622
text_idx_in_batch = None
615623
for idx, text in enumerate(batch_texts):
616-
if text == embedding.text and idx not in used_batch_indices:
624+
if text == embedding.text and idx not in used_indices:
617625
text_idx_in_batch = idx
618-
used_batch_indices.add(idx)
626+
used_indices.add(idx)
619627
break
628+
620629
if text_idx_in_batch is not None:
621630
embedding.index = batch_start + text_idx_in_batch
622631
yield embedding

0 commit comments

Comments (0)