CM-57848-Fix UTF encoding for Windows characters

mateusz-sterczewski · mateusz-sterczewski · commit d40b459fcf87 · 2026-01-21T14:46:12.000+01:00
diff --git a/cycode/cli/files_collector/models/in_memory_zip.py b/cycode/cli/files_collector/models/in_memory_zip.py
@@ -26,7 +26,11 @@ def append(self, filename: str, unique_id: Optional[str], content: str) -> None:
         if unique_id:
             filename = concat_unique_id(filename, unique_id)
 
-        self.zip.writestr(filename, content)
+        # Encode explicitly so that lone surrogates (which cannot be encoded to
+        # UTF-8) do not raise UnicodeEncodeError. With errors='replace', str.encode
+        # substitutes '?' for each unencodable character (U+FFFD is used only on decode).
+        content_bytes = content.encode('utf-8', errors='replace')
+        self.zip.writestr(filename, content_bytes)
 
     def close(self) -> None:
         self.zip.close()
diff --git a/tests/cli/files_collector/test_in_memory_zip.py b/tests/cli/files_collector/test_in_memory_zip.py
@@ -0,0 +1,29 @@
+"""Tests for InMemoryZip class, specifically for handling surrogate characters and encoding issues."""
+
+import zipfile
+from io import BytesIO
+
+from cycode.cli.files_collector.models.in_memory_zip import InMemoryZip
+
+
+def test_append_with_surrogate_characters() -> None:
+    """Test that a lone surrogate in the content is replaced instead of raising an encoding error."""
+    # Lone surrogates (U+D800 to U+DFFF) cannot be encoded to UTF-8
+    zip_file = InMemoryZip()
+    content = 'Normal text \udc96 more text'
+
+    # Should not raise UnicodeEncodeError
+    zip_file.append('test.txt', None, content)
+    zip_file.close()
+
+    # Verify the ZIP was created successfully
+    zip_data = zip_file.read()
+    assert len(zip_data) > 0
+
+    # Read the entry back; a strict decode proves the stored bytes are valid UTF-8
+    with zipfile.ZipFile(BytesIO(zip_data), 'r') as zf:
+        extracted = zf.read('test.txt').decode('utf-8')
+
+    # str.encode(errors='replace') substitutes '?' for each unencodable character.
+    # An exact match also proves the surrounding text was left untouched.
+    assert extracted == 'Normal text ? more text'