Skip to content

Commit d40b459

Browse files
CM-57848-Fix UTF encoding for Windows characters
1 parent 3625c5c commit d40b459

File tree

2 files changed

+34
-1
lines changed

2 files changed

+34
-1
lines changed

cycode/cli/files_collector/models/in_memory_zip.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,11 @@ def append(self, filename: str, unique_id: Optional[str], content: str) -> None:
2626
if unique_id:
2727
filename = concat_unique_id(filename, unique_id)
2828

29-
self.zip.writestr(filename, content)
29+
# Encode content to bytes with error handling to handle surrogate characters
30+
# that cannot be encoded to UTF-8. Use 'replace' to replace invalid characters
31+
# with '?' (the marker str.encode substitutes; U+FFFD is only used when decoding).
32+
content_bytes = content.encode('utf-8', errors='replace')
33+
self.zip.writestr(filename, content_bytes)
3034

3135
def close(self) -> None:
3236
self.zip.close()
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
"""Tests for InMemoryZip class, specifically for handling surrogate characters and encoding issues."""
2+
3+
import zipfile
4+
from io import BytesIO
5+
6+
from cycode.cli.files_collector.models.in_memory_zip import InMemoryZip
7+
8+
9+
def test_append_with_surrogate_characters() -> None:
    """Check that content holding a lone surrogate can be appended and read back.

    A lone surrogate code point (U+D800..U+DFFF) cannot be encoded to UTF-8 in
    strict mode. The test appends such content, then re-opens the archive and
    verifies the exact text that was stored.
    """
    zip_file = InMemoryZip()
    content = 'Normal text \udc96 more text'

    # Must not raise UnicodeEncodeError.
    zip_file.append('test.txt', None, content)
    zip_file.close()

    # The archive must contain some data.
    zip_data = zip_file.read()
    assert len(zip_data) > 0

    with zipfile.ZipFile(BytesIO(zip_data), 'r') as zf:
        extracted = zf.read('test.txt').decode('utf-8')

    # When encoding with errors='replace', Python substitutes '?' for the
    # unencodable character (U+FFFD is only used on the decoding side).
    # Asserting the exact text is deliberate: a strict UTF-8 decode can never
    # yield a surrogate, so merely checking that '\udc96' is absent would pass
    # regardless of what was written.
    assert extracted == 'Normal text ? more text'

0 commit comments

Comments
 (0)