From 0627289a3b2b8e7b75cdbf275df4d0402039d73c Mon Sep 17 00:00:00 2001
From: Andrew Brookins <a.m.brookins@gmail.com>
Date: Fri, 23 Jan 2026 09:03:22 -0800
Subject: [PATCH] Fix bytes fields failing with UnicodeDecodeError

Bytes fields containing non-UTF8 data (e.g., binary files, images)
caused UnicodeDecodeError because jsonable_encoder called bytes.decode()
without specifying an encoding.

Solution: Use base64 encoding for bytes fields before JSON serialization,
and decode back to bytes on retrieval. This follows the same pattern as
datetime timestamp conversion.

Fixes #779
---
 aredis_om/model/model.py | 122 ++++++++++++++++++++++++++++++++++++++-
 tests/test_hash_model.py |  68 ++++++++++++++++++++++
 tests/test_json_model.py |  98 +++++++++++++++++++++++++++++++
 3 files changed, 287 insertions(+), 1 deletion(-)

diff --git a/aredis_om/model/model.py b/aredis_om/model/model.py
index 8c3e7800..cc9db06a 100644
--- a/aredis_om/model/model.py
+++ b/aredis_om/model/model.py
@@ -216,6 +216,113 @@ def convert_timestamp_to_datetime(obj, model_fields):
         return obj
 
 
+def convert_bytes_to_base64(obj):
+    """Convert bytes objects to base64-encoded strings for storage.
+
+    This is necessary because Redis JSON and the jsonable_encoder cannot
+    handle arbitrary binary data. Base64 encoding ensures all byte values
+    (0-255) can be safely stored and retrieved.
+    """
+    import base64
+
+    if isinstance(obj, dict):
+        return {key: convert_bytes_to_base64(value) for key, value in obj.items()}
+    elif isinstance(obj, list):
+        return [convert_bytes_to_base64(item) for item in obj]
+    elif isinstance(obj, bytes):
+        return base64.b64encode(obj).decode("ascii")
+    else:
+        return obj
+
+
+def convert_base64_to_bytes(obj, model_fields):
+    """Convert base64-encoded strings back to bytes based on model field types."""
+    import base64
+
+    if isinstance(obj, dict):
+        result = {}
+        for key, value in obj.items():
+            if key in model_fields:
+                field_info = model_fields[key]
+                field_type = (
+                    field_info.annotation if hasattr(field_info, "annotation") else None
+                )
+
+                # Handle Optional types - extract the inner type
+                if hasattr(field_type, "__origin__") and field_type.__origin__ is Union:
+                    # For Optional[T] which is Union[T, None], get the non-None type
+                    args = getattr(field_type, "__args__", ())
+                    non_none_types = [
+                        arg for arg in args if arg is not type(None)  # noqa: E721
+                    ]
+                    if len(non_none_types) == 1:
+                        field_type = non_none_types[0]
+
+                # Handle bytes fields
+                if field_type is bytes and isinstance(value, str):
+                    try:
+                        result[key] = base64.b64decode(value)
+                    except (ValueError, TypeError):
+                        # If it's not valid base64, keep original value
+                        result[key] = value
+                # Handle nested models - check if it's a model with fields
+                elif isinstance(value, dict):
+                    try:
+                        if (
+                            isinstance(field_type, type)
+                            and hasattr(field_type, "model_fields")
+                            and field_type.model_fields
+                        ):
+                            result[key] = convert_base64_to_bytes(
+                                value, field_type.model_fields
+                            )
+                        else:
+                            result[key] = convert_base64_to_bytes(value, {})
+                    except (TypeError, AttributeError):
+                        result[key] = convert_base64_to_bytes(value, {})
+                # Handle lists that might contain nested models
+                elif isinstance(value, list):
+                    # Try to extract the inner type from List[SomeModel]
+                    inner_type = None
+                    if (
+                        hasattr(field_type, "__origin__")
+                        and field_type.__origin__ in (list, List)
+                        and hasattr(field_type, "__args__")
+                        and field_type.__args__
+                    ):
+                        inner_type = field_type.__args__[0]
+
+                    if inner_type is not None:
+                        try:
+                            if (
+                                isinstance(inner_type, type)
+                                and hasattr(inner_type, "model_fields")
+                                and inner_type.model_fields
+                            ):
+                                result[key] = [
+                                    convert_base64_to_bytes(item, inner_type.model_fields)
+                                    if isinstance(item, dict)
+                                    else item
+                                    for item in value
+                                ]
+                            else:
+                                result[key] = convert_base64_to_bytes(value, {})
+                        except (TypeError, AttributeError):
+                            result[key] = convert_base64_to_bytes(value, {})
+                    else:
+                        result[key] = convert_base64_to_bytes(value, {})
+                else:
+                    result[key] = convert_base64_to_bytes(value, {})
+            else:
+                # For keys not in model_fields, still recurse but with empty field info
+                result[key] = convert_base64_to_bytes(value, {})
+        return result
+    elif isinstance(obj, list):
+        return [convert_base64_to_bytes(item, model_fields) for item in obj]
+    else:
+        return obj
+
+
 class PartialModel:
     """A partial model instance that only contains certain fields.
 
@@ -2558,10 +2665,14 @@ def to_string(s):
                 json_fields = convert_timestamp_to_datetime(
                     json_fields, cls.model_fields
                 )
+                # Convert base64 strings back to bytes for bytes fields
+                json_fields = convert_base64_to_bytes(json_fields, cls.model_fields)
                 doc = cls(**json_fields)
             else:
                 # Convert timestamps back to datetime objects
                 fields = convert_timestamp_to_datetime(fields, cls.model_fields)
+                # Convert base64 strings back to bytes for bytes fields
+                fields = convert_base64_to_bytes(fields, cls.model_fields)
                 doc = cls(**fields)
 
             docs.append(doc)
@@ -2752,9 +2863,10 @@ async def save(
         self.check()
         db = self._get_db(pipeline)
 
-        # Get model data and convert datetime objects first
+        # Get model data and apply conversions in the correct order
         document = self.model_dump()
         document = convert_datetime_to_timestamp(document)
+        document = convert_bytes_to_base64(document)
 
         # Then apply jsonable encoding for other types
         document = jsonable_encoder(document)
@@ -2854,6 +2966,8 @@ async def get(cls: Type["Model"], pk: Any) -> "Model":
         try:
             # Convert timestamps back to datetime objects before validation
             document = convert_timestamp_to_datetime(document, cls.model_fields)
+            # Convert base64 strings back to bytes for bytes fields
+            document = convert_base64_to_bytes(document, cls.model_fields)
             result = cls.model_validate(document)
         except TypeError as e:
             log.warning(
@@ -2865,6 +2979,8 @@ async def get(cls: Type["Model"], pk: Any) -> "Model":
             document = decode_redis_value(document, cls.Meta.encoding)
             # Convert timestamps back to datetime objects after decoding
             document = convert_timestamp_to_datetime(document, cls.model_fields)
+            # Convert base64 strings back to bytes for bytes fields
+            document = convert_base64_to_bytes(document, cls.model_fields)
             result = cls.model_validate(document)
         return result
 
@@ -3126,6 +3242,8 @@ async def save(
         data = self.model_dump()
         # Convert datetime objects to timestamps for proper indexing
         data = convert_datetime_to_timestamp(data)
+        # Convert bytes to base64 strings for safe JSON storage
+        data = convert_bytes_to_base64(data)
         # Apply JSON encoding for complex types (Enums, UUIDs, Sets, etc.)
         data = jsonable_encoder(data)
 
@@ -3199,6 +3317,8 @@ async def get(cls: Type["Model"], pk: Any) -> "Model":
             raise NotFoundError
         # Convert timestamps back to datetime objects before validation
         document_data = convert_timestamp_to_datetime(document_data, cls.model_fields)
+        # Convert base64 strings back to bytes for bytes fields
+        document_data = convert_base64_to_bytes(document_data, cls.model_fields)
         return cls.model_validate(document_data)
 
     @classmethod
diff --git a/tests/test_hash_model.py b/tests/test_hash_model.py
index 22cd8366..5a010d4b 100644
--- a/tests/test_hash_model.py
+++ b/tests/test_hash_model.py
@@ -1467,3 +1467,71 @@ async def test_save_nx_with_pipeline_raises_error(m):
     async with m.Member.db().pipeline(transaction=True) as pipe:
         with pytest.raises(ValueError, match="Cannot use nx or xx with pipeline"):
             await member.save(pipeline=pipe, nx=True)
+
+
+
+
+@py_test_mark_asyncio
+async def test_bytes_field_with_binary_data(key_prefix, redis):
+    """Test that bytes fields can store arbitrary binary data including non-UTF8 bytes.
+
+    Regression test for GitHub issue #779: bytes fields failed with UnicodeDecodeError
+    when storing actual binary data (non-UTF8 bytes).
+    """
+
+    class FileHash(HashModel, index=True):
+        filename: str = Field(index=True)
+        content: bytes
+
+        class Meta:
+            global_key_prefix = key_prefix
+            database = redis
+
+    await Migrator().run()
+
+    # Test with binary data that is NOT valid UTF-8 (PNG header)
+    binary_content = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR"
+
+    f = FileHash(filename="image.png", content=binary_content)
+    await f.save()
+
+    # Retrieve and verify
+    retrieved = await FileHash.get(f.pk)
+    assert retrieved.content == binary_content
+    assert retrieved.filename == "image.png"
+
+    # Test with null bytes and other non-printable characters
+    null_content = b"\x00\x01\x02\x03\xff\xfe\xfd"
+    f2 = FileHash(filename="binary.bin", content=null_content)
+    await f2.save()
+
+    retrieved2 = await FileHash.get(f2.pk)
+    assert retrieved2.content == null_content
+
+
+@py_test_mark_asyncio
+async def test_optional_bytes_field(key_prefix, redis):
+    """Test that Optional[bytes] fields work correctly."""
+    from typing import Optional
+
+    class Attachment(HashModel, index=True):
+        name: str = Field(index=True)
+        data: Optional[bytes] = None
+
+        class Meta:
+            global_key_prefix = key_prefix
+            database = redis
+
+    await Migrator().run()
+
+    # Without data
+    a1 = Attachment(name="empty")
+    await a1.save()
+    r1 = await Attachment.get(a1.pk)
+    assert r1.data is None
+
+    # With binary data
+    a2 = Attachment(name="with_data", data=b"\x89PNG\x00\xff")
+    await a2.save()
+    r2 = await Attachment.get(a2.pk)
+    assert r2.data == b"\x89PNG\x00\xff"
diff --git a/tests/test_json_model.py b/tests/test_json_model.py
index d3537b0d..7375a57f 100644
--- a/tests/test_json_model.py
+++ b/tests/test_json_model.py
@@ -1892,3 +1892,101 @@ class Meta:
     assert "normal_field" in schema_str
     # Case sensitive fields use CASESENSITIVE in schema
     assert "CASESENSITIVE" in schema_str
+
+
+
+
+@py_test_mark_asyncio
+async def test_bytes_field_with_binary_data(key_prefix, redis):
+    """Test that bytes fields can store arbitrary binary data including non-UTF8 bytes.
+
+    Regression test for GitHub issue #779: bytes fields failed with UnicodeDecodeError
+    when storing actual binary data (non-UTF8 bytes).
+    """
+
+    class FileJson(JsonModel, index=True):
+        filename: str = Field(index=True)
+        content: bytes
+
+        class Meta:
+            global_key_prefix = key_prefix
+            database = redis
+
+    await Migrator().run()
+
+    # Test with binary data that is NOT valid UTF-8 (PNG header)
+    binary_content = b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR"
+
+    f = FileJson(filename="image.png", content=binary_content)
+    await f.save()
+
+    # Retrieve and verify
+    retrieved = await FileJson.get(f.pk)
+    assert retrieved.content == binary_content
+    assert retrieved.filename == "image.png"
+
+    # Test with null bytes and other non-printable characters
+    null_content = b"\x00\x01\x02\x03\xff\xfe\xfd"
+    f2 = FileJson(filename="binary.bin", content=null_content)
+    await f2.save()
+
+    retrieved2 = await FileJson.get(f2.pk)
+    assert retrieved2.content == null_content
+
+
+@py_test_mark_asyncio
+async def test_optional_bytes_field(key_prefix, redis):
+    """Test that Optional[bytes] fields work correctly."""
+    from typing import Optional
+
+    class Attachment(JsonModel, index=True):
+        name: str = Field(index=True)
+        data: Optional[bytes] = None
+
+        class Meta:
+            global_key_prefix = key_prefix
+            database = redis
+
+    await Migrator().run()
+
+    # Without data
+    a1 = Attachment(name="empty")
+    await a1.save()
+    r1 = await Attachment.get(a1.pk)
+    assert r1.data is None
+
+    # With binary data
+    a2 = Attachment(name="with_data", data=b"\x89PNG\x00\xff")
+    await a2.save()
+    r2 = await Attachment.get(a2.pk)
+    assert r2.data == b"\x89PNG\x00\xff"
+
+
+@py_test_mark_asyncio
+async def test_bytes_field_in_embedded_model(key_prefix, redis):
+    """Test that bytes fields work in embedded models."""
+
+    class FileData(EmbeddedJsonModel):
+        content: bytes
+        mime_type: str
+
+    class Document(JsonModel, index=True):
+        name: str = Field(index=True)
+        file: FileData
+
+        class Meta:
+            global_key_prefix = key_prefix
+            database = redis
+
+    await Migrator().run()
+
+    binary_content = b"\x89PNG\r\n\x1a\n\x00\x00"
+    doc = Document(
+        name="test.png",
+        file=FileData(content=binary_content, mime_type="image/png"),
+    )
+    await doc.save()
+
+    retrieved = await Document.get(doc.pk)
+    assert retrieved.file.content == binary_content
+    assert retrieved.file.mime_type == "image/png"