Skip to content

Commit e2bfb37

Browse files
committed
Add UUID conversion to and from 16 byte fixed sequences
UUIDs are often passed around in application code in their canonical, hex as string representation e.g. "550e8400-e29b-41d4-a716-446655440000". Encoding UUIDs as Avro "string"s takes 37 bytes, while encoding UUIDs in their binary form fits into a 16 byte sized "fixed", saving 21 bytes per encoding. This change allows application code to keep passing around canonical hex UUIDs while converting to the compact encoding, requiring only `uuid_format: :canonical_string` to be given in decode options. The [Java reference implementation][java-implementation] also supports encoding UUIDs as both strings and 16 byte fixed sequences. * Encoding is augmented such that a 16 byte fixed schema with `%{"logicalType" => "uuid"}`, converts a hex-string UUID to the 16 byte binary representation. * Decoding is augmented such that given `uuid_format: :canonical_string` in decode options, the binary representation is converted to the canonical hex-string representation. The encoding change is nearly backwards-compatible, previously when given an incorrectly sized "fixed" with `{"logicalType": "uuid"}`, an error was raised, while now conversion is attempted. The decoding change is fully backwards-compatible, as `uuid_format` defaults to `:binary`. For UUID codec, the `uniq` library was added (no transitive dependencies). [java-implementation]: https://github.com/apache/avro/blob/230414abbb68e63e68f3b55bfc0cbca94f2737f6/lang/java/avro/src/main/java/org/apache/avro/LogicalTypes.java#L291-L309
1 parent f4091e2 commit e2bfb37

9 files changed

Lines changed: 124 additions & 20 deletions

File tree

lib/avro_ex.ex

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,14 @@ defmodule AvroEx do
136136
of blocks with their counts. This allows consumers of the encoded data to skip
137137
over those blocks in an efficient manner. Using the option `include_block_byte_size: true`
138138
enables adding those additional values.
139+
140+
## UUID encoding
141+
142+
UUIDs can be encoded as strings using the canonical hex representation, which takes 37 bytes.
143+
Alternatively, encoding UUIDs in their 16 byte binary representation is much
144+
more compact, saving 21 bytes per encoding.
145+
See "UUIDs" on `decode/3` for how to convert binary representations back to
146+
canonical strings during decoding.
139147
"""
140148
@spec encode(Schema.t(), term, keyword()) ::
141149
{:ok, encoded_avro} | {:error, AvroEx.EncodeError.t() | Exception.t()}
@@ -185,6 +193,19 @@ defmodule AvroEx do
185193
186194
Otherwise, an approximate number is calculated.
187195
196+
## UUIDs
197+
198+
When decoding a 16 byte fixed quantity with logical type "uuid", specify
199+
`uuid_format: :binary` to retain the binary representation or
200+
`uuid_format: :canonical_string` to convert to the canonical, hex as string representation.
201+
202+
iex> schema = AvroEx.decode_schema!(~S({"type": "fixed", "size": 16, "name": "fixed_uuid", "logicalType":"uuid"}))
203+
iex> binary_uuid = <<85, 14, 132, 0, 226, 155, 65, 212, 167, 22, 68, 102, 85, 68, 0, 0>>
204+
iex> AvroEx.decode(schema, binary_uuid, uuid_format: :binary)
205+
{:ok, binary_uuid}
206+
iex> AvroEx.decode(schema, binary_uuid, uuid_format: :canonical_string)
207+
{:ok, "550e8400-e29b-41d4-a716-446655440000"}
208+
188209
"""
189210
@spec decode(Schema.t(), encoded_avro, keyword()) ::
190211
{:ok, term}

lib/avro_ex/decode.ex

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -270,6 +270,25 @@ defmodule AvroEx.Decode do
270270
{:lists.nth(index + 1, symbols), rest}
271271
end
272272

273+
# Decodes a 16-byte `fixed` whose schema metadata carries the "uuid" logical type.
#
# Splits the first 16 bytes off `data` and returns `{value, rest}`. The shape of
# `value` is selected by the `:uuid_format` option:
#
#   * `:binary` (the default) - the 16 raw bytes, returned without validation
#   * `:canonical_string`     - the string produced by `Uniq.UUID.to_string/2`
#                               with the `:default` format
#
# Any other `:uuid_format` value matches no branch of the `case` below.
defp do_decode(%Fixed{size: size = 16, metadata: %{"logicalType" => "uuid"}}, %Context{}, data, opts)
274+
when is_binary(data) do
275+
<<fixed::binary-size(size), rest::binary>> = data
276+
277+
case Keyword.get(opts, :uuid_format, :binary) do
278+
:binary ->
279+
{fixed, rest}
280+
281+
:canonical_string ->
282+
# Only this branch parses the bytes, so only here can an invalid UUID be reported.
case Uniq.UUID.parse(fixed) do
283+
{:ok, uuid} ->
284+
{Uniq.UUID.to_string(uuid, :default), rest}
285+
286+
# Anything other than `{:ok, uuid}` is reported as an invalid binary UUID.
_ ->
287+
error({:invalid_binary_uuid, fixed})
288+
end
289+
end
290+
end
291+
273292
defp do_decode(%Fixed{size: size}, %Context{}, data, _) when is_binary(data) do
274293
<<fixed::binary-size(size), rest::binary>> = data
275294
{fixed, rest}

lib/avro_ex/decode_error.ex

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,9 @@ defmodule AvroEx.DecodeError do
1212
message = "Invalid UTF-8 string found #{inspect(str)}."
1313
%__MODULE__{message: message}
1414
end
15+
16+
# Builds the error struct for a binary that could not be interpreted as a UUID.
# The offending bytes are embedded in the message via `inspect/1`.
def new({:invalid_binary_uuid, binary_uuid}) do
17+
message = "Invalid binary UUID found #{inspect(binary_uuid)}."
18+
%__MODULE__{message: message}
19+
end
1520
end

lib/avro_ex/encode.ex

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,11 @@ defmodule AvroEx.Encode do
169169
bin
170170
end
171171

172+
# Encodes a binary for a 16-byte `fixed` with the "uuid" logical type by first
# converting it with `Uniq.UUID.string_to_binary!/1`, then re-dispatching to
# `do_encode/4` with the converted value.
#
# NOTE(review): presumably a preceding clause already handles binaries whose
# size equals the schema size, so this clause only sees other binaries (e.g. the
# canonical hex string) - confirm against the clause order in the full module.
# NOTE(review): `string_to_binary!` is a bang function; a binary that is not a
# UUID string likely raises here instead of reaching the `:incorrect_fixed_size`
# clause below - verify the resulting error is acceptable to callers.
defp do_encode(%Fixed{size: 16, metadata: %{"logicalType" => "uuid"}} = f, %Context{} = context, bin, opts)
173+
when is_binary(bin) do
174+
do_encode(f, context, Uniq.UUID.string_to_binary!(bin), opts)
175+
end
176+
172177
defp do_encode(%Fixed{} = fixed, %Context{} = context, bin, _) when is_binary(bin) do
173178
error({:incorrect_fixed_size, fixed, bin, context})
174179
end

lib/avro_ex/schema/fixed.ex

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,5 +19,10 @@ defmodule AvroEx.Schema.Fixed do
1919
true
2020
end
2121

22+
# For a 16-byte fixed schema with the "uuid" logical type, a binary matches
# when `Uniq.UUID.valid?/1` accepts it.
# NOTE(review): presumably this is what lets a canonical hex string select the
# fixed-uuid branch of a union - confirm against the union encoder.
def match?(%__MODULE__{size: 16, metadata: %{"logicalType" => "uuid"}}, %Context{}, data)
23+
when is_binary(data) do
24+
Uniq.UUID.valid?(data)
25+
end
26+
2227
def match?(_fixed, _context, _data), do: false
2328
end

mix.exs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,8 @@ defmodule AvroEx.Mixfile do
3737
{:dialyxir, "~> 1.1", only: :dev, runtime: false},
3838
{:ex_doc, "~> 0.20", only: :dev, runtime: false},
3939
{:stream_data, "~> 0.5", only: [:dev, :test]},
40-
{:decimal, "~> 2.0", optional: true}
40+
{:decimal, "~> 2.0", optional: true},
41+
{:uniq, "~> 0.6"}
4142
]
4243
end
4344

mix.lock

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,4 +14,5 @@
1414
"nimble_parsec": {:hex, :nimble_parsec, "1.2.3", "244836e6e3f1200c7f30cb56733fd808744eca61fd182f731eac4af635cc6d0b", [:mix], [], "hexpm", "c8d789e39b9131acf7b99291e93dae60ab48ef14a7ee9d58c6964f59efb570b0"},
1515
"stream_data": {:hex, :stream_data, "0.5.0", "b27641e58941685c75b353577dc602c9d2c12292dd84babf506c2033cd97893e", [:mix], [], "hexpm", "012bd2eec069ada4db3411f9115ccafa38540a3c78c4c0349f151fc761b9e271"},
1616
"typed_struct": {:hex, :typed_struct, "0.3.0", "939789e3c1dca39d7170c87f729127469d1315dcf99fee8e152bb774b17e7ff7", [:mix], [], "hexpm", "c50bd5c3a61fe4e198a8504f939be3d3c85903b382bde4865579bc23111d1b6d"},
17+
"uniq": {:hex, :uniq, "0.6.1", "369660ecbc19051be526df3aa85dc393af5f61f45209bce2fa6d7adb051ae03c", [:mix], [{:ecto, "~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm", "6426c34d677054b3056947125b22e0daafd10367b85f349e24ac60f44effb916"},
1718
}

test/decode_test.exs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,20 @@ defmodule AvroEx.Decode.Test do
344344
"decimalField4" => 5.3e-11
345345
}
346346
end
347+
348+
# Decoding a 16-byte fixed "uuid": raw bytes by default, canonical string on request.
test "16 byte fixed uuid" do
349+
{:ok, fixed_uuid_schema} =
350+
AvroEx.decode_schema(~S({"type": "fixed", "size": 16, "name": "fixed_uuid", "logicalType":"uuid"}))
351+
352+
# Example from https://en.wikipedia.org/wiki/Universally_unique_identifier#Textual_representation
353+
canonical_string = "550e8400-e29b-41d4-a716-446655440000"
354+
# Raw bytes of the UUID above, written as a single unsigned integer.
binary = :binary.encode_unsigned(113_059_749_145_936_325_402_354_257_176_981_405_696)
355+
356+
# Explicit `:binary` and the default must both return the bytes untouched.
assert {:ok, ^binary} = AvroEx.decode(fixed_uuid_schema, binary, uuid_format: :binary)
357+
assert {:ok, ^binary} = AvroEx.decode(fixed_uuid_schema, binary)
358+
359+
assert {:ok, ^canonical_string} = AvroEx.decode(fixed_uuid_schema, binary, uuid_format: :canonical_string)
360+
end
347361
end
348362

349363
describe "DecodingError" do
@@ -354,5 +368,18 @@ defmodule AvroEx.Decode.Test do
354368
AvroEx.decode!(schema, <<"\nhell", 0xFFFF::16>>)
355369
end
356370
end
371+
372+
# Requesting the canonical string for 16 bytes that are not a UUID must raise
# a DecodeError carrying the offending bytes.
test "invalid fixed uuid" do
373+
{:ok, fixed_uuid_schema} =
374+
AvroEx.decode_schema(~S({"type": "fixed", "size": 16, "name": "fixed_uuid", "logicalType":"uuid"}))
375+
376+
# Sixteen bytes, each with value 1.
non_uuid_binary = :binary.list_to_bin(List.duplicate(1, 16))
377+
378+
assert_raise DecodeError,
379+
"Invalid binary UUID found <<1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1>>.",
380+
fn ->
381+
AvroEx.decode!(fixed_uuid_schema, non_uuid_binary, uuid_format: :canonical_string)
382+
end
383+
end
357384
end
358385
end

test/encode_test.exs

Lines changed: 39 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -134,6 +134,19 @@ defmodule AvroEx.Encode.Test do
134134
"decimalField4" => 5.3e-11
135135
}
136136
end
137+
138+
# Encoding a 16-byte fixed "uuid" must yield the same 16 bytes whether the
# caller supplies the canonical hex string or the raw binary.
test "16 byte fixed uuid" do
139+
assert %AvroEx.Schema{} =
140+
schema =
141+
AvroEx.decode_schema!(%{"type" => "fixed", "size" => 16, "name" => "fixed_uuid", "logicalType" => "uuid"})
142+
143+
# Example from https://en.wikipedia.org/wiki/Universally_unique_identifier#Textual_representation
144+
canonical_string = "550e8400-e29b-41d4-a716-446655440000"
145+
# Raw bytes of the UUID above, written as a single unsigned integer.
binary = :binary.encode_unsigned(113_059_749_145_936_325_402_354_257_176_981_405_696)
146+
147+
assert {:ok, ^binary} = AvroEx.encode(schema, canonical_string)
148+
assert {:ok, ^binary} = AvroEx.encode(schema, binary)
149+
end
137150
end
138151

139152
describe "variable_integer_encode" do
@@ -282,51 +295,50 @@ defmodule AvroEx.Encode.Test do
282295
end
283296

284297
describe "encode (union)" do
298+
# Test helper: returns `index` encoded with the "int" schema. The union tests
# prepend this to the encoded branch value to build the expected union bytes.
defp union_index(index) do
299+
{:ok, int_schema} = AvroEx.decode_schema(~S("int"))
300+
# Rebinds `index` from the integer argument to its encoded form.
{:ok, index} = @test_module.encode(int_schema, index)
301+
index
302+
end
303+
285304
test "works as expected with nulls" do
286305
{:ok, schema} = AvroEx.decode_schema(~S(["null", "int"]))
287306
{:ok, null_schema} = AvroEx.decode_schema(~S("null"))
288-
{:ok, int_schema} = AvroEx.decode_schema(~S("int"))
289307

290-
{:ok, index} = @test_module.encode(int_schema, 0)
291308
{:ok, encoded_null} = @test_module.encode(null_schema, nil)
292309
{:ok, encoded_union} = @test_module.encode(schema, nil)
293310

294-
assert encoded_union == index <> encoded_null
311+
assert encoded_union == union_index(0) <> encoded_null
295312
end
296313

297314
test "works as expected with ints" do
298315
{:ok, schema} = AvroEx.decode_schema(~S(["null", "int"]))
299316
{:ok, int_schema} = AvroEx.decode_schema(~S("int"))
300317

301-
{:ok, index} = @test_module.encode(int_schema, 1)
302318
{:ok, encoded_int} = @test_module.encode(int_schema, 2086)
303319
{:ok, encoded_union} = @test_module.encode(schema, 2086)
304320

305-
assert encoded_union == index <> encoded_int
321+
assert encoded_union == union_index(1) <> encoded_int
306322
end
307323

308324
test "works as expected with int and long" do
309325
{:ok, schema} = AvroEx.decode_schema(~S(["int", "long"]))
310-
{:ok, int_schema} = AvroEx.decode_schema(~S("int"))
311326
{:ok, long_schema} = AvroEx.decode_schema(~S("long"))
312327

313-
{:ok, index} = @test_module.encode(int_schema, 1)
314328
{:ok, encoded_long} = @test_module.encode(long_schema, -3_376_656_585_598_455_353)
315329
{:ok, encoded_union} = @test_module.encode(schema, -3_376_656_585_598_455_353)
316330

317-
assert encoded_union == index <> encoded_long
331+
assert encoded_union == union_index(1) <> encoded_long
318332
end
319333

320334
test "works as expected with float and double" do
321335
{:ok, schema} = AvroEx.decode_schema(~S(["float", "double"]))
322-
{:ok, int_schema} = AvroEx.decode_schema(~S("int"))
323336
{:ok, double_schema} = AvroEx.decode_schema(~S("double"))
324337

325-
{:ok, index} = @test_module.encode(int_schema, 1)
326338
{:ok, encoded_long} = @test_module.encode(double_schema, 0.0000000001)
327339
{:ok, encoded_union} = @test_module.encode(schema, 0.0000000001)
328340

329-
assert encoded_union == index <> encoded_long
341+
assert encoded_union == union_index(1) <> encoded_long
330342
end
331343

332344
test "works as expected with logical types" do
@@ -336,11 +348,23 @@ defmodule AvroEx.Encode.Test do
336348
{:ok, schema} = AvroEx.decode_schema(~s(["null", #{datetime_json}]))
337349
{:ok, datetime_schema} = AvroEx.decode_schema(datetime_json)
338350

339-
{:ok, index} = @test_module.encode(datetime_schema, 1)
340351
{:ok, encoded_datetime} = @test_module.encode(datetime_schema, datetime_value)
341352
{:ok, encoded_union} = @test_module.encode(schema, datetime_value)
342353

343-
assert encoded_union == index <> encoded_datetime
354+
assert encoded_union == union_index(1) <> encoded_datetime
355+
end
356+
357+
# A canonical UUID string given to a ["null", fixed-uuid] union must be encoded
# as branch index 1 followed by the standalone fixed-uuid encoding.
test "works as expected with 16 byte fixed UUID logical types" do
358+
fixed_uuid_json = ~S({"type": "fixed", "size": 16, "name": "fixed_uuid", "logicalType": "uuid"})
359+
uuid_value = "550e8400-e29b-41d4-a716-446655440000"
360+
361+
{:ok, schema} = AvroEx.decode_schema(~s(["null", #{fixed_uuid_json}]))
362+
{:ok, fixed_uuid_schema} = AvroEx.decode_schema(fixed_uuid_json)
363+
364+
{:ok, encoded_uuid} = @test_module.encode(fixed_uuid_schema, uuid_value)
365+
{:ok, encoded_union} = @test_module.encode(schema, uuid_value)
366+
367+
assert encoded_union == union_index(1) <> encoded_uuid
344368
end
345369

346370
test "works as expected with records" do
@@ -358,14 +382,12 @@ defmodule AvroEx.Encode.Test do
358382
json_schema = ~s(["null", #{record_json}])
359383

360384
{:ok, schema} = AvroEx.decode_schema(json_schema)
361-
{:ok, int_schema} = AvroEx.decode_schema(~S("int"))
362385
{:ok, record_schema} = AvroEx.decode_schema(record_json)
363386

364-
{:ok, index} = @test_module.encode(int_schema, 1)
365387
{:ok, encoded_record} = @test_module.encode(record_schema, %{"a" => 25, "b" => "hello"})
366388
{:ok, encoded_union} = @test_module.encode(schema, %{"a" => 25, "b" => "hello"})
367389

368-
assert encoded_union == index <> encoded_record
390+
assert encoded_union == union_index(1) <> encoded_record
369391
end
370392

371393
test "works as expected with union values tagged for a named possibility" do
@@ -384,14 +406,12 @@ defmodule AvroEx.Encode.Test do
384406
json_schema = ~s([#{record_json_factory.("a")}, #{record_json_factory.("b")}])
385407

386408
{:ok, schema} = AvroEx.decode_schema(json_schema)
387-
{:ok, int_schema} = AvroEx.decode_schema(~S("int"))
388409
{:ok, record_schema} = AvroEx.decode_schema(record_json_factory.("b"))
389410

390-
{:ok, index} = @test_module.encode(int_schema, 1)
391411
{:ok, encoded_record} = @test_module.encode(record_schema, %{"value" => "hello"})
392412
{:ok, encoded_union} = @test_module.encode(schema, {"b", %{"value" => "hello"}})
393413

394-
assert encoded_union == index <> encoded_record
414+
assert encoded_union == union_index(1) <> encoded_record
395415
end
396416

397417
test "errors with a clear error for tagged unions" do

0 commit comments

Comments
 (0)