Skip to content

Commit 3c77bba

Browse files
committed
Parquet: Fix UUID ClassCastException
java.util.UUID cannot be cast to class java.nio.ByteBuffer
1 parent 1ed0a75 commit 3c77bba

3 files changed

Lines changed: 59 additions & 4 deletions

File tree

data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,8 @@ public static List<Object> parameters() {
119119
optional(14, "all_nans", DoubleType.get()),
120120
optional(15, "some_nans", FloatType.get()),
121121
optional(16, "no_nans", DoubleType.get()),
122-
optional(17, "some_double_nans", DoubleType.get()));
122+
optional(17, "some_double_nans", DoubleType.get()),
123+
optional(18, "uuid_col", Types.UUIDType.get()));
123124

124125
private static final Types.StructType UNDERSCORE_STRUCT_FIELD_TYPE =
125126
Types.StructType.of(Types.NestedField.required(8, "_int_field", IntegerType.get()));
@@ -137,7 +138,8 @@ public static List<Object> parameters() {
137138
optional(14, "_all_nans", Types.DoubleType.get()),
138139
optional(15, "_some_nans", FloatType.get()),
139140
optional(16, "_no_nans", Types.DoubleType.get()),
140-
optional(17, "_some_double_nans", Types.DoubleType.get()));
141+
optional(17, "_some_double_nans", Types.DoubleType.get()),
142+
optional(18, "_uuid_col", Types.UUIDType.get()));
141143

142144
private static final Schema VARIANT_SCHEMA =
143145
new Schema(
@@ -157,6 +159,11 @@ public static List<Object> parameters() {
157159
private static final int INT_MIN_VALUE = 30;
158160
private static final int INT_MAX_VALUE = 79;
159161

162+
private static final UUID UUID_WITH_ZEROS =
163+
UUID.fromString("00000000-0000-0000-0000-000000000000");
164+
private static final UUID UUID_WITH_ONES =
165+
UUID.fromString("11111111-1111-1111-1111-111111111111");
166+
160167
private File orcFile = null;
161168
private MessageType parquetSchema = null;
162169
private BlockMetaData rowGroupMetadata = null;
@@ -211,6 +218,9 @@ public void createOrcInputFile() throws IOException {
211218
structNotNull.setField("_int_field", INT_MIN_VALUE + i);
212219
record.setField("_struct_not_null", structNotNull); // struct with int
213220

221+
record.setField(
222+
"_uuid_col", (i % 3 == 0) ? UUID_WITH_ZEROS : (i % 3 == 1) ? UUID_WITH_ONES : null);
223+
214224
appender.add(record);
215225
}
216226
}
@@ -248,6 +258,10 @@ private void createParquetInputFile() throws IOException {
248258
GenericRecord structNotNull = GenericRecord.create(UNDERSCORE_STRUCT_FIELD_TYPE);
249259
structNotNull.setField("_int_field", INT_MIN_VALUE + i);
250260
builder.setField("_struct_not_null", structNotNull); // struct with int
261+
262+
builder.setField(
263+
"_uuid_col", (i % 3 == 0) ? UUID_WITH_ZEROS : (i % 3 == 1) ? UUID_WITH_ONES : null);
264+
251265
records.add(builder);
252266
}
253267

@@ -1063,6 +1077,18 @@ public void testVariantFieldAllNullsNotNull() throws IOException {
10631077
}
10641078
}
10651079

1080+
/**
 * Verifies row-group filtering for an equality predicate on the UUID column.
 *
 * <p>The test only proceeds when {@code format} equals {@link FileFormat#PARQUET}. It expects a
 * read for {@code UUID_WITH_ZEROS} and a skip for a UUID that differs from both UUID constants
 * used by this test class.
 */
@TestTemplate
1081+
public void testUUIDEq() {
1082+
// Guard: the assertions below are only meaningful for the Parquet format.
assumeThat(format).as("Only valid for Parquet").isEqualTo(FileFormat.PARQUET);
1083+
1084+
// Positive case: an equality predicate on UUID_WITH_ZEROS must keep the row group.
boolean shouldRead = shouldRead(equal("uuid_col", UUID_WITH_ZEROS));
1085+
assertThat(shouldRead).as("Should read: UUID value exists in row group").isTrue();
1086+
1087+
// Negative case: this UUID is neither UUID_WITH_ZEROS nor UUID_WITH_ONES.
UUID nonExistentUuid = UUID.fromString("99999999-9999-9999-9999-999999999999");
1088+
// NOTE(review): despite its name, shouldSkip stores the result of shouldRead(...);
// a value of false is what signals that the row group is skipped.
boolean shouldSkip = shouldRead(equal("uuid_col", nonExistentUuid));
1089+
assertThat(shouldSkip).as("Should skip: UUID value does not exist in row group").isFalse();
1090+
}
1091+
10661092
/**
 * Convenience overload that delegates to {@code shouldRead(expression, true)}.
 *
 * @param expression the filter expression to evaluate
 * @return whatever the two-argument overload returns for {@code expression}
 */
private boolean shouldRead(Expression expression) {
10671093
// NOTE(review): the meaning of the `true` flag is defined by the two-argument overload,
// which is not shown here — presumably case sensitivity; confirm.
return shouldRead(expression, true);
10681094
}

parquet/src/main/java/org/apache/iceberg/parquet/ParquetConversions.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ static Function<Object, Object> converterFromParquet(
8383
} else if (icebergType.typeId() == Type.TypeID.DOUBLE
8484
&& parquetType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.FLOAT) {
8585
return value -> ((Float) fromParquet.apply(value)).doubleValue();
86+
} else if (icebergType.typeId() == Type.TypeID.UUID) {
87+
return binary -> UUIDUtil.convert(((Binary) binary).toByteBuffer());
8688
}
8789
}
8890

parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java

Lines changed: 29 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -111,7 +111,8 @@ public class TestDictionaryRowGroupFilter {
111111
14,
112112
"decimal_fixed",
113113
DecimalType.of(20, 10)), // >18 precision to enforce FIXED_LEN_BYTE_ARRAY
114-
optional(15, "_nans_and_nulls", DoubleType.get()));
114+
optional(15, "_nans_and_nulls", DoubleType.get()),
115+
optional(16, "uuid_col", Types.UUIDType.get()));
115116

116117
private static final Types.StructType UNDERSCORE_STRUCT_FIELD_TYPE =
117118
Types.StructType.of(Types.NestedField.required(9, "_int_field", IntegerType.get()));
@@ -133,7 +134,8 @@ public class TestDictionaryRowGroupFilter {
133134
14,
134135
"_decimal_fixed",
135136
DecimalType.of(20, 10)), // >18 precision to enforce FIXED_LEN_BYTE_ARRAY
136-
optional(15, "_nans_and_nulls", DoubleType.get()));
137+
optional(15, "_nans_and_nulls", DoubleType.get()),
138+
optional(16, "_uuid_col", Types.UUIDType.get()));
137139

138140
private static final String TOO_LONG_FOR_STATS;
139141

@@ -153,6 +155,11 @@ public class TestDictionaryRowGroupFilter {
153155
.subtract(DECIMAL_MIN_VALUE)
154156
.divide(new BigDecimal(INT_MAX_VALUE - INT_MIN_VALUE), RoundingMode.HALF_UP);
155157

158+
private static final UUID UUID_WITH_ZEROS =
159+
UUID.fromString("00000000-0000-0000-0000-000000000000");
160+
private static final UUID UUID_WITH_ONES =
161+
UUID.fromString("11111111-1111-1111-1111-111111111111");
162+
156163
private MessageType parquetSchema = null;
157164
private BlockMetaData rowGroupMetadata = null;
158165
private DictionaryPageReadStore dictionaryStore = null;
@@ -203,6 +210,9 @@ public void createInputFile() throws IOException {
203210
structNotNull.put("_int_field", INT_MIN_VALUE + i);
204211
builder.set("_struct_not_null", structNotNull); // struct with int
205212

213+
builder.set(
214+
"_uuid_col", (i % 3 == 0) ? UUID_WITH_ZEROS : (i % 3 == 1) ? UUID_WITH_ONES : null);
215+
206216
appender.add(builder.build());
207217
}
208218
}
@@ -1267,6 +1277,23 @@ public void testTransformFilter() {
12671277
.isTrue();
12681278
}
12691279

1280+
/**
 * Verifies that {@code ParquetDictionaryRowGroupFilter} handles equality predicates on the UUID
 * column.
 *
 * <p>The test only proceeds when the {@code _uuid_col} column chunk reports {@code
 * RLE_DICTIONARY} among its encodings. It expects a read for {@code UUID_WITH_ZEROS} and a skip
 * for a UUID that differs from both UUID constants used by this test class.
 */
@TestTemplate
1281+
public void testUUIDDictionaryFilter() {
1282+
// Guard: only meaningful when the UUID column chunk was dictionary-encoded.
assumeThat(getColumnForName(rowGroupMetadata, "_uuid_col").getEncodings())
1283+
.contains(Encoding.RLE_DICTIONARY);
1284+
1285+
// Positive case. The predicate refers to "uuid_col" while the column chunk is looked up as
// "_uuid_col" — presumably the two names share a field id across the two schemas; confirm.
boolean shouldReadExisting =
1286+
new ParquetDictionaryRowGroupFilter(SCHEMA, equal("uuid_col", UUID_WITH_ZEROS))
1287+
.shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
1288+
assertThat(shouldReadExisting).as("Should read: Dictionary contains a matching entry").isTrue();
1289+
1290+
// Negative case: this UUID is neither UUID_WITH_ZEROS nor UUID_WITH_ONES.
UUID nonExistentUUID = UUID.fromString("22222222-2222-2222-2222-222222222222");
1291+
boolean shouldRead =
1292+
new ParquetDictionaryRowGroupFilter(SCHEMA, equal("uuid_col", nonExistentUUID))
1293+
.shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
1294+
assertThat(shouldRead).as("Should skip: UUID not found in dictionary").isFalse();
1295+
}
1296+
12701297
private ColumnChunkMetaData getColumnForName(BlockMetaData rowGroup, String columnName) {
12711298
ColumnPath columnPath = ColumnPath.fromDotString(columnName);
12721299
for (ColumnChunkMetaData column : rowGroup.getColumns()) {

0 commit comments

Comments
 (0)