Skip to content

Commit 249d25d

Browse files
committed
Parquet: Fix UUID ClassCastException
java.util.UUID cannot be cast to class java.nio.ByteBuffer
1 parent 389c04c commit 249d25d

File tree

3 files changed: 58 additions (+58) and 4 deletions (-4).

data/src/test/java/org/apache/iceberg/data/TestMetricsRowGroupFilter.java

Lines changed: 27 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -118,7 +118,8 @@ public static List<Object> parameters() {
118118
optional(14, "all_nans", DoubleType.get()),
119119
optional(15, "some_nans", FloatType.get()),
120120
optional(16, "no_nans", DoubleType.get()),
121-
optional(17, "some_double_nans", DoubleType.get()));
121+
optional(17, "some_double_nans", DoubleType.get()),
122+
optional(18, "uuid_col", Types.UUIDType.get()));
122123

123124
private static final Types.StructType UNDERSCORE_STRUCT_FIELD_TYPE =
124125
Types.StructType.of(Types.NestedField.required(8, "_int_field", IntegerType.get()));
@@ -136,7 +137,8 @@ public static List<Object> parameters() {
136137
optional(14, "_all_nans", Types.DoubleType.get()),
137138
optional(15, "_some_nans", FloatType.get()),
138139
optional(16, "_no_nans", Types.DoubleType.get()),
139-
optional(17, "_some_double_nans", Types.DoubleType.get()));
140+
optional(17, "_some_double_nans", Types.DoubleType.get()),
141+
optional(18, "_uuid_col", Types.UUIDType.get()));
140142

141143
private static final String TOO_LONG_FOR_STATS_PARQUET;
142144

@@ -151,6 +153,11 @@ public static List<Object> parameters() {
151153
private static final int INT_MIN_VALUE = 30;
152154
private static final int INT_MAX_VALUE = 79;
153155

156+
private static final UUID UUID_WITH_ZEROS =
157+
UUID.fromString("00000000-0000-0000-0000-000000000000");
158+
private static final UUID UUID_WITH_ONES =
159+
UUID.fromString("11111111-1111-1111-1111-111111111111");
160+
154161
private File orcFile = null;
155162
private MessageType parquetSchema = null;
156163
private BlockMetaData rowGroupMetadata = null;
@@ -205,6 +212,9 @@ public void createOrcInputFile() throws IOException {
205212
structNotNull.setField("_int_field", INT_MIN_VALUE + i);
206213
record.setField("_struct_not_null", structNotNull); // struct with int
207214

215+
record.setField(
216+
"_uuid_col", (i % 3 == 0) ? UUID_WITH_ZEROS : (i % 3 == 1) ? UUID_WITH_ONES : null);
217+
208218
appender.add(record);
209219
}
210220
}
@@ -250,6 +260,9 @@ private void createParquetInputFile() throws IOException {
250260
structNotNull.put("_int_field", INT_MIN_VALUE + i);
251261
builder.set("_struct_not_null", structNotNull); // struct with int
252262

263+
builder.set(
264+
"_uuid_col", (i % 3 == 0) ? UUID_WITH_ZEROS : (i % 3 == 1) ? UUID_WITH_ONES : null);
265+
253266
appender.add(builder.build());
254267
}
255268
}
@@ -988,6 +1001,18 @@ public void testTransformFilter() {
9881001
.isTrue();
9891002
}
9901003

1004+
@TestTemplate
1005+
public void testUUIDEq() {
1006+
assumeThat(format).as("Only valid for Parquet").isEqualTo(FileFormat.PARQUET);
1007+
1008+
boolean shouldRead = shouldRead(equal("uuid_col", UUID_WITH_ZEROS));
1009+
assertThat(shouldRead).as("Should read: UUID value exists in row group").isTrue();
1010+
1011+
UUID nonExistentUuid = UUID.fromString("99999999-9999-9999-9999-999999999999");
1012+
boolean shouldSkip = shouldRead(equal("uuid_col", nonExistentUuid));
1013+
assertThat(shouldSkip).as("Should skip: UUID value does not exist in row group").isFalse();
1014+
}
1015+
9911016
private boolean shouldRead(Expression expression) {
9921017
return shouldRead(expression, true);
9931018
}

parquet/src/main/java/org/apache/iceberg/parquet/ParquetConversions.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,8 @@ static Function<Object, Object> converterFromParquet(
8383
} else if (icebergType.typeId() == Type.TypeID.DOUBLE
8484
&& parquetType.getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.FLOAT) {
8585
return value -> ((Float) fromParquet.apply(value)).doubleValue();
86+
} else if (icebergType.typeId() == Type.TypeID.UUID) {
87+
return binary -> UUIDUtil.convert(((Binary) binary).toByteBuffer());
8688
}
8789
}
8890

parquet/src/test/java/org/apache/iceberg/parquet/TestDictionaryRowGroupFilter.java

Lines changed: 29 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -111,7 +111,8 @@ public class TestDictionaryRowGroupFilter {
111111
14,
112112
"decimal_fixed",
113113
DecimalType.of(20, 10)), // >18 precision to enforce FIXED_LEN_BYTE_ARRAY
114-
optional(15, "_nans_and_nulls", DoubleType.get()));
114+
optional(15, "_nans_and_nulls", DoubleType.get()),
115+
optional(16, "uuid_col", Types.UUIDType.get()));
115116

116117
private static final Types.StructType UNDERSCORE_STRUCT_FIELD_TYPE =
117118
Types.StructType.of(Types.NestedField.required(9, "_int_field", IntegerType.get()));
@@ -133,7 +134,8 @@ public class TestDictionaryRowGroupFilter {
133134
14,
134135
"_decimal_fixed",
135136
DecimalType.of(20, 10)), // >18 precision to enforce FIXED_LEN_BYTE_ARRAY
136-
optional(15, "_nans_and_nulls", DoubleType.get()));
137+
optional(15, "_nans_and_nulls", DoubleType.get()),
138+
optional(16, "_uuid_col", Types.UUIDType.get()));
137139

138140
private static final String TOO_LONG_FOR_STATS;
139141

@@ -153,6 +155,11 @@ public class TestDictionaryRowGroupFilter {
153155
.subtract(DECIMAL_MIN_VALUE)
154156
.divide(new BigDecimal(INT_MAX_VALUE - INT_MIN_VALUE), RoundingMode.HALF_UP);
155157

158+
private static final UUID UUID_WITH_ZEROS =
159+
UUID.fromString("00000000-0000-0000-0000-000000000000");
160+
private static final UUID UUID_WITH_ONES =
161+
UUID.fromString("11111111-1111-1111-1111-111111111111");
162+
156163
private MessageType parquetSchema = null;
157164
private BlockMetaData rowGroupMetadata = null;
158165
private DictionaryPageReadStore dictionaryStore = null;
@@ -203,6 +210,9 @@ public void createInputFile() throws IOException {
203210
structNotNull.put("_int_field", INT_MIN_VALUE + i);
204211
builder.set("_struct_not_null", structNotNull); // struct with int
205212

213+
builder.set(
214+
"_uuid_col", (i % 3 == 0) ? UUID_WITH_ZEROS : (i % 3 == 1) ? UUID_WITH_ONES : null);
215+
206216
appender.add(builder.build());
207217
}
208218
}
@@ -1267,6 +1277,23 @@ public void testTransformFilter() {
12671277
.isTrue();
12681278
}
12691279

1280+
@TestTemplate
1281+
public void testUUIDDictionaryFilter() {
1282+
assumeThat(getColumnForName(rowGroupMetadata, "_uuid_col").getEncodings())
1283+
.contains(Encoding.RLE_DICTIONARY);
1284+
1285+
boolean shouldReadExisting =
1286+
new ParquetDictionaryRowGroupFilter(SCHEMA, equal("uuid_col", UUID_WITH_ZEROS))
1287+
.shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
1288+
assertThat(shouldReadExisting).as("Should read: Dictionary contains a matching entry").isTrue();
1289+
1290+
UUID nonExistentUUID = UUID.fromString("22222222-2222-2222-2222-222222222222");
1291+
boolean shouldRead =
1292+
new ParquetDictionaryRowGroupFilter(SCHEMA, equal("uuid_col", nonExistentUUID))
1293+
.shouldRead(parquetSchema, rowGroupMetadata, dictionaryStore);
1294+
assertThat(shouldRead).as("Should skip: UUID not found in dictionary").isFalse();
1295+
}
1296+
12701297
private ColumnChunkMetaData getColumnForName(BlockMetaData rowGroup, String columnName) {
12711298
ColumnPath columnPath = ColumnPath.fromDotString(columnName);
12721299
for (ColumnChunkMetaData column : rowGroup.getColumns()) {

0 commit comments

Comments
 (0)