-
Notifications
You must be signed in to change notification settings - Fork 82
Closed
Description
Generate a file with the code below, read it with `val df = DataFrame.readArrowFeather("test_struct.arrow")`, and print the schema.
Actual:
id: Int
person: DataRow<*>
Expected:
id: Int
person:
    name: String
    age: Int
The problem is in `concatKeepingSchema`, which treats a ColumnGroup as a regular DataColumn. It only applies to files with multiple batches, which not every file has.
/**
 * Reproducer: writes `test_struct.arrow`, an Arrow file containing two record batches.
 *
 * Schema:
 * - `id`: non-nullable signed 32-bit int
 * - `person`: nullable struct with non-nullable children `name` (Utf8) and `age` (signed 32-bit int)
 *
 * Batch 1 holds 2 rows (ids 0..1, names `Alice_<i>`, ages 25..26); batch 2 holds 3 rows
 * (ids 10..12, names `Bob_<i>`, ages 30..32).
 */
fun main() {
    val personField = Field(
        "person",
        FieldType(true, ArrowType.Struct.INSTANCE, null),
        listOf(
            Field("name", FieldType(false, ArrowType.Utf8.INSTANCE, null), emptyList()),
            Field("age", FieldType(false, ArrowType.Int(32, true), null), emptyList()),
        ),
    )
    val schema = Schema(
        listOf(
            Field("id", FieldType(false, ArrowType.Int(32, true), null), emptyList()),
            personField,
        ),
    )

    // `use` closes the allocator even when writing fails; a trailing close() call
    // would be skipped as soon as anything below throws.
    RootAllocator().use { allocator ->
        VectorSchemaRoot.create(schema, allocator).use { root ->
            val idVec = root.getVector("id") as IntVector
            val structVec = root.getVector("person") as StructVector
            val nameVec = structVec.getChild("name") as VarCharVector
            val ageVec = structVec.getChild("age") as IntVector

            // Fills the root with `rows` rows: id = idBase + i, name = "<namePrefix>_<i>", age = ageBase + i.
            fun fillBatch(rows: Int, idBase: Int, namePrefix: String, ageBase: Int) {
                idVec.allocateNew(rows)
                structVec.allocateNew()
                for (i in 0 until rows) {
                    idVec.set(i, idBase + i)
                    nameVec.setSafe(i, "${namePrefix}_$i".toByteArray())
                    ageVec.setSafe(i, ageBase + i)
                    structVec.setIndexDefined(i)
                }
                root.rowCount = rows
            }

            FileOutputStream("test_struct.arrow").use { fos ->
                ArrowFileWriter(root, DictionaryProvider.MapDictionaryProvider(), fos.channel).use { writer ->
                    writer.start()

                    // batch 1
                    fillBatch(rows = 2, idBase = 0, namePrefix = "Alice", ageBase = 25)
                    writer.writeBatch()

                    // batch 2: reset the root's vectors before refilling them
                    root.clear()
                    fillBatch(rows = 3, idBase = 10, namePrefix = "Bob", ageBase = 30)
                    writer.writeBatch()

                    writer.end()
                }
            }
        }
    }
}
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
bug: Something isn't working