Skip to content

Reading Struct field from an Arrow file with multiple batches leads to incomplete schema #1763

@koperagen

Description

@koperagen

Generate a file with the code below, read it with `val df = DataFrame.readArrowFeather("test_struct.arrow")`, and print the schema.

Actual:

id: Int
person: DataRow<*>

Expected:

id: Int
person:
    name: String
    age: Int

The problem is in `concatKeepingSchema`, which treats a `ColumnGroup` as a regular `DataColumn`. It only affects files that contain multiple batches; a file with a single batch is read correctly.

/**
 * Writes `test_struct.arrow`: an Arrow file with a non-nullable `id` Int32 column and a
 * nullable `person` struct column (`name`: Utf8, `age`: Int32), split across two record
 * batches of 2 and 3 rows.
 *
 * The file is the reproduction input for reading a struct field from a multi-batch file.
 * All closeable resources (allocator, schema root, output stream, writer) are scoped with
 * `use`, so they are released even when writing fails part-way through.
 */
fun main() {
    val personField = Field(
        "person",
        FieldType(true, ArrowType.Struct.INSTANCE, null),
        listOf(
            Field("name", FieldType(false, ArrowType.Utf8.INSTANCE, null), emptyList()),
            Field("age", FieldType(false, ArrowType.Int(32, true), null), emptyList()),
        ),
    )
    val schema = Schema(
        listOf(
            Field("id", FieldType(false, ArrowType.Int(32, true), null), emptyList()),
            personField,
        ),
    )

    // The allocator is the outermost resource: it is closed last, after the root that
    // borrows memory from it has been closed by the inner `use`.
    RootAllocator().use { allocator ->
        VectorSchemaRoot.create(schema, allocator).use { root ->
            val idVec = root.getVector("id") as IntVector
            val structVec = root.getVector("person") as StructVector
            val nameVec = structVec.getChild("name") as VarCharVector
            val ageVec = structVec.getChild("age") as IntVector

            FileOutputStream("test_struct.arrow").use { fos ->
                ArrowFileWriter(root, DictionaryProvider.MapDictionaryProvider(), fos.channel).use { writer ->
                    // Fills the shared vectors with `rowCount` rows and flushes them as one batch.
                    fun emitBatch(rowCount: Int, idOffset: Int, namePrefix: String, ageOffset: Int) {
                        idVec.allocateNew(rowCount)
                        structVec.allocateNew()
                        for (row in 0 until rowCount) {
                            idVec.set(row, idOffset + row)
                            nameVec.setSafe(row, "${namePrefix}_$row".toByteArray())
                            ageVec.setSafe(row, ageOffset + row)
                            // Mark the struct slot itself as non-null; the children alone do not.
                            structVec.setIndexDefined(row)
                        }
                        root.rowCount = rowCount
                        writer.writeBatch()
                    }

                    writer.start()

                    // batch 1: ids 0..1, Alice_0..Alice_1, ages 25..26
                    emitBatch(rowCount = 2, idOffset = 0, namePrefix = "Alice", ageOffset = 25)

                    // batch 2: ids 10..12, Bob_0..Bob_2, ages 30..32
                    // Reset the vectors so the second batch does not reuse buffers from the first.
                    root.clear()
                    emitBatch(rowCount = 3, idOffset = 10, namePrefix = "Bob", ageOffset = 30)

                    writer.end()
                }
            }
        }
    }
}

Metadata

Metadata

Assignees

Labels

bug: Something isn't working

Type

No type

Projects

No projects

Milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions