From cddad9cd8e5b6568ee35397113b8938e20409acf Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Wed, 13 May 2026 17:28:01 +0200 Subject: [PATCH 01/14] PoC for DataFrameReadSource --- core/build.gradle.kts | 6 +- .../jetbrains/kotlinx/dataframe/io/guess2.kt | 146 ++++++++++++++++++ .../jetbrains/kotlinx/dataframe/io/Guess2.kt | 58 +++++++ .../jetbrains/kotlinx/dataframe/io/json.kt | 98 ++++++++++++ ...s.kotlinx.dataframe.io.DataFrameReadSource | 1 + 5 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt create mode 100644 core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt create mode 100644 dataframe-json/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource diff --git a/core/build.gradle.kts b/core/build.gradle.kts index 60e39459a5..3a4e0cf0f1 100644 --- a/core/build.gradle.kts +++ b/core/build.gradle.kts @@ -76,7 +76,6 @@ dependencies { testImplementation(libs.kotlin.scriptingJvm) testImplementation(libs.jsoup) testImplementation(libs.sl4jsimple) - testImplementation(projects.dataframeJson) testImplementation(libs.serialization.core) testImplementation(libs.serialization.json) @@ -85,6 +84,11 @@ dependencies { // for samples.api testImplementation(projects.dataframeCsv) + testImplementation(projects.dataframeJson) + testImplementation(projects.dataframeArrow) + testImplementation(projects.dataframeExcel) +// testImplementation(projects.dataframeGeo) + testImplementation(projects.dataframeJdbc) } // Configure testJava16 dependencies to extend from test diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt new file mode 100644 index 0000000000..1a02e66cf2 --- /dev/null +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt @@ -0,0 +1,146 @@ +package org.jetbrains.kotlinx.dataframe.io + +import 
org.jetbrains.kotlinx.dataframe.AnyFrame +import org.jetbrains.kotlinx.dataframe.DataFrame +import java.io.File +import java.io.FileNotFoundException +import java.net.URL +import java.nio.file.Path +import java.util.ServiceLoader +import kotlin.io.extension +import kotlin.io.path.extension +import kotlin.reflect.KType +import kotlin.reflect.full.withNullability +import kotlin.reflect.typeOf + +public interface DataFrameReadOptions + +public interface DataFrameReadSource { + public fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions? = null, + ): DataFrame<*>? + + public fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean + + // `DataFrame.Companion.read` methods uses this to sort list of all supported formats in ascending order (-1, 2, 10) + // sorted list is used to test if any format can read given input + public val testOrder: Int +} + +public data class DataSourceInfo( + public val type: DataSourceType, + public val extension: String? = null, + // TODO, Apache Tika? + public val mimeType: String? = null, +) + +public sealed class DataSourceType(public open val kType: KType) { + /** Like a path, file, or URL. */ + public data class Reference(override val kType: KType) : DataSourceType(kType) + + /** Actual data, like a String, ByteArray, InputStream */ + public data class InMemory(override val kType: KType) : DataSourceType(kType) + + public companion object { + public inline fun reference(): Reference = Reference(kType = typeOf()) + + public inline fun inMemory(): InMemory = InMemory(kType = typeOf()) + } +} + +/** + * NOTE: Needs to have fully qualified name in + * resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.NewSupportedDataFrameFormat + * to be detected here. 
+ */ +internal val newSupportedFormats: List by lazy { + ServiceLoader.load(DataFrameReadSource::class.java) + .toList() + .distinct() + .sortedBy { it.testOrder } +} + +internal fun readDataFrameImpl( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions? = null, + formats: List = newSupportedFormats, +): AnyFrame { + val tries = mutableMapOf() + formats.sortedBy { it.testOrder }.forEach { + if (!it.acceptsSource(sourceInfo, options)) return@forEach + try { + val df = it.readDataFrameOrNull(source, sourceInfo, options) + if (df != null) return df + } catch (e: FileNotFoundException) { + throw e + } catch (e: Exception) { + tries[it::class.simpleName!!] = e + } + } + throw IllegalArgumentException("Unknown DataFrame source $source, $sourceInfo; Tried $tries") +} + +public fun DataFrame.Companion.readReference( + reference: Any, + type: KType, + options: DataFrameReadOptions? = null, +): AnyFrame = + readDataFrameImpl( + source = reference, + sourceInfo = DataSourceInfo( + type = DataSourceType.Reference(type.withNullability(false)), + extension = reference.extensionOrNull(), + mimeType = null, // TODO, Apache Tika? + ), + options = options, + ) + +public inline fun DataFrame.Companion.readReference( + reference: R, + options: DataFrameReadOptions? = null, +): AnyFrame = + readReference( + reference = reference, + type = typeOf(), + options = options, + ) + +internal fun Any.extensionOrNull(): String? = + when (this) { + is Path -> extension + + is File -> extension + + is URL -> path.takeIf { it.isNotBlank() }?.substringAfterLast('.') + + is String -> try { + asUrl(this).extensionOrNull() + } catch (_: Exception) { + null + } + + else -> null + } + +public fun DataFrame.Companion.readFromData(data: Any, type: KType, options: DataFrameReadOptions? = null): AnyFrame = + readDataFrameImpl( + source = data, + sourceInfo = DataSourceInfo( + type = DataSourceType.InMemory(type.withNullability(false)), + mimeType = null, // TODO, Apache Tika? 
+ ), + options = options, + ) + +public inline fun DataFrame.Companion.readFromData( + data: R, + options: DataFrameReadOptions? = null, +): AnyFrame = + readFromData( + data = data, + type = typeOf(), + options = options, + ) diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt new file mode 100644 index 0000000000..3691dcbac8 --- /dev/null +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt @@ -0,0 +1,58 @@ +package org.jetbrains.kotlinx.dataframe.io + +import io.kotest.matchers.shouldBe +import kotlinx.serialization.json.Json +import kotlinx.serialization.json.JsonElement +import org.jetbrains.kotlinx.dataframe.DataFrame +import org.junit.Test +import java.io.File +import java.net.URI +import java.net.URL +import kotlin.io.path.Path +import kotlin.io.path.absolute + +class Guess2 { + + @Test + fun `read JSON reference`() { + val expected = DataFrame.readJson("../data/participants.json") + + DataFrame.readReference("../data/participants.json") shouldBe expected + DataFrame.readReference(Path("../data/participants.json")) shouldBe expected + DataFrame.readReference(File("../data/participants.json")) shouldBe expected + DataFrame.readReference( + Path("../data/participants.json").absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = org.jetbrains.kotlinx.dataframe.io.Json.Options( + typeClashTactic = JSON.TypeClashTactic.ANY_COLUMNS, + ) + + DataFrame.readReference("../data/participants.json", options) shouldBe expected + DataFrame.readReference(Path("../data/participants.json"), options) shouldBe expected + DataFrame.readReference(File("../data/participants.json"), options) shouldBe expected + DataFrame.readReference( + Path("../data/participants.json").absolute().normalize().toUri().toURL(), + options, + ) shouldBe expected + } + + @Test + fun `read JSON in memory`() { + val expected = 
DataFrame.readJson("../data/participants.json") + + val file = File("../data/participants.json") + + DataFrame.readFromData(file.readText()) shouldBe expected + DataFrame.readFromData(file.inputStream()) shouldBe expected + DataFrame.readFromData(Json.decodeFromString(file.readText())) shouldBe expected + + val options = org.jetbrains.kotlinx.dataframe.io.Json.Options( + typeClashTactic = JSON.TypeClashTactic.ANY_COLUMNS, + ) + + DataFrame.readFromData(file.readText(), options) shouldBe expected + DataFrame.readFromData(file.inputStream(), options) shouldBe expected + DataFrame.readFromData(Json.decodeFromString(file.readText()), options) shouldBe expected + } +} diff --git a/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt b/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt index 4949d1f104..9786d4f8f2 100644 --- a/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt +++ b/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt @@ -23,6 +23,8 @@ import org.jetbrains.kotlinx.dataframe.impl.io.encodeDataFrameWithMetadata import org.jetbrains.kotlinx.dataframe.impl.io.encodeFrame import org.jetbrains.kotlinx.dataframe.impl.io.encodeRow import org.jetbrains.kotlinx.dataframe.impl.io.readJsonImpl +import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions.Companion.GZIP_ON +import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions.Companion.LIMIT_SIZE_ON import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ARRAY_AND_VALUE_COLUMNS import java.io.File @@ -30,8 +32,104 @@ import java.io.InputStream import java.net.URL import java.nio.file.Path import kotlin.io.path.writeText +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf import kotlin.reflect.typeOf +public class Json : DataFrameReadSource { + + public data class Options( + val header: List = emptyList(), + val 
typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS, + val keyValuePaths: List = emptyList(), + val unifyNumbers: Boolean = true, + ) : DataFrameReadOptions + + public companion object { + public val supportedReferenceTypes: Set = + setOf(typeOf(), typeOf(), typeOf(), typeOf()) + public val supportedInMemoryTypes: Set = + setOf(typeOf(), typeOf(), typeOf()) + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is Options) return false + if (sourceInfo.extension?.lowercase()?.equals("json") == false) return false + if (sourceInfo.mimeType?.lowercase()?.equals("application/json") == false) return false + + val kType = sourceInfo.type.kType + return when (sourceInfo.type) { + is DataSourceType.Reference -> + supportedReferenceTypes.any { kType.isSubtypeOf(it) } + + is DataSourceType.InMemory -> + supportedInMemoryTypes.any { kType.isSubtypeOf(it) } + } + } + + @OptIn(ExperimentalSerializationApi::class) + override fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrame<*>? { + val options = (options ?: Options()) as Options + val kType = sourceInfo.type.kType + return when (sourceInfo.type) { + is DataSourceType.Reference -> { + val url = when { + kType.isSubTypeOf() -> (source as? String)?.let(::asUrl) + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() + else -> null + } ?: return null + + DataFrame.readJson( + url = url, + header = options.header, + typeClashTactic = options.typeClashTactic, + keyValuePaths = options.keyValuePaths, + unifyNumbers = options.unifyNumbers, + ) + } + + is DataSourceType.InMemory -> { + val element = when { + kType.isSubTypeOf() -> + (source as? InputStream)?.let { + Json.decodeFromStream(it) + } + + kType.isSubTypeOf() -> + (source as? 
String)?.let { + Json.decodeFromString(it) + } + + kType.isSubTypeOf() -> + source as? JsonElement + + else -> null + } ?: return null + + readJsonImpl( + parsed = element, + header = options.header, + typeClashTactic = options.typeClashTactic, + keyValuePaths = options.keyValuePaths, + unifyNumbers = options.unifyNumbers, + ) + } + } + } + + override val testOrder: Int = 10_000 + + override fun toString(): String = "Json" +} + +private inline fun KType.isSubTypeOf(): Boolean = this.isSubtypeOf(typeOf()) + public class JSON( private val typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS, private val keyValuePaths: List = emptyList(), diff --git a/dataframe-json/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource b/dataframe-json/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource new file mode 100644 index 0000000000..bb9b992aea --- /dev/null +++ b/dataframe-json/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource @@ -0,0 +1 @@ +org.jetbrains.kotlinx.dataframe.io.Json From f1697689bb0fe42f719a9a2639ebd1a44202e5ef Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Thu, 14 May 2026 14:16:50 +0200 Subject: [PATCH 02/14] PoC for DataFrameReadSource with csv, tsv and excel support --- .../jetbrains/kotlinx/dataframe/io/guess2.kt | 17 +- .../jetbrains/kotlinx/dataframe/io/Guess2.kt | 149 +++++++++++++++++- .../org/jetbrains/kotlinx/dataframe/io/csv.kt | 140 ++++++++++++++++ .../org/jetbrains/kotlinx/dataframe/io/tsv.kt | 140 ++++++++++++++++ ...s.kotlinx.dataframe.io.DataFrameReadSource | 2 + .../jetbrains/kotlinx/dataframe/io/xlsx.kt | 135 ++++++++++++++++ ...s.kotlinx.dataframe.io.DataFrameReadSource | 1 + .../jetbrains/kotlinx/dataframe/io/json.kt | 1 + 8 files changed, 582 insertions(+), 3 deletions(-) create mode 100644 dataframe-csv/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource create mode 
100644 dataframe-excel/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt index 1a02e66cf2..5192ae5017 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt @@ -2,8 +2,10 @@ package org.jetbrains.kotlinx.dataframe.io import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.DataFrame +import java.io.ByteArrayInputStream import java.io.File import java.io.FileNotFoundException +import java.io.InputStream import java.net.URL import java.nio.file.Path import java.util.ServiceLoader @@ -68,11 +70,24 @@ internal fun readDataFrameImpl( options: DataFrameReadOptions? = null, formats: List = newSupportedFormats, ): AnyFrame { + // Some sources can only be read once, like InputStreams, so we need to buffer them + var bufferedSource: Any? 
= null + + fun getSource(): Any = + when (source) { + is InputStream -> { + if (bufferedSource == null) bufferedSource = source.readBytes() + ByteArrayInputStream(bufferedSource as ByteArray) + } + + else -> source + } + val tries = mutableMapOf() formats.sortedBy { it.testOrder }.forEach { if (!it.acceptsSource(sourceInfo, options)) return@forEach try { - val df = it.readDataFrameOrNull(source, sourceInfo, options) + val df = it.readDataFrameOrNull(getSource(), sourceInfo, options) if (df != null) return df } catch (e: FileNotFoundException) { throw e diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt index 3691dcbac8..f9bef49023 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt @@ -3,11 +3,10 @@ package org.jetbrains.kotlinx.dataframe.io import io.kotest.matchers.shouldBe import kotlinx.serialization.json.Json import kotlinx.serialization.json.JsonElement +import org.apache.poi.ss.usermodel.WorkbookFactory import org.jetbrains.kotlinx.dataframe.DataFrame import org.junit.Test import java.io.File -import java.net.URI -import java.net.URL import kotlin.io.path.Path import kotlin.io.path.absolute @@ -55,4 +54,150 @@ class Guess2 { DataFrame.readFromData(file.inputStream(), options) shouldBe expected DataFrame.readFromData(Json.decodeFromString(file.readText()), options) shouldBe expected } + + @Test + fun `read CSV reference`() { + val csvPath = "../data/movies.csv" + val expected = DataFrame.readCsv(csvPath) + + DataFrame.readReference(csvPath) shouldBe expected + DataFrame.readReference(Path(csvPath)) shouldBe expected + DataFrame.readReference(File(csvPath)) shouldBe expected + DataFrame.readReference( + Path(csvPath).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = Csv.Options(delimiter = ',') + + DataFrame.readReference(csvPath, options) 
shouldBe expected + DataFrame.readReference(Path(csvPath), options) shouldBe expected + DataFrame.readReference(File(csvPath), options) shouldBe expected + DataFrame.readReference( + Path(csvPath).absolute().normalize().toUri().toURL(), + options, + ) shouldBe expected + } + + @Test + fun `read CSV in memory`() { + val file = File("../data/movies.csv") + val expected = DataFrame.readCsv(file) + + // String content has no extension hint, so we pin the format via options. + val options = Csv.Options(delimiter = ',') + + DataFrame.readFromData(file.readText(), options) shouldBe expected + DataFrame.readFromData(file.inputStream(), options) shouldBe expected + } + + @Test + fun `read TSV reference`() { + val tsvFile = File("src/test/resources/abc.tsv") + val expected = DataFrame.readTsv(tsvFile) + + DataFrame.readReference(tsvFile.path) shouldBe expected + DataFrame.readReference(Path(tsvFile.path)) shouldBe expected + DataFrame.readReference(tsvFile) shouldBe expected + DataFrame.readReference( + Path(tsvFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = Tsv.Options(delimiter = '\t') + + DataFrame.readReference(tsvFile.path, options) shouldBe expected + DataFrame.readReference(Path(tsvFile.path), options) shouldBe expected + DataFrame.readReference(tsvFile, options) shouldBe expected + DataFrame.readReference( + Path(tsvFile.path).absolute().normalize().toUri().toURL(), + options, + ) shouldBe expected + } + + @Test + fun `read TSV in memory`() { + val tsvFile = File("src/test/resources/abc.tsv") + val expected = DataFrame.readTsv(tsvFile) + val options = Tsv.Options(delimiter = '\t') + + // Binary/text without extension — options pin Tsv over Csv/Json/Xlsx. 
+ DataFrame.readFromData(tsvFile.readText(), options) shouldBe expected + DataFrame.readFromData(tsvFile.inputStream(), options) shouldBe expected + } + + @Test + fun `read XLSX reference`() { + val xlsxFile = File("src/test/resources/sample2.xlsx") + val expected = DataFrame.readExcel(xlsxFile) + + DataFrame.readReference(xlsxFile.path) shouldBe expected + DataFrame.readReference(Path(xlsxFile.path)) shouldBe expected + DataFrame.readReference(xlsxFile) shouldBe expected + DataFrame.readReference( + Path(xlsxFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = ExcelNEW.Options(sheetName = "Sheet1") + + DataFrame.readReference(xlsxFile.path, options) shouldBe expected + DataFrame.readReference(Path(xlsxFile.path), options) shouldBe expected + DataFrame.readReference(xlsxFile, options) shouldBe expected + DataFrame.readReference( + Path(xlsxFile.path).absolute().normalize().toUri().toURL(), + options, + ) shouldBe expected + } + + @Test + fun `read XLS reference`() { + val xlsFile = File("src/test/resources/sample.xls") + val expected = DataFrame.readExcel(xlsFile) + + DataFrame.readReference(xlsFile.path) shouldBe expected + DataFrame.readReference(Path(xlsFile.path)) shouldBe expected + DataFrame.readReference(xlsFile) shouldBe expected + DataFrame.readReference( + Path(xlsFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + } + + @Test + fun `read XLSX in memory`() { + val xlsxFile = File("src/test/resources/sample2.xlsx") + val expected = DataFrame.readExcel(xlsxFile) + + // Workbook and Sheet are exclusive to ExcelNEW, so type-based dispatch works without options. 
+ WorkbookFactory.create(xlsxFile.inputStream()).use { wb -> + DataFrame.readFromData(wb) shouldBe expected + DataFrame.readFromData(wb.getSheetAt(0)) shouldBe expected + } + + val options = ExcelNEW.Options() + + // Binary streams have no extension and are accepted by every format, + // so options are needed to pin ExcelNEW for the InputStream variant. + DataFrame.readFromData(xlsxFile.inputStream(), options) shouldBe expected + + WorkbookFactory.create(xlsxFile.inputStream()).use { wb -> + DataFrame.readFromData(wb, options) shouldBe expected + DataFrame.readFromData(wb.getSheetAt(0), options) shouldBe expected + } + } + + @Test + fun `read XLS in memory`() { + val xlsFile = File("src/test/resources/sample.xls") + val expected = DataFrame.readExcel(xlsFile) + +// WorkbookFactory.create(xlsFile.inputStream()).use { wb -> +// DataFrame.readFromData(wb) shouldBe expected +// } + + val options = ExcelNEW.Options() + + DataFrame.readFromData(xlsFile.inputStream()) shouldBe expected + +// WorkbookFactory.create(xlsFile.inputStream()).use { wb -> +// DataFrame.readFromData(wb) shouldBe expected +// } + } } diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt index 801fd3a717..e2c5be5832 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt @@ -3,12 +3,17 @@ package org.jetbrains.kotlinx.dataframe.io import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams import java.io.File import java.io.InputStream +import java.net.URL +import java.nio.charset.Charset import java.nio.file.Path +import 
kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf import kotlin.reflect.typeOf public class CsvDeephaven(private val delimiter: Char = DelimParams.CSV_DELIMITER) : SupportedDataFrameFormat { @@ -33,6 +38,141 @@ public class CsvDeephaven(private val delimiter: Char = DelimParams.CSV_DELIMITE } } +public class Csv : DataFrameReadSource { + + public data class Options( + val delimiter: Char = DelimParams.CSV_DELIMITER, + val header: List = DelimParams.HEADER, + val charset: Charset? = DelimParams.CHARSET, + val colTypes: Map = DelimParams.COL_TYPES, + val skipLines: Long = DelimParams.SKIP_LINES, + val readLines: Long? = DelimParams.READ_LINES, + val parserOptions: ParserOptions? = DelimParams.PARSER_OPTIONS, + val ignoreEmptyLines: Boolean = DelimParams.IGNORE_EMPTY_LINES, + val allowMissingColumns: Boolean = DelimParams.ALLOW_MISSING_COLUMNS, + val ignoreExcessColumns: Boolean = DelimParams.IGNORE_EXCESS_COLUMNS, + val quote: Char = DelimParams.QUOTE, + val ignoreSurroundingSpaces: Boolean = DelimParams.IGNORE_SURROUNDING_SPACES, + val trimInsideQuoted: Boolean = DelimParams.TRIM_INSIDE_QUOTED, + val parseParallel: Boolean = DelimParams.PARSE_PARALLEL, + ) : DataFrameReadOptions + + public companion object { + public val supportedReferenceTypes: Set = + setOf(typeOf(), typeOf(), typeOf(), typeOf()) + public val supportedInMemoryTypes: Set = + setOf(typeOf(), typeOf()) + + internal const val EXTENSION: String = "csv" + internal const val MIME_TYPE: String = "text/csv" + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is Options) return false + if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false + if (sourceInfo.mimeType?.lowercase()?.equals(MIME_TYPE) == false) return false + + val kType = sourceInfo.type.kType + return when (sourceInfo.type) { + is DataSourceType.Reference -> + supportedReferenceTypes.any { kType.isSubtypeOf(it) } + + is 
DataSourceType.InMemory -> + supportedInMemoryTypes.any { kType.isSubtypeOf(it) } + } + } + + override fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrame<*>? { + val opts = (options ?: Options()) as Options + val kType = sourceInfo.type.kType + return when (sourceInfo.type) { + is DataSourceType.Reference -> { + val url = when { + kType.isSubTypeOf() -> (source as? String)?.let(::asUrl) + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() + else -> null + } ?: return null + + DataFrame.readCsv( + url = url, + delimiter = opts.delimiter, + header = opts.header, + charset = opts.charset, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + is DataSourceType.InMemory -> when { + kType.isSubTypeOf() -> + (source as? InputStream)?.let { stream -> + runCatching { stream.reset() } + DataFrame.readCsv( + inputStream = stream, + delimiter = opts.delimiter, + header = opts.header, + charset = opts.charset, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + kType.isSubTypeOf() -> + (source as? 
String)?.let { text -> + DataFrame.readCsvStr( + text = text, + delimiter = opts.delimiter, + header = opts.header, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + else -> null + } + } + } + + override val testOrder: Int = 20_000 + + override fun toString(): String = "Csv" +} + +private inline fun KType.isSubTypeOf(): Boolean = this.isSubtypeOf(typeOf()) + private const val READ_CSV = "readCsv" internal class DefaultReadCsvMethod(path: String?, arguments: MethodArguments) : diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt index ecb123b93c..be8f7f5e20 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt @@ -3,12 +3,17 @@ package org.jetbrains.kotlinx.dataframe.io import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams import java.io.File import java.io.InputStream +import java.net.URL +import java.nio.charset.Charset import java.nio.file.Path +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf import kotlin.reflect.typeOf public class TsvDeephaven(private val delimiter: Char = DelimParams.TSV_DELIMITER) : SupportedDataFrameFormat { @@ -33,6 +38,141 @@ public class TsvDeephaven(private val 
delimiter: Char = DelimParams.TSV_DELIMITE } } +public class Tsv : DataFrameReadSource { + + public data class Options( + val delimiter: Char = DelimParams.TSV_DELIMITER, + val header: List = DelimParams.HEADER, + val charset: Charset? = DelimParams.CHARSET, + val colTypes: Map = DelimParams.COL_TYPES, + val skipLines: Long = DelimParams.SKIP_LINES, + val readLines: Long? = DelimParams.READ_LINES, + val parserOptions: ParserOptions? = DelimParams.PARSER_OPTIONS, + val ignoreEmptyLines: Boolean = DelimParams.IGNORE_EMPTY_LINES, + val allowMissingColumns: Boolean = DelimParams.ALLOW_MISSING_COLUMNS, + val ignoreExcessColumns: Boolean = DelimParams.IGNORE_EXCESS_COLUMNS, + val quote: Char = DelimParams.QUOTE, + val ignoreSurroundingSpaces: Boolean = DelimParams.IGNORE_SURROUNDING_SPACES, + val trimInsideQuoted: Boolean = DelimParams.TRIM_INSIDE_QUOTED, + val parseParallel: Boolean = DelimParams.PARSE_PARALLEL, + ) : DataFrameReadOptions + + public companion object { + public val supportedReferenceTypes: Set = + setOf(typeOf(), typeOf(), typeOf(), typeOf()) + public val supportedInMemoryTypes: Set = + setOf(typeOf(), typeOf()) + + internal const val EXTENSION: String = "tsv" + internal const val MIME_TYPE: String = "text/tab-separated-values" + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is Options) return false + if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false + if (sourceInfo.mimeType?.lowercase()?.equals(MIME_TYPE) == false) return false + + val kType = sourceInfo.type.kType + return when (sourceInfo.type) { + is DataSourceType.Reference -> + supportedReferenceTypes.any { kType.isSubtypeOf(it) } + + is DataSourceType.InMemory -> + supportedInMemoryTypes.any { kType.isSubtypeOf(it) } + } + } + + override fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrame<*>? 
{ + val opts = (options ?: Options()) as Options + val kType = sourceInfo.type.kType + return when (sourceInfo.type) { + is DataSourceType.Reference -> { + val url = when { + kType.isSubTypeOf() -> (source as? String)?.let(::asUrl) + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() + else -> null + } ?: return null + + DataFrame.readTsv( + url = url, + delimiter = opts.delimiter, + header = opts.header, + charset = opts.charset, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + is DataSourceType.InMemory -> when { + kType.isSubTypeOf() -> + (source as? InputStream)?.let { stream -> + runCatching { stream.reset() } + DataFrame.readTsv( + inputStream = stream, + delimiter = opts.delimiter, + header = opts.header, + charset = opts.charset, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + kType.isSubTypeOf() -> + (source as? 
String)?.let { text -> + DataFrame.readTsvStr( + text = text, + delimiter = opts.delimiter, + header = opts.header, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + else -> null + } + } + } + + override val testOrder: Int = 30_000 + + override fun toString(): String = "Tsv" +} + +private inline fun KType.isSubTypeOf(): Boolean = this.isSubtypeOf(typeOf()) + private const val READ_TSV = "readTsv" internal class DefaultReadTsvMethod(path: String?, arguments: MethodArguments) : diff --git a/dataframe-csv/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource b/dataframe-csv/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource new file mode 100644 index 0000000000..ebcae7710f --- /dev/null +++ b/dataframe-csv/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource @@ -0,0 +1,2 @@ +org.jetbrains.kotlinx.dataframe.io.Csv +org.jetbrains.kotlinx.dataframe.io.Tsv diff --git a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt index e76332bfc7..90caeec347 100644 --- a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt +++ b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt @@ -45,6 +45,9 @@ import kotlin.io.path.exists import kotlin.io.path.fileSize import kotlin.io.path.inputStream import kotlin.io.path.outputStream +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf +import kotlin.reflect.typeOf import 
/**
 * [DataFrameReadSource] implementation that reads Excel workbooks through `DataFrame.readExcel`.
 *
 * A source is rejected by [acceptsSource] when:
 * - the supplied options are non-null and not [ExcelNEW.Options],
 * - an extension hint is present and is not one of [EXTENSIONS],
 * - a MIME-type hint is present and is not one of [MIME_TYPES],
 * - the static type of the source is not covered by [supportedReferenceTypes] /
 *   [supportedInMemoryTypes] for the respective [DataSourceType].
 */
public class ExcelNEW : DataFrameReadSource {

    /** Settings forwarded one-to-one to the matching `DataFrame.readExcel` overload. */
    public data class Options(
        val sheetName: String? = null,
        val skipRows: Int = 0,
        val columns: String? = null,
        val stringColumns: StringColumns? = null,
        val rowsCount: Int? = null,
        val nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE,
        val firstRowIsHeader: Boolean = true,
        val parseEmptyAsNull: Boolean = true,
    ) : DataFrameReadOptions

    public companion object {
        // NOTE(review): the type arguments of these two sets were reconstructed from the branches
        // handled in readDataFrameOrNull (String/URL/Path/File and InputStream/Workbook/Sheet) — confirm.
        public val supportedReferenceTypes: Set<KType> =
            setOf(typeOf<String>(), typeOf<URL>(), typeOf<Path>(), typeOf<File>())
        public val supportedInMemoryTypes: Set<KType> =
            setOf(typeOf<InputStream>(), typeOf<Workbook>(), typeOf<Sheet>())

        internal val EXTENSIONS: Set<String> = setOf("xls", "xlsx")
        internal val MIME_TYPES: Set<String> = setOf(
            "application/vnd.ms-excel",
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        )
    }

    /**
     * Returns `true` when this format is a candidate for the described source.
     * Missing (`null`) extension/MIME hints never disqualify a source; only mismatching ones do.
     */
    override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean {
        if (options != null && options !is Options) return false
        val ext = sourceInfo.extension?.lowercase()
        if (ext != null && ext !in EXTENSIONS) return false
        val mime = sourceInfo.mimeType?.lowercase()
        if (mime != null && mime !in MIME_TYPES) return false

        val kType = sourceInfo.type.kType
        return when (sourceInfo.type) {
            is DataSourceType.Reference ->
                supportedReferenceTypes.any { kType.isSubtypeOf(it) }

            is DataSourceType.InMemory ->
                supportedInMemoryTypes.any { kType.isSubtypeOf(it) }
        }
    }

    /**
     * Reads [source] as an Excel workbook.
     *
     * @param source the value to read; its runtime type must agree with the static type in [sourceInfo]
     * @param sourceInfo static description of [source] used to pick the `readExcel` overload
     * @param options [ExcelNEW.Options], or `null` for the defaults
     * @return the data frame, or `null` when the options belong to another format, the static type is
     * not supported, or [source] is not actually an instance of that static type
     */
    override fun readDataFrameOrNull(
        source: Any,
        sourceInfo: DataSourceInfo,
        options: DataFrameReadOptions?,
    ): DataFrame<*>? {
        // Foreign option types yield `null` instead of a ClassCastException, matching the "OrNull" contract.
        val opts = when (options) {
            null -> Options()
            is Options -> options
            else -> return null
        }
        val kType = sourceInfo.type.kType
        return when (sourceInfo.type) {
            is DataSourceType.Reference -> {
                // Every supported reference type is funnelled into a URL so a single overload is used.
                val url = when {
                    kType.isSubTypeOf<String>() -> (source as? String)?.let(::asUrl)
                    kType.isSubTypeOf<URL>() -> source as? URL
                    kType.isSubTypeOf<Path>() -> (source as? Path)?.toUri()?.toURL()
                    kType.isSubTypeOf<File>() -> (source as? File)?.toPath()?.toUri()?.toURL()
                    else -> null
                } ?: return null

                DataFrame.readExcel(
                    url = url,
                    sheetName = opts.sheetName,
                    skipRows = opts.skipRows,
                    columns = opts.columns,
                    stringColumns = opts.stringColumns,
                    rowsCount = opts.rowsCount,
                    nameRepairStrategy = opts.nameRepairStrategy,
                    firstRowIsHeader = opts.firstRowIsHeader,
                    parseEmptyAsNull = opts.parseEmptyAsNull,
                )
            }

            is DataSourceType.InMemory -> when {
                kType.isSubTypeOf<InputStream>() ->
                    (source as? InputStream)?.let { stream ->
                        // Best-effort rewind; failures (e.g. no mark support) are deliberately ignored.
                        // NOTE(review): presumably needed because an earlier format attempt may already
                        // have consumed the (buffered) stream — confirm against readDataFrameImpl.
                        runCatching { stream.reset() }
                        DataFrame.readExcel(
                            inputStream = stream,
                            sheetName = opts.sheetName,
                            skipRows = opts.skipRows,
                            columns = opts.columns,
                            stringColumns = opts.stringColumns,
                            rowsCount = opts.rowsCount,
                            nameRepairStrategy = opts.nameRepairStrategy,
                            firstRowIsHeader = opts.firstRowIsHeader,
                            parseEmptyAsNull = opts.parseEmptyAsNull,
                        )
                    }

                kType.isSubTypeOf<Workbook>() ->
                    (source as? Workbook)?.let { wb ->
                        DataFrame.readExcel(
                            wb = wb,
                            sheetName = opts.sheetName,
                            skipRows = opts.skipRows,
                            columns = opts.columns,
                            formattingOptions = opts.stringColumns?.toFormattingOptions(),
                            rowsCount = opts.rowsCount,
                            nameRepairStrategy = opts.nameRepairStrategy,
                            firstRowIsHeader = opts.firstRowIsHeader,
                            parseEmptyAsNull = opts.parseEmptyAsNull,
                        )
                    }

                kType.isSubTypeOf<Sheet>() ->
                    (source as? Sheet)?.let { sheet ->
                        // readExcel(Sheet) has no sheetName parameter — the sheet is already selected.
                        DataFrame.readExcel(
                            sheet = sheet,
                            columns = opts.columns,
                            formattingOptions = opts.stringColumns?.toFormattingOptions(),
                            skipRows = opts.skipRows,
                            rowsCount = opts.rowsCount,
                            nameRepairStrategy = opts.nameRepairStrategy,
                            firstRowIsHeader = opts.firstRowIsHeader,
                            parseEmptyAsNull = opts.parseEmptyAsNull,
                        )
                    }

                else -> null
            }
        }
    }

    // Position of this format in the ascending "try each format" order.
    override val testOrder: Int = 40_000

    override fun toString(): String = "Xlsx"
}

/** `true` when the receiver type is a subtype of [T]; reified shorthand for [isSubtypeOf]. */
private inline fun <reified T> KType.isSubTypeOf(): Boolean = this.isSubtypeOf(typeOf<T>())
InputStream)?.let { + runCatching { it.reset() } Json.decodeFromStream(it) } From 783d1dc93ac9de8d88f1539a7c767ddd9aa60e08 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Thu, 14 May 2026 14:43:26 +0200 Subject: [PATCH 03/14] PoC for DataFrameReadSource with jdbc --- core/build.gradle.kts | 1 + .../jetbrains/kotlinx/dataframe/io/Guess2.kt | 91 ++++++++++-- .../jetbrains/kotlinx/dataframe/io/Jdbc.kt | 133 ++++++++++++++++++ 3 files changed, 217 insertions(+), 8 deletions(-) diff --git a/core/build.gradle.kts b/core/build.gradle.kts index 3a4e0cf0f1..a6a2fc3115 100644 --- a/core/build.gradle.kts +++ b/core/build.gradle.kts @@ -89,6 +89,7 @@ dependencies { testImplementation(projects.dataframeExcel) // testImplementation(projects.dataframeGeo) testImplementation(projects.dataframeJdbc) + testImplementation(libs.h2db) } // Configure testJava16 dependencies to extend from test diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt index f9bef49023..8728512a8b 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt @@ -5,8 +5,12 @@ import kotlinx.serialization.json.Json import kotlinx.serialization.json.JsonElement import org.apache.poi.ss.usermodel.WorkbookFactory import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.io.db.H2 import org.junit.Test import java.io.File +import java.sql.Connection +import java.sql.DriverManager +import javax.sql.DataSource import kotlin.io.path.Path import kotlin.io.path.absolute @@ -188,16 +192,87 @@ class Guess2 { val xlsFile = File("src/test/resources/sample.xls") val expected = DataFrame.readExcel(xlsFile) -// WorkbookFactory.create(xlsFile.inputStream()).use { wb -> -// DataFrame.readFromData(wb) shouldBe expected -// } + WorkbookFactory.create(xlsFile.inputStream()).use { wb -> + DataFrame.readFromData(wb) 
shouldBe expected + } + DataFrame.readFromData(xlsFile.inputStream()) shouldBe expected + WorkbookFactory.create(xlsFile.inputStream()).use { wb -> + DataFrame.readFromData(wb) shouldBe expected + } + } - val options = ExcelNEW.Options() + private fun h2Url(name: String) = "jdbc:h2:mem:$name;DB_CLOSE_DELAY=-1" - DataFrame.readFromData(xlsFile.inputStream()) shouldBe expected + private fun seed(connection: Connection) { + connection.createStatement().use { st -> + st.execute("CREATE TABLE Customer (id INT, name VARCHAR(255), age INT)") + st.execute("INSERT INTO Customer (id, name, age) VALUES (1, 'John', 40), (2, 'Alice', 25), (3, 'Bob', 47)") + } + } + + @Test + fun `read JDBC in memory`() { + val url = h2Url("guess2_inmem") + DriverManager.getConnection(url).use { conn -> + seed(conn) + + val expected = DataFrame.readSqlTable(conn, "Customer") + val tableOpts = Jdbc2.Options(sqlQueryOrTableName = "Customer") + val queryOpts = Jdbc2.Options(sqlQueryOrTableName = "SELECT * FROM Customer") + + // Connection — exclusive type, but query/table name must come from options. + DataFrame.readFromData(conn, tableOpts) shouldBe expected + DataFrame.readFromData(conn, queryOpts) shouldBe expected + + // DbConnectionConfig as InMemory. + val config = DbConnectionConfig(url = url) + DataFrame.readFromData(config, tableOpts) shouldBe expected + DataFrame.readFromData(config, queryOpts) shouldBe expected + + // DataSource — opens a fresh connection each call (DataSource.readDataFrame closes it via `use`). + val dataSource = object : DataSource { + override fun getConnection() = DriverManager.getConnection(url) + override fun getConnection(u: String?, p: String?) = DriverManager.getConnection(url) + override fun getLogWriter() = null + override fun setLogWriter(out: java.io.PrintWriter?) 
{} + override fun setLoginTimeout(seconds: Int) {} + override fun getLoginTimeout() = 0 + override fun getParentLogger() = throw UnsupportedOperationException() + override fun unwrap(iface: Class?): T = throw UnsupportedOperationException() + override fun isWrapperFor(iface: Class<*>?) = false + } + DataFrame.readFromData(dataSource, tableOpts) shouldBe expected + + // ResultSet — no sqlQueryOrTableName needed; just dbType (or a Connection to derive it). + conn.prepareStatement("SELECT * FROM Customer").use { ps -> + ps.executeQuery().use { rs -> + DataFrame.readFromData( + rs, + Jdbc2.Options(dbType = H2()), + ) shouldBe expected + } + } + conn.prepareStatement("SELECT * FROM Customer").use { ps -> + ps.executeQuery().use { rs -> + DataFrame.readFromData( + rs, + Jdbc2.Options(resultSetConnection = conn), + ) shouldBe expected + } + } + } + } + + @Test + fun `read JDBC reference`() { + val url = h2Url("guess2_ref") + DriverManager.getConnection(url).use { conn -> seed(conn) } + + val config = DbConnectionConfig(url = url) + val expected = DataFrame.readSqlTable(config, "Customer") + val tableOpts = Jdbc2.Options(sqlQueryOrTableName = "Customer") -// WorkbookFactory.create(xlsFile.inputStream()).use { wb -> -// DataFrame.readFromData(wb) shouldBe expected -// } + DataFrame.readReference(config, tableOpts) shouldBe expected + DataFrame.readReference(config, Jdbc2.Options(sqlQueryOrTableName = "SELECT * FROM Customer")) shouldBe expected } } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt index a5307b96d9..0372c06d8d 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt @@ -5,9 +5,17 @@ import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod import 
/**
 * [DataFrameReadSource] for JDBC.
 *
 * Reading from JDBC always needs a "what" (a SQL query or table name) — unlike a file, a [Connection] doesn't
 * carry that instruction. Provide it via [Options.sqlQueryOrTableName]. The only exception is [ResultSet],
 * which is already an executed query.
 *
 * Supports the following sources:
 * - [Reference][DataSourceType.Reference]: [DbConnectionConfig]
 * - [InMemory][DataSourceType.InMemory]: [Connection], [DataSource], [DbConnectionConfig], [ResultSet]
 *
 * Note: [DbConnectionConfig] is accepted as both reference and in-memory deliberately, to compare which
 * feels more natural in practice. Other read-paths in this module — notably `readAllSqlTables` returning a
 * `Map` — don't fit the single-DataFrame contract and are unchanged.
 */
public class Jdbc2 : DataFrameReadSource {

    public data class Options(
        /**
         * SQL query (e.g. `"SELECT * FROM users"`) or table name (e.g. `"users"`).
         * Required for [Connection], [DataSource], and [DbConnectionConfig] sources.
         * Ignored for [ResultSet] (it's already an executed query).
         */
        val sqlQueryOrTableName: String? = null,
        val limit: Int? = null,
        val inferNullability: Boolean = true,
        /** Optional, auto-detected from the source when `null`. */
        val dbType: DbType? = null,
        val strictValidation: Boolean = true,
        val configureStatement: (PreparedStatement) -> Unit = {},
        /**
         * Only used when the source is a [ResultSet] and [dbType] is `null`; provides a [Connection]
         * to auto-detect the database type. Ignored otherwise.
         */
        val resultSetConnection: Connection? = null,
    ) : DataFrameReadOptions

    public companion object {
        // NOTE(review): the type arguments of these two sets were reconstructed from the class KDoc
        // and the branches of readDataFrameOrNull — confirm against the original file.
        public val supportedReferenceTypes: Set<KType> = setOf(typeOf<DbConnectionConfig>())
        public val supportedInMemoryTypes: Set<KType> =
            setOf(
                typeOf<Connection>(),
                typeOf<DataSource>(),
                typeOf<DbConnectionConfig>(),
                typeOf<ResultSet>(),
            )
    }

    /**
     * Returns `true` when the options are absent or [Jdbc2.Options] and the static source type is covered
     * by the set matching its [DataSourceType]. Extension and MIME hints are not consulted.
     */
    override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean {
        if (options != null && options !is Options) return false
        val kType = sourceInfo.type.kType
        return when (sourceInfo.type) {
            is DataSourceType.Reference ->
                supportedReferenceTypes.any { kType.isSubtypeOf(it) }

            is DataSourceType.InMemory ->
                supportedInMemoryTypes.any { kType.isSubtypeOf(it) }
        }
    }

    /**
     * Reads [source] by dispatching on its runtime type.
     *
     * @param source a [ResultSet], [Connection], [DataSource] or [DbConnectionConfig]
     * @param sourceInfo unused here; dispatch is done on the runtime type of [source]
     * @param options [Jdbc2.Options], or `null` for the defaults
     * @return the data frame, or `null` when the options belong to another format, the source type is not
     * supported, a [ResultSet] comes without `dbType`/`resultSetConnection`, or any other source comes
     * without `sqlQueryOrTableName`
     */
    override fun readDataFrameOrNull(
        source: Any,
        sourceInfo: DataSourceInfo,
        options: DataFrameReadOptions?,
    ): DataFrame<*>? {
        // Foreign option types yield `null` instead of a ClassCastException, matching the "OrNull" contract.
        val opts = when (options) {
            null -> Options()
            is Options -> options
            else -> return null
        }
        return when (source) {
            is ResultSet -> when {
                opts.dbType != null ->
                    DataFrame.readResultSet(source, opts.dbType, opts.limit, opts.inferNullability)

                opts.resultSetConnection != null ->
                    DataFrame.readResultSet(
                        source,
                        opts.resultSetConnection,
                        opts.limit,
                        opts.inferNullability,
                    )

                // Without dbType or a connection we can't read a ResultSet — fall through.
                else -> null
            }

            is Connection -> opts.sqlQueryOrTableName?.let { queryOrTable ->
                source.readDataFrame(
                    sqlQueryOrTableName = queryOrTable,
                    limit = opts.limit,
                    inferNullability = opts.inferNullability,
                    dbType = opts.dbType,
                    strictValidation = opts.strictValidation,
                    configureStatement = opts.configureStatement,
                )
            }

            is DataSource -> opts.sqlQueryOrTableName?.let { queryOrTable ->
                source.readDataFrame(
                    sqlQueryOrTableName = queryOrTable,
                    limit = opts.limit,
                    inferNullability = opts.inferNullability,
                    dbType = opts.dbType,
                    strictValidation = opts.strictValidation,
                    configureStatement = opts.configureStatement,
                )
            }

            is DbConnectionConfig -> opts.sqlQueryOrTableName?.let { queryOrTable ->
                source.readDataFrame(
                    sqlQueryOrTableName = queryOrTable,
                    limit = opts.limit,
                    inferNullability = opts.inferNullability,
                    dbType = opts.dbType,
                    strictValidation = opts.strictValidation,
                    configureStatement = opts.configureStatement,
                )
            }

            else -> null
        }
    }

    // Position of this format in the ascending "try each format" order.
    override val testOrder: Int = 50_000

    override fun toString(): String = "Jdbc"
}
--- .../jetbrains/kotlinx/dataframe/io/guess2.kt | 118 +++++++------ .../jetbrains/kotlinx/dataframe/io/Guess2.kt | 148 +++++++++------- .../org/jetbrains/kotlinx/dataframe/io/csv.kt | 162 ++++++++---------- .../org/jetbrains/kotlinx/dataframe/io/tsv.kt | 162 ++++++++---------- .../jetbrains/kotlinx/dataframe/io/xlsx.kt | 158 ++++++++--------- .../jetbrains/kotlinx/dataframe/io/Jdbc.kt | 21 +-- .../jetbrains/kotlinx/dataframe/io/json.kt | 103 +++++------ 7 files changed, 427 insertions(+), 445 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt index 5192ae5017..3c7eff0c30 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt @@ -2,10 +2,12 @@ package org.jetbrains.kotlinx.dataframe.io import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.io.readSource import java.io.ByteArrayInputStream import java.io.File import java.io.FileNotFoundException import java.io.InputStream +import java.net.URI import java.net.URL import java.nio.file.Path import java.util.ServiceLoader @@ -31,30 +33,22 @@ public interface DataFrameReadSource { public val testOrder: Int } +/** + * Description of a source passed to [DataFrameReadSource]. Carries the static [kType] of the value and + * optional [extension]/[mimeType] hints, both of which may be `null` when the source is in-memory content + * with no reasonable file-extension/MIME interpretation (e.g., a raw [String], [InputStream], [java.sql.Connection], + * etc.). + */ public data class DataSourceInfo( - public val type: DataSourceType, + public val kType: KType, public val extension: String? = null, // TODO, Apache Tika? public val mimeType: String? 
= null, ) -public sealed class DataSourceType(public open val kType: KType) { - /** Like a path, file, or URL. */ - public data class Reference(override val kType: KType) : DataSourceType(kType) - - /** Actual data, like a String, ByteArray, InputStream */ - public data class InMemory(override val kType: KType) : DataSourceType(kType) - - public companion object { - public inline fun reference(): Reference = Reference(kType = typeOf()) - - public inline fun inMemory(): InMemory = InMemory(kType = typeOf()) - } -} - /** * NOTE: Needs to have fully qualified name in - * resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.NewSupportedDataFrameFormat + * resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource * to be detected here. */ internal val newSupportedFormats: List by lazy { @@ -70,6 +64,20 @@ internal fun readDataFrameImpl( options: DataFrameReadOptions? = null, formats: List = newSupportedFormats, ): AnyFrame { + if (source is String) { + val url = asUrlOrNull(source) + if (url != null) { + return readDataFrameImpl( + source = url, + sourceInfo = sourceInfo.copy( + kType = typeOf(), + ), + options = options, + formats = formats, + ) + } + } + // Some sources can only be read once, like InputStreams, so we need to buffer them var bufferedSource: Any? = null @@ -98,30 +106,33 @@ internal fun readDataFrameImpl( throw IllegalArgumentException("Unknown DataFrame source $source, $sourceInfo; Tried $tries") } -public fun DataFrame.Companion.readReference( - reference: Any, - type: KType, - options: DataFrameReadOptions? = null, -): AnyFrame = +/** + * Unified entry point for the [DataFrameReadSource] framework: passes [source] through every registered + * format until one reads it. + * + * For a [String] that points to an existing file or a recognized URL (`http://`, `https://`, `ftp://`), + * the source is normalized to a [URL] so the file-extension hint can be used to disambiguate formats. 
/**
 * Unified entry point for the [DataFrameReadSource] framework: passes [source] through every registered
 * format until one reads it.
 *
 * For a [String] that points to an existing file or a recognized URL (`http://`, `https://`, `ftp://`),
 * the source is normalized to a [URL] so the file-extension hint can be used to disambiguate formats. Any
 * other [String] is treated as in-memory content (raw JSON/CSV/etc.).
 *
 * Named [readSource] rather than `read` to avoid shadowing the legacy `DataFrame.read(File/URL/Path/String, header)`
 * entries in `guess.kt` that use the older [SupportedDataFrameFormat] system. Once the legacy entries are
 * retired, this can be renamed to `read`.
 *
 * @param source the value to read
 * @param type static type of [source]; nullability is stripped before it is handed to the formats
 * @param options format-specific options, or `null` to let each format use its defaults
 */
public fun DataFrame.Companion.readSource(source: Any, type: KType, options: DataFrameReadOptions? = null): AnyFrame =
    readDataFrameImpl(
        source = source,
        sourceInfo = DataSourceInfo(
            kType = type.withNullability(false),
            extension = source.extensionOrNull(),
            mimeType = null, // TODO, Apache Tika?
        ),
        options = options,
    )

/**
 * Reified convenience overload of [readSource] that captures the static type of [source] via [typeOf].
 */
public inline fun <reified R : Any> DataFrame.Companion.readSource(
    source: R,
    options: DataFrameReadOptions? = null,
): AnyFrame = readSource(source = source, type = typeOf<R>(), options = options)

/**
 * Non-throwing variant of [asUrl]: returns the [URL] iff [string] is a recognized URL (`http`/`https`/`ftp`)
 * or an existing file path. Used by [readSource] to decide whether a [String] should be treated as a reference
 * or as raw content.
 */
internal fun asUrlOrNull(string: String): URL? =
    if (isUrl(string)) {
        // A string may look like a URL and still fail to parse; that simply means "not a reference".
        try {
            URI(string).toURL()
        } catch (_: Exception) {
            null
        }
    } else {
        // `isFile` is only true for an existing regular file, so no separate `exists()` check is needed.
        File(string).takeIf { it.isFile }?.toURI()?.toURL()
    }
DataFrame.readSource(Path("../data/participants.json"), options) shouldBe expected + DataFrame.readSource(File("../data/participants.json"), options) shouldBe expected + DataFrame.readSource( Path("../data/participants.json").absolute().normalize().toUri().toURL(), options, ) shouldBe expected @@ -46,17 +46,17 @@ class Guess2 { val file = File("../data/participants.json") - DataFrame.readFromData(file.readText()) shouldBe expected - DataFrame.readFromData(file.inputStream()) shouldBe expected - DataFrame.readFromData(Json.decodeFromString(file.readText())) shouldBe expected + DataFrame.readSource(file.readText()) shouldBe expected + DataFrame.readSource(file.inputStream()) shouldBe expected + DataFrame.readSource(Json.decodeFromString(file.readText())) shouldBe expected val options = org.jetbrains.kotlinx.dataframe.io.Json.Options( typeClashTactic = JSON.TypeClashTactic.ANY_COLUMNS, ) - DataFrame.readFromData(file.readText(), options) shouldBe expected - DataFrame.readFromData(file.inputStream(), options) shouldBe expected - DataFrame.readFromData(Json.decodeFromString(file.readText()), options) shouldBe expected + DataFrame.readSource(file.readText(), options) shouldBe expected + DataFrame.readSource(file.inputStream(), options) shouldBe expected + DataFrame.readSource(Json.decodeFromString(file.readText()), options) shouldBe expected } @Test @@ -64,19 +64,19 @@ class Guess2 { val csvPath = "../data/movies.csv" val expected = DataFrame.readCsv(csvPath) - DataFrame.readReference(csvPath) shouldBe expected - DataFrame.readReference(Path(csvPath)) shouldBe expected - DataFrame.readReference(File(csvPath)) shouldBe expected - DataFrame.readReference( + DataFrame.readSource(csvPath) shouldBe expected + DataFrame.readSource(Path(csvPath)) shouldBe expected + DataFrame.readSource(File(csvPath)) shouldBe expected + DataFrame.readSource( Path(csvPath).absolute().normalize().toUri().toURL(), ) shouldBe expected val options = Csv.Options(delimiter = ',') - 
DataFrame.readReference(csvPath, options) shouldBe expected - DataFrame.readReference(Path(csvPath), options) shouldBe expected - DataFrame.readReference(File(csvPath), options) shouldBe expected - DataFrame.readReference( + DataFrame.readSource(csvPath, options) shouldBe expected + DataFrame.readSource(Path(csvPath), options) shouldBe expected + DataFrame.readSource(File(csvPath), options) shouldBe expected + DataFrame.readSource( Path(csvPath).absolute().normalize().toUri().toURL(), options, ) shouldBe expected @@ -90,8 +90,8 @@ class Guess2 { // String content has no extension hint, so we pin the format via options. val options = Csv.Options(delimiter = ',') - DataFrame.readFromData(file.readText(), options) shouldBe expected - DataFrame.readFromData(file.inputStream(), options) shouldBe expected + DataFrame.readSource(file.readText(), options) shouldBe expected + DataFrame.readSource(file.inputStream(), options) shouldBe expected } @Test @@ -99,19 +99,19 @@ class Guess2 { val tsvFile = File("src/test/resources/abc.tsv") val expected = DataFrame.readTsv(tsvFile) - DataFrame.readReference(tsvFile.path) shouldBe expected - DataFrame.readReference(Path(tsvFile.path)) shouldBe expected - DataFrame.readReference(tsvFile) shouldBe expected - DataFrame.readReference( + DataFrame.readSource(tsvFile.path) shouldBe expected + DataFrame.readSource(Path(tsvFile.path)) shouldBe expected + DataFrame.readSource(tsvFile) shouldBe expected + DataFrame.readSource( Path(tsvFile.path).absolute().normalize().toUri().toURL(), ) shouldBe expected val options = Tsv.Options(delimiter = '\t') - DataFrame.readReference(tsvFile.path, options) shouldBe expected - DataFrame.readReference(Path(tsvFile.path), options) shouldBe expected - DataFrame.readReference(tsvFile, options) shouldBe expected - DataFrame.readReference( + DataFrame.readSource(tsvFile.path, options) shouldBe expected + DataFrame.readSource(Path(tsvFile.path), options) shouldBe expected + DataFrame.readSource(tsvFile, 
options) shouldBe expected + DataFrame.readSource( Path(tsvFile.path).absolute().normalize().toUri().toURL(), options, ) shouldBe expected @@ -124,8 +124,8 @@ class Guess2 { val options = Tsv.Options(delimiter = '\t') // Binary/text without extension — options pin Tsv over Csv/Json/Xlsx. - DataFrame.readFromData(tsvFile.readText(), options) shouldBe expected - DataFrame.readFromData(tsvFile.inputStream(), options) shouldBe expected + DataFrame.readSource(tsvFile.readText(), options) shouldBe expected + DataFrame.readSource(tsvFile.inputStream(), options) shouldBe expected } @Test @@ -133,19 +133,19 @@ class Guess2 { val xlsxFile = File("src/test/resources/sample2.xlsx") val expected = DataFrame.readExcel(xlsxFile) - DataFrame.readReference(xlsxFile.path) shouldBe expected - DataFrame.readReference(Path(xlsxFile.path)) shouldBe expected - DataFrame.readReference(xlsxFile) shouldBe expected - DataFrame.readReference( + DataFrame.readSource(xlsxFile.path) shouldBe expected + DataFrame.readSource(Path(xlsxFile.path)) shouldBe expected + DataFrame.readSource(xlsxFile) shouldBe expected + DataFrame.readSource( Path(xlsxFile.path).absolute().normalize().toUri().toURL(), ) shouldBe expected val options = ExcelNEW.Options(sheetName = "Sheet1") - DataFrame.readReference(xlsxFile.path, options) shouldBe expected - DataFrame.readReference(Path(xlsxFile.path), options) shouldBe expected - DataFrame.readReference(xlsxFile, options) shouldBe expected - DataFrame.readReference( + DataFrame.readSource(xlsxFile.path, options) shouldBe expected + DataFrame.readSource(Path(xlsxFile.path), options) shouldBe expected + DataFrame.readSource(xlsxFile, options) shouldBe expected + DataFrame.readSource( Path(xlsxFile.path).absolute().normalize().toUri().toURL(), options, ) shouldBe expected @@ -156,10 +156,10 @@ class Guess2 { val xlsFile = File("src/test/resources/sample.xls") val expected = DataFrame.readExcel(xlsFile) - DataFrame.readReference(xlsFile.path) shouldBe expected - 
DataFrame.readReference(Path(xlsFile.path)) shouldBe expected - DataFrame.readReference(xlsFile) shouldBe expected - DataFrame.readReference( + DataFrame.readSource(xlsFile.path) shouldBe expected + DataFrame.readSource(Path(xlsFile.path)) shouldBe expected + DataFrame.readSource(xlsFile) shouldBe expected + DataFrame.readSource( Path(xlsFile.path).absolute().normalize().toUri().toURL(), ) shouldBe expected } @@ -171,19 +171,19 @@ class Guess2 { // Workbook and Sheet are exclusive to ExcelNEW, so type-based dispatch works without options. WorkbookFactory.create(xlsxFile.inputStream()).use { wb -> - DataFrame.readFromData(wb) shouldBe expected - DataFrame.readFromData(wb.getSheetAt(0)) shouldBe expected + DataFrame.readSource(wb) shouldBe expected + DataFrame.readSource(wb.getSheetAt(0)) shouldBe expected } val options = ExcelNEW.Options() // Binary streams have no extension and are accepted by every format, // so options are needed to pin ExcelNEW for the InputStream variant. - DataFrame.readFromData(xlsxFile.inputStream(), options) shouldBe expected + DataFrame.readSource(xlsxFile.inputStream(), options) shouldBe expected WorkbookFactory.create(xlsxFile.inputStream()).use { wb -> - DataFrame.readFromData(wb, options) shouldBe expected - DataFrame.readFromData(wb.getSheetAt(0), options) shouldBe expected + DataFrame.readSource(wb, options) shouldBe expected + DataFrame.readSource(wb.getSheetAt(0), options) shouldBe expected } } @@ -193,11 +193,11 @@ class Guess2 { val expected = DataFrame.readExcel(xlsFile) WorkbookFactory.create(xlsFile.inputStream()).use { wb -> - DataFrame.readFromData(wb) shouldBe expected + DataFrame.readSource(wb) shouldBe expected } - DataFrame.readFromData(xlsFile.inputStream()) shouldBe expected + DataFrame.readSource(xlsFile.inputStream()) shouldBe expected WorkbookFactory.create(xlsFile.inputStream()).use { wb -> - DataFrame.readFromData(wb) shouldBe expected + DataFrame.readSource(wb) shouldBe expected } } @@ -221,13 +221,13 @@ class 
Guess2 { val queryOpts = Jdbc2.Options(sqlQueryOrTableName = "SELECT * FROM Customer") // Connection — exclusive type, but query/table name must come from options. - DataFrame.readFromData(conn, tableOpts) shouldBe expected - DataFrame.readFromData(conn, queryOpts) shouldBe expected + DataFrame.readSource(conn, tableOpts) shouldBe expected + DataFrame.readSource(conn, queryOpts) shouldBe expected // DbConnectionConfig as InMemory. val config = DbConnectionConfig(url = url) - DataFrame.readFromData(config, tableOpts) shouldBe expected - DataFrame.readFromData(config, queryOpts) shouldBe expected + DataFrame.readSource(config, tableOpts) shouldBe expected + DataFrame.readSource(config, queryOpts) shouldBe expected // DataSource — opens a fresh connection each call (DataSource.readDataFrame closes it via `use`). val dataSource = object : DataSource { @@ -241,12 +241,12 @@ class Guess2 { override fun unwrap(iface: Class?): T = throw UnsupportedOperationException() override fun isWrapperFor(iface: Class<*>?) = false } - DataFrame.readFromData(dataSource, tableOpts) shouldBe expected + DataFrame.readSource(dataSource, tableOpts) shouldBe expected // ResultSet — no sqlQueryOrTableName needed; just dbType (or a Connection to derive it). 
conn.prepareStatement("SELECT * FROM Customer").use { ps -> ps.executeQuery().use { rs -> - DataFrame.readFromData( + DataFrame.readSource( rs, Jdbc2.Options(dbType = H2()), ) shouldBe expected @@ -254,7 +254,7 @@ class Guess2 { } conn.prepareStatement("SELECT * FROM Customer").use { ps -> ps.executeQuery().use { rs -> - DataFrame.readFromData( + DataFrame.readSource( rs, Jdbc2.Options(resultSetConnection = conn), ) shouldBe expected @@ -263,6 +263,28 @@ class Guess2 { } } + @Test + fun `unified readSource auto-detects references vs content`() { + // String that points to an existing file → routed through URL → JSON wins on extension + val jsonExpected = DataFrame.readJson("../data/participants.json") + DataFrame.readSource("../data/participants.json") shouldBe jsonExpected + + // Same idea for CSV/XLSX + val csvExpected = DataFrame.readCsv("../data/movies.csv") + DataFrame.readSource("../data/movies.csv") shouldBe csvExpected + + val xlsxExpected = DataFrame.readExcel(File("src/test/resources/sample2.xlsx")) + DataFrame.readSource("src/test/resources/sample2.xlsx") shouldBe xlsxExpected + + // String that doesn't resolve to a file → treated as raw content (JSON content here) + val file = File("../data/participants.json") + DataFrame.readSource(file.readText()) shouldBe jsonExpected + + // Non-String types: still work, no special handling needed + DataFrame.readSource(file) shouldBe jsonExpected + DataFrame.readSource(Path("../data/participants.json")) shouldBe jsonExpected + } + @Test fun `read JDBC reference`() { val url = h2Url("guess2_ref") @@ -272,7 +294,7 @@ class Guess2 { val expected = DataFrame.readSqlTable(config, "Customer") val tableOpts = Jdbc2.Options(sqlQueryOrTableName = "Customer") - DataFrame.readReference(config, tableOpts) shouldBe expected - DataFrame.readReference(config, Jdbc2.Options(sqlQueryOrTableName = "SELECT * FROM Customer")) shouldBe expected + DataFrame.readSource(config, tableOpts) shouldBe expected + DataFrame.readSource(config, 
Jdbc2.Options(sqlQueryOrTableName = "SELECT * FROM Customer")) shouldBe expected } } diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt index e2c5be5832..b168062a49 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt @@ -58,10 +58,8 @@ public class Csv : DataFrameReadSource { ) : DataFrameReadOptions public companion object { - public val supportedReferenceTypes: Set = - setOf(typeOf(), typeOf(), typeOf(), typeOf()) - public val supportedInMemoryTypes: Set = - setOf(typeOf(), typeOf()) + public val supportedTypes: Set = + setOf(typeOf(), typeOf(), typeOf(), typeOf(), typeOf()) internal const val EXTENSION: String = "csv" internal const val MIME_TYPE: String = "text/csv" @@ -71,15 +69,7 @@ public class Csv : DataFrameReadSource { if (options != null && options !is Options) return false if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false if (sourceInfo.mimeType?.lowercase()?.equals(MIME_TYPE) == false) return false - - val kType = sourceInfo.type.kType - return when (sourceInfo.type) { - is DataSourceType.Reference -> - supportedReferenceTypes.any { kType.isSubtypeOf(it) } - - is DataSourceType.InMemory -> - supportedInMemoryTypes.any { kType.isSubtypeOf(it) } - } + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } } override fun readDataFrameOrNull( @@ -88,81 +78,77 @@ public class Csv : DataFrameReadSource { options: DataFrameReadOptions?, ): DataFrame<*>? { val opts = (options ?: Options()) as Options - val kType = sourceInfo.type.kType - return when (sourceInfo.type) { - is DataSourceType.Reference -> { - val url = when { - kType.isSubTypeOf() -> (source as? String)?.let(::asUrl) - kType.isSubTypeOf() -> source as? URL - kType.isSubTypeOf() -> (source as? 
Path)?.toUri()?.toURL() - kType.isSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() - else -> null - } ?: return null - - DataFrame.readCsv( - url = url, - delimiter = opts.delimiter, - header = opts.header, - charset = opts.charset, - colTypes = opts.colTypes, - skipLines = opts.skipLines, - readLines = opts.readLines, - parserOptions = opts.parserOptions, - ignoreEmptyLines = opts.ignoreEmptyLines, - allowMissingColumns = opts.allowMissingColumns, - ignoreExcessColumns = opts.ignoreExcessColumns, - quote = opts.quote, - ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, - trimInsideQuoted = opts.trimInsideQuoted, - parseParallel = opts.parseParallel, - ) - } - - is DataSourceType.InMemory -> when { - kType.isSubTypeOf() -> - (source as? InputStream)?.let { stream -> - runCatching { stream.reset() } - DataFrame.readCsv( - inputStream = stream, - delimiter = opts.delimiter, - header = opts.header, - charset = opts.charset, - colTypes = opts.colTypes, - skipLines = opts.skipLines, - readLines = opts.readLines, - parserOptions = opts.parserOptions, - ignoreEmptyLines = opts.ignoreEmptyLines, - allowMissingColumns = opts.allowMissingColumns, - ignoreExcessColumns = opts.ignoreExcessColumns, - quote = opts.quote, - ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, - trimInsideQuoted = opts.trimInsideQuoted, - parseParallel = opts.parseParallel, - ) - } - - kType.isSubTypeOf() -> - (source as? 
String)?.let { text -> - DataFrame.readCsvStr( - text = text, - delimiter = opts.delimiter, - header = opts.header, - colTypes = opts.colTypes, - skipLines = opts.skipLines, - readLines = opts.readLines, - parserOptions = opts.parserOptions, - ignoreEmptyLines = opts.ignoreEmptyLines, - allowMissingColumns = opts.allowMissingColumns, - ignoreExcessColumns = opts.ignoreExcessColumns, - quote = opts.quote, - ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, - trimInsideQuoted = opts.trimInsideQuoted, - parseParallel = opts.parseParallel, - ) - } - - else -> null - } + val kType = sourceInfo.kType + + val url: URL? = when { + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return DataFrame.readCsv( + url = url, + delimiter = opts.delimiter, + header = opts.header, + charset = opts.charset, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + return when { + kType.isSubTypeOf() -> + (source as? 
InputStream)?.let { stream -> + DataFrame.readCsv( + inputStream = stream, + delimiter = opts.delimiter, + header = opts.header, + charset = opts.charset, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + kType.isSubTypeOf() -> + (source as? String)?.let { text -> + DataFrame.readCsvStr( + text = text, + delimiter = opts.delimiter, + header = opts.header, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + else -> null } } diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt index be8f7f5e20..fa605e43a0 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt @@ -58,10 +58,8 @@ public class Tsv : DataFrameReadSource { ) : DataFrameReadOptions public companion object { - public val supportedReferenceTypes: Set = - setOf(typeOf(), typeOf(), typeOf(), typeOf()) - public val supportedInMemoryTypes: Set = - setOf(typeOf(), typeOf()) + public val supportedTypes: Set = + setOf(typeOf(), typeOf(), typeOf(), typeOf(), typeOf()) internal const val EXTENSION: String = "tsv" internal const val 
MIME_TYPE: String = "text/tab-separated-values" @@ -71,15 +69,7 @@ public class Tsv : DataFrameReadSource { if (options != null && options !is Options) return false if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false if (sourceInfo.mimeType?.lowercase()?.equals(MIME_TYPE) == false) return false - - val kType = sourceInfo.type.kType - return when (sourceInfo.type) { - is DataSourceType.Reference -> - supportedReferenceTypes.any { kType.isSubtypeOf(it) } - - is DataSourceType.InMemory -> - supportedInMemoryTypes.any { kType.isSubtypeOf(it) } - } + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } } override fun readDataFrameOrNull( @@ -88,81 +78,77 @@ public class Tsv : DataFrameReadSource { options: DataFrameReadOptions?, ): DataFrame<*>? { val opts = (options ?: Options()) as Options - val kType = sourceInfo.type.kType - return when (sourceInfo.type) { - is DataSourceType.Reference -> { - val url = when { - kType.isSubTypeOf() -> (source as? String)?.let(::asUrl) - kType.isSubTypeOf() -> source as? URL - kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() - kType.isSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() - else -> null - } ?: return null - - DataFrame.readTsv( - url = url, - delimiter = opts.delimiter, - header = opts.header, - charset = opts.charset, - colTypes = opts.colTypes, - skipLines = opts.skipLines, - readLines = opts.readLines, - parserOptions = opts.parserOptions, - ignoreEmptyLines = opts.ignoreEmptyLines, - allowMissingColumns = opts.allowMissingColumns, - ignoreExcessColumns = opts.ignoreExcessColumns, - quote = opts.quote, - ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, - trimInsideQuoted = opts.trimInsideQuoted, - parseParallel = opts.parseParallel, - ) - } - - is DataSourceType.InMemory -> when { - kType.isSubTypeOf() -> - (source as? 
InputStream)?.let { stream -> - runCatching { stream.reset() } - DataFrame.readTsv( - inputStream = stream, - delimiter = opts.delimiter, - header = opts.header, - charset = opts.charset, - colTypes = opts.colTypes, - skipLines = opts.skipLines, - readLines = opts.readLines, - parserOptions = opts.parserOptions, - ignoreEmptyLines = opts.ignoreEmptyLines, - allowMissingColumns = opts.allowMissingColumns, - ignoreExcessColumns = opts.ignoreExcessColumns, - quote = opts.quote, - ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, - trimInsideQuoted = opts.trimInsideQuoted, - parseParallel = opts.parseParallel, - ) - } - - kType.isSubTypeOf() -> - (source as? String)?.let { text -> - DataFrame.readTsvStr( - text = text, - delimiter = opts.delimiter, - header = opts.header, - colTypes = opts.colTypes, - skipLines = opts.skipLines, - readLines = opts.readLines, - parserOptions = opts.parserOptions, - ignoreEmptyLines = opts.ignoreEmptyLines, - allowMissingColumns = opts.allowMissingColumns, - ignoreExcessColumns = opts.ignoreExcessColumns, - quote = opts.quote, - ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, - trimInsideQuoted = opts.trimInsideQuoted, - parseParallel = opts.parseParallel, - ) - } - - else -> null - } + val kType = sourceInfo.kType + + val url: URL? = when { + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? 
File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return DataFrame.readTsv( + url = url, + delimiter = opts.delimiter, + header = opts.header, + charset = opts.charset, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + return when { + kType.isSubTypeOf() -> + (source as? InputStream)?.let { stream -> + DataFrame.readTsv( + inputStream = stream, + delimiter = opts.delimiter, + header = opts.header, + charset = opts.charset, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + kType.isSubTypeOf() -> + (source as? 
String)?.let { text -> + DataFrame.readTsvStr( + text = text, + delimiter = opts.delimiter, + header = opts.header, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + else -> null } } diff --git a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt index 90caeec347..8830c0db55 100644 --- a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt +++ b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt @@ -81,10 +81,16 @@ public class ExcelNEW : DataFrameReadSource { ) : DataFrameReadOptions public companion object { - public val supportedReferenceTypes: Set = - setOf(typeOf(), typeOf(), typeOf(), typeOf()) - public val supportedInMemoryTypes: Set = - setOf(typeOf(), typeOf(), typeOf()) + public val supportedTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) internal val EXTENSIONS: Set = setOf("xls", "xlsx") internal val MIME_TYPES: Set = setOf( @@ -99,15 +105,7 @@ public class ExcelNEW : DataFrameReadSource { if (ext != null && ext !in EXTENSIONS) return false val mime = sourceInfo.mimeType?.lowercase() if (mime != null && mime !in MIME_TYPES) return false - - val kType = sourceInfo.type.kType - return when (sourceInfo.type) { - is DataSourceType.Reference -> - supportedReferenceTypes.any { kType.isSubtypeOf(it) } - - is DataSourceType.InMemory -> - supportedInMemoryTypes.any { kType.isSubtypeOf(it) } - } + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } } override 
fun readDataFrameOrNull( @@ -116,79 +114,75 @@ public class ExcelNEW : DataFrameReadSource { options: DataFrameReadOptions?, ): DataFrame<*>? { val opts = (options ?: Options()) as Options - val kType = sourceInfo.type.kType - return when (sourceInfo.type) { - is DataSourceType.Reference -> { - val url = when { - kType.isSubTypeOf() -> (source as? String)?.let(::asUrl) - kType.isSubTypeOf() -> source as? URL - kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() - kType.isSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() - else -> null - } ?: return null - - DataFrame.readExcel( - url = url, - sheetName = opts.sheetName, - skipRows = opts.skipRows, - columns = opts.columns, - stringColumns = opts.stringColumns, - rowsCount = opts.rowsCount, - nameRepairStrategy = opts.nameRepairStrategy, - firstRowIsHeader = opts.firstRowIsHeader, - parseEmptyAsNull = opts.parseEmptyAsNull, - ) - } + val kType = sourceInfo.kType - is DataSourceType.InMemory -> when { - kType.isSubTypeOf() -> - (source as? InputStream)?.let { stream -> - runCatching { stream.reset() } - DataFrame.readExcel( - inputStream = stream, - sheetName = opts.sheetName, - skipRows = opts.skipRows, - columns = opts.columns, - stringColumns = opts.stringColumns, - rowsCount = opts.rowsCount, - nameRepairStrategy = opts.nameRepairStrategy, - firstRowIsHeader = opts.firstRowIsHeader, - parseEmptyAsNull = opts.parseEmptyAsNull, - ) - } + val url: URL? = when { + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? 
File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return DataFrame.readExcel( + url = url, + sheetName = opts.sheetName, + skipRows = opts.skipRows, + columns = opts.columns, + stringColumns = opts.stringColumns, + rowsCount = opts.rowsCount, + nameRepairStrategy = opts.nameRepairStrategy, + firstRowIsHeader = opts.firstRowIsHeader, + parseEmptyAsNull = opts.parseEmptyAsNull, + ) + } - kType.isSubTypeOf() -> - (source as? Workbook)?.let { wb -> - DataFrame.readExcel( - wb = wb, - sheetName = opts.sheetName, - skipRows = opts.skipRows, - columns = opts.columns, - formattingOptions = opts.stringColumns?.toFormattingOptions(), - rowsCount = opts.rowsCount, - nameRepairStrategy = opts.nameRepairStrategy, - firstRowIsHeader = opts.firstRowIsHeader, - parseEmptyAsNull = opts.parseEmptyAsNull, - ) - } + return when { + kType.isSubTypeOf() -> + (source as? InputStream)?.let { stream -> + DataFrame.readExcel( + inputStream = stream, + sheetName = opts.sheetName, + skipRows = opts.skipRows, + columns = opts.columns, + stringColumns = opts.stringColumns, + rowsCount = opts.rowsCount, + nameRepairStrategy = opts.nameRepairStrategy, + firstRowIsHeader = opts.firstRowIsHeader, + parseEmptyAsNull = opts.parseEmptyAsNull, + ) + } - kType.isSubTypeOf() -> - (source as? Sheet)?.let { sheet -> - // readExcel(Sheet) has no sheetName parameter — the sheet is already selected. - DataFrame.readExcel( - sheet = sheet, - columns = opts.columns, - formattingOptions = opts.stringColumns?.toFormattingOptions(), - skipRows = opts.skipRows, - rowsCount = opts.rowsCount, - nameRepairStrategy = opts.nameRepairStrategy, - firstRowIsHeader = opts.firstRowIsHeader, - parseEmptyAsNull = opts.parseEmptyAsNull, - ) - } + kType.isSubTypeOf() -> + (source as? 
Workbook)?.let { wb -> + DataFrame.readExcel( + wb = wb, + sheetName = opts.sheetName, + skipRows = opts.skipRows, + columns = opts.columns, + formattingOptions = opts.stringColumns?.toFormattingOptions(), + rowsCount = opts.rowsCount, + nameRepairStrategy = opts.nameRepairStrategy, + firstRowIsHeader = opts.firstRowIsHeader, + parseEmptyAsNull = opts.parseEmptyAsNull, + ) + } - else -> null - } + kType.isSubTypeOf() -> + (source as? Sheet)?.let { sheet -> + // readExcel(Sheet) has no sheetName parameter — the sheet is already selected. + DataFrame.readExcel( + sheet = sheet, + columns = opts.columns, + formattingOptions = opts.stringColumns?.toFormattingOptions(), + skipRows = opts.skipRows, + rowsCount = opts.rowsCount, + nameRepairStrategy = opts.nameRepairStrategy, + firstRowIsHeader = opts.firstRowIsHeader, + parseEmptyAsNull = opts.parseEmptyAsNull, + ) + } + + else -> null } } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt index 0372c06d8d..5a8e1695c7 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt @@ -51,13 +51,10 @@ public class Jdbc : * carry that instruction. Provide it via [Options.sqlQueryOrTableName]. The only exception is [ResultSet], * which is already an executed query. * - * Supports the following sources: - * - [Reference][DataSourceType.Reference]: [DbConnectionConfig] - * - [InMemory][DataSourceType.InMemory]: [Connection], [DataSource], [DbConnectionConfig], [ResultSet] + * Supported source types: [Connection], [DataSource], [DbConnectionConfig], [ResultSet]. * - * Note: [DbConnectionConfig] is accepted as both reference and in-memory deliberately, to compare which - * feels more natural in practice. 
Other read-paths in this module — notably `readAllSqlTables` returning a - * `Map` — don't fit the single-DataFrame contract and are unchanged. + * `readAllSqlTables` returns a `Map` and doesn't fit the single-DataFrame contract; it + * remains as a direct API call. */ public class Jdbc2 : DataFrameReadSource { @@ -82,8 +79,7 @@ public class Jdbc2 : DataFrameReadSource { ) : DataFrameReadOptions public companion object { - public val supportedReferenceTypes: Set = setOf(typeOf()) - public val supportedInMemoryTypes: Set = + public val supportedTypes: Set = setOf( typeOf(), typeOf(), @@ -94,14 +90,7 @@ public class Jdbc2 : DataFrameReadSource { override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { if (options != null && options !is Options) return false - val kType = sourceInfo.type.kType - return when (sourceInfo.type) { - is DataSourceType.Reference -> - supportedReferenceTypes.any { kType.isSubtypeOf(it) } - - is DataSourceType.InMemory -> - supportedInMemoryTypes.any { kType.isSubtypeOf(it) } - } + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } } override fun readDataFrameOrNull( diff --git a/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt b/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt index 3cf874a5a7..3f8c731bef 100644 --- a/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt +++ b/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt @@ -46,25 +46,22 @@ public class Json : DataFrameReadSource { ) : DataFrameReadOptions public companion object { - public val supportedReferenceTypes: Set = - setOf(typeOf(), typeOf(), typeOf(), typeOf()) - public val supportedInMemoryTypes: Set = - setOf(typeOf(), typeOf(), typeOf()) + public val supportedTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) } override fun acceptsSource(sourceInfo: DataSourceInfo, options: 
DataFrameReadOptions?): Boolean { if (options != null && options !is Options) return false if (sourceInfo.extension?.lowercase()?.equals("json") == false) return false if (sourceInfo.mimeType?.lowercase()?.equals("application/json") == false) return false - - val kType = sourceInfo.type.kType - return when (sourceInfo.type) { - is DataSourceType.Reference -> - supportedReferenceTypes.any { kType.isSubtypeOf(it) } - - is DataSourceType.InMemory -> - supportedInMemoryTypes.any { kType.isSubtypeOf(it) } - } + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } } @OptIn(ExperimentalSerializationApi::class) @@ -73,55 +70,45 @@ public class Json : DataFrameReadSource { sourceInfo: DataSourceInfo, options: DataFrameReadOptions?, ): DataFrame<*>? { - val options = (options ?: Options()) as Options - val kType = sourceInfo.type.kType - return when (sourceInfo.type) { - is DataSourceType.Reference -> { - val url = when { - kType.isSubTypeOf() -> (source as? String)?.let(::asUrl) - kType.isSubTypeOf() -> source as? URL - kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() - kType.isSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() - else -> null - } ?: return null - - DataFrame.readJson( - url = url, - header = options.header, - typeClashTactic = options.typeClashTactic, - keyValuePaths = options.keyValuePaths, - unifyNumbers = options.unifyNumbers, - ) - } - - is DataSourceType.InMemory -> { - val element = when { - kType.isSubTypeOf() -> - (source as? InputStream)?.let { - runCatching { it.reset() } - Json.decodeFromStream(it) - } + val opts = (options ?: Options()) as Options + val kType = sourceInfo.kType + + val url: URL? = when { + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? 
File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return DataFrame.readJson( + url = url, + header = opts.header, + typeClashTactic = opts.typeClashTactic, + keyValuePaths = opts.keyValuePaths, + unifyNumbers = opts.unifyNumbers, + ) + } - kType.isSubTypeOf() -> - (source as? String)?.let { - Json.decodeFromString(it) - } + val element: JsonElement? = when { + kType.isSubTypeOf() -> + (source as? InputStream)?.let { Json.decodeFromStream(it) } - kType.isSubTypeOf() -> - source as? JsonElement + kType.isSubTypeOf() -> + (source as? String)?.let { Json.decodeFromString(it) } - else -> null - } ?: return null + kType.isSubTypeOf() -> + source as? JsonElement - readJsonImpl( - parsed = element, - header = options.header, - typeClashTactic = options.typeClashTactic, - keyValuePaths = options.keyValuePaths, - unifyNumbers = options.unifyNumbers, - ) - } - } + else -> null + } ?: return null + + return readJsonImpl( + parsed = element, + header = opts.header, + typeClashTactic = opts.typeClashTactic, + keyValuePaths = opts.keyValuePaths, + unifyNumbers = opts.unifyNumbers, + ) } override val testOrder: Int = 10_000 From 6143eab3ee0b9555635192c0fcde3adaa31b651f Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Thu, 14 May 2026 15:47:24 +0200 Subject: [PATCH 05/14] json early exit --- .../org/jetbrains/kotlinx/dataframe/io/json.kt | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt b/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt index 3f8c731bef..679e728ce6 100644 --- a/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt +++ b/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt @@ -89,12 +89,15 @@ public class Json : DataFrameReadSource { ) } - val element: JsonElement? = when { + val element: JsonElement = when { kType.isSubTypeOf() -> (source as? 
InputStream)?.let { Json.decodeFromStream(it) } kType.isSubTypeOf() -> - (source as? String)?.let { Json.decodeFromString(it) } + (source as? String)?.let { + if (it.isNotJson()) return null + Json.decodeFromString(it) + } kType.isSubTypeOf() -> source as? JsonElement @@ -114,6 +117,16 @@ public class Json : DataFrameReadSource { override val testOrder: Int = 10_000 override fun toString(): String = "Json" + + // early-exit check for String to see if it's definitely not json + private fun String.isNotJson(): Boolean = + trim().let { + it.isEmpty() || + !( + it.startsWith('{') && it.endsWith('}') || + it.startsWith('[') && it.endsWith(']') + ) + } } private inline fun KType.isSubTypeOf(): Boolean = this.isSubtypeOf(typeOf()) From 0c93a45ea0d9ce8e25dd2fdb0879c0a6bd33b746 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Thu, 14 May 2026 16:39:35 +0200 Subject: [PATCH 06/14] DataFrameSchema.readSource --- .../jetbrains/kotlinx/dataframe/io/guess2.kt | 101 +++++++++++++++--- .../jetbrains/kotlinx/dataframe/io/Guess2.kt | 71 ++++++++++++ .../jetbrains/kotlinx/dataframe/io/Jdbc.kt | 36 +++++++ 3 files changed, 195 insertions(+), 13 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt index 3c7eff0c30..7725f1204e 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt @@ -2,7 +2,8 @@ package org.jetbrains.kotlinx.dataframe.io import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.DataFrame -import org.jetbrains.kotlinx.dataframe.io.readSource +import org.jetbrains.kotlinx.dataframe.api.schema +import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import java.io.ByteArrayInputStream import java.io.File import java.io.FileNotFoundException @@ -26,6 +27,19 @@ public interface DataFrameReadSource { options: DataFrameReadOptions? 
= null, ): DataFrame<*>? + /** + * Read just the [DataFrameSchema] for [source]. + * + * The default implementation reads the full DataFrame and calls [DataFrame.schema]. Override when the + * source format can introspect types without materializing rows (e.g., JDBC metadata queries, Parquet/Arrow + * file footers, OpenAPI specs). + */ + public fun readDataFrameSchemaOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions? = null, + ): DataFrameSchema? = readDataFrameOrNull(source, sourceInfo, options)?.schema() + public fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean // `DataFrame.Companion.read` methods uses this to sort list of all supported formats in ascending order (-1, 2, 10) @@ -58,22 +72,36 @@ internal val newSupportedFormats: List by lazy { .sortedBy { it.testOrder } } -internal fun readDataFrameImpl( +/** + * Shared dispatch loop for [readDataFrameImpl] and [readDataFrameSchemaImpl]: handles String→URL + * normalization, InputStream buffering, sorted iteration, and error aggregation. The per-format read + * operation is supplied as [readOrNull]; [resultKind] is used only in the "unknown source" error message. + * + * @param [readOrNull] [DataFrameReadSource.readDataFrameOrNull] or [DataFrameReadSource.readDataFrameSchemaOrNull] + * Potentially, this could also return another type, like a GeoDataFrame. + */ +internal fun readSourceImpl( source: Any, sourceInfo: DataSourceInfo, - options: DataFrameReadOptions? 
= null, - formats: List = newSupportedFormats, -): AnyFrame { + options: DataFrameReadOptions?, + formats: List, + resultKind: String, + readOrNull: DataFrameReadSource.( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ) -> T?, +): T { if (source is String) { val url = asUrlOrNull(source) if (url != null) { - return readDataFrameImpl( + return readSourceImpl( source = url, - sourceInfo = sourceInfo.copy( - kType = typeOf(), - ), + sourceInfo = sourceInfo.copy(kType = typeOf()), options = options, formats = formats, + resultKind = resultKind, + readOrNull = readOrNull, ) } } @@ -95,15 +123,15 @@ internal fun readDataFrameImpl( formats.sortedBy { it.testOrder }.forEach { if (!it.acceptsSource(sourceInfo, options)) return@forEach try { - val df = it.readDataFrameOrNull(getSource(), sourceInfo, options) - if (df != null) return df + val result = it.readOrNull(getSource(), sourceInfo, options) + if (result != null) return result } catch (e: FileNotFoundException) { throw e } catch (e: Exception) { tries[it::class.simpleName!!] = e } } - throw IllegalArgumentException("Unknown DataFrame source $source, $sourceInfo; Tried $tries") + throw IllegalArgumentException("Unknown $resultKind source $source, $sourceInfo; Tried $tries") } /** @@ -119,7 +147,7 @@ internal fun readDataFrameImpl( * retired, this can be renamed to `read`. */ public fun DataFrame.Companion.readSource(source: Any, type: KType, options: DataFrameReadOptions? = null): AnyFrame = - readDataFrameImpl( + readSourceImpl( source = source, sourceInfo = DataSourceInfo( kType = type.withNullability(false), @@ -127,6 +155,9 @@ public fun DataFrame.Companion.readSource(source: Any, type: KType, options: Dat mimeType = null, // TODO, Apache Tika? 
), options = options, + formats = newSupportedFormats, + resultKind = "DataFrame", + readOrNull = DataFrameReadSource::readDataFrameOrNull, ) public inline fun DataFrame.Companion.readSource( @@ -134,6 +165,50 @@ public inline fun DataFrame.Companion.readSource( options: DataFrameReadOptions? = null, ): AnyFrame = readSource(source = source, type = typeOf(), options = options) +/** + * Schema-only counterpart of [DataFrame.Companion.readSource]: dispatches through every registered + * [DataFrameReadSource] and returns the resulting [DataFrameSchema] without materializing rows when the + * format supports it (e.g., JDBC). Formats with no fast schema path fall back to reading the full DataFrame + * and calling [DataFrame.schema]. + */ +public fun DataFrameSchema.Companion.readSource( + source: Any, + type: KType, + options: DataFrameReadOptions? = null, +): DataFrameSchema = + readSourceImpl( + source = source, + sourceInfo = DataSourceInfo( + kType = type.withNullability(false), + extension = source.extensionOrNull(), + mimeType = null, // TODO, Apache Tika? + ), + options = options, + formats = newSupportedFormats, + resultKind = "DataFrameSchema", + readOrNull = DataFrameReadSource::readDataFrameSchemaOrNull, + ) + +internal fun readDataFrameSchemaImpl( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions? = null, + formats: List = newSupportedFormats, +): DataFrameSchema = + readSourceImpl( + source = source, + sourceInfo = sourceInfo, + options = options, + formats = formats, + resultKind = "DataFrameSchema", + readOrNull = DataFrameReadSource::readDataFrameSchemaOrNull, + ) + +public inline fun DataFrameSchema.Companion.readSource( + source: R, + options: DataFrameReadOptions? = null, +): DataFrameSchema = readSource(source = source, type = typeOf(), options = options) + internal fun Any.extensionOrNull(): String? 
= when (this) { is Path -> extension diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt index 086796316e..e83603a939 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt @@ -5,7 +5,9 @@ import kotlinx.serialization.json.Json import kotlinx.serialization.json.JsonElement import org.apache.poi.ss.usermodel.WorkbookFactory import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.api.schema import org.jetbrains.kotlinx.dataframe.io.db.H2 +import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import org.junit.Test import java.io.File import java.sql.Connection @@ -297,4 +299,73 @@ class Guess2 { DataFrame.readSource(config, tableOpts) shouldBe expected DataFrame.readSource(config, Jdbc2.Options(sqlQueryOrTableName = "SELECT * FROM Customer")) shouldBe expected } + + @Test + fun `read schema via default fallback (file-based formats)`() { + // JSON + val jsonExpected = DataFrame.readJson("../data/participants.json").schema() + DataFrameSchema.readSource( + File("../data/participants.json"), + ) shouldBe jsonExpected + DataFrameSchema.readSource( + "../data/participants.json", + ) shouldBe jsonExpected + + // CSV + val csvExpected = DataFrame.readCsv("../data/movies.csv").schema() + DataFrameSchema.readSource( + File("../data/movies.csv"), + ) shouldBe csvExpected + + // TSV + val tsvFile = File("src/test/resources/abc.tsv") + val tsvExpected = DataFrame.readTsv(tsvFile).schema() + DataFrameSchema.readSource(tsvFile) shouldBe tsvExpected + + // XLSX + val xlsxFile = File("src/test/resources/sample2.xlsx") + val xlsxExpected = DataFrame.readExcel(xlsxFile).schema() + DataFrameSchema.readSource(xlsxFile) shouldBe xlsxExpected + } + + @Test + fun `read JDBC schema via override`() { + val url = h2Url("guess2_schema") + 
DriverManager.getConnection(url).use { conn -> + seed(conn) + val expected = DataFrameSchema.readSqlTable(conn, "Customer") + val tableOpts = Jdbc2.Options(sqlQueryOrTableName = "Customer") + val queryOpts = Jdbc2.Options(sqlQueryOrTableName = "SELECT * FROM Customer") + + DataFrameSchema.readSource(conn, tableOpts) shouldBe expected + DataFrameSchema.readSource(conn, queryOpts) shouldBe expected + + val config = DbConnectionConfig(url = url) + DataFrameSchema.readSource(config, tableOpts) shouldBe expected + } + } + + @Test + fun `read JDBC schema from ResultSet does not advance cursor`() { + val url = h2Url("guess2_rs_schema") + DriverManager.getConnection(url).use { conn -> + seed(conn) + + conn.prepareStatement("SELECT * FROM Customer").use { ps -> + ps.executeQuery().use { rs -> + // Schema-from-ResultSet uses JDBC metadata only — no rows are fetched, so the + // cursor stays at "before first row". (And nullability comes from the column metadata, + // which is conservatively nullable for columns without NOT NULL constraints; this is + // why we don't compare against the data-inferred schema directly.) 
+ val expected = DataFrameSchema.readResultSet( + conn.prepareStatement("SELECT * FROM Customer").executeQuery(), + H2(), + ) + val schema = DataFrameSchema.readSource(rs, Jdbc2.Options(dbType = H2())) + schema shouldBe expected + rs.isBeforeFirst shouldBe true + } + } + } + } } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt index 5a8e1695c7..1d29bc328e 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt @@ -6,6 +6,8 @@ import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod import org.jetbrains.kotlinx.dataframe.codeGen.Code import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod import org.jetbrains.kotlinx.dataframe.io.db.DbType +import org.jetbrains.kotlinx.dataframe.io.db.extractDBTypeFromConnection +import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import java.io.File import java.io.InputStream import java.nio.file.Path @@ -153,6 +155,40 @@ public class Jdbc2 : DataFrameReadSource { } } + override fun readDataFrameSchemaOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrameSchema? { + val opts = (options ?: Options()) as Options + return when (source) { + // ResultSet has a true zero-row metadata-only path. 
+ is ResultSet -> when { + opts.dbType != null -> + DataFrameSchema.readResultSet(source, opts.dbType) + + opts.resultSetConnection != null -> + DataFrameSchema.readResultSet(source, extractDBTypeFromConnection(opts.resultSetConnection)) + + else -> null + } + + is Connection -> opts.sqlQueryOrTableName?.let { + source.readDataFrameSchema(sqlQueryOrTableName = it, dbType = opts.dbType) + } + + is DataSource -> opts.sqlQueryOrTableName?.let { + source.readDataFrameSchema(sqlQueryOrTableName = it, dbType = opts.dbType) + } + + is DbConnectionConfig -> opts.sqlQueryOrTableName?.let { + source.readDataFrameSchema(sqlQueryOrTableName = it, dbType = opts.dbType) + } + + else -> null + } + } + override val testOrder: Int = 50_000 override fun toString(): String = "Jdbc" From 2283c93037f4445521350fe0aa4d2ed9ca36b1ff Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Mon, 18 May 2026 14:03:29 +0200 Subject: [PATCH 07/14] added Arrow support to DataFrameReadSource --- core/build.gradle.kts | 2 + .../jetbrains/kotlinx/dataframe/io/Guess2.kt | 64 +++++ .../kotlinx/dataframe/io/arrowReading.kt | 244 ++++++++++++++++++ ...s.kotlinx.dataframe.io.DataFrameReadSource | 3 + .../org/jetbrains/kotlinx/dataframe/io/csv.kt | 4 +- .../org/jetbrains/kotlinx/dataframe/io/tsv.kt | 4 +- .../jetbrains/kotlinx/dataframe/io/xlsx.kt | 5 +- .../jetbrains/kotlinx/dataframe/io/Jdbc.kt | 4 +- .../jetbrains/kotlinx/dataframe/io/json.kt | 4 +- 9 files changed, 323 insertions(+), 11 deletions(-) create mode 100644 dataframe-arrow/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource diff --git a/core/build.gradle.kts b/core/build.gradle.kts index a6a2fc3115..962ac51863 100644 --- a/core/build.gradle.kts +++ b/core/build.gradle.kts @@ -313,6 +313,8 @@ tasks.withType { tasks.test { maxHeapSize = "1g" + // Arrow's off-heap allocator needs deep reflection into java.nio. 
+ jvmArgs("--add-opens", "java.base/java.nio=ALL-UNNAMED") } // Test task for Java 16+ language-specific tests diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt index e83603a939..70534d7771 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt @@ -368,4 +368,68 @@ class Guess2 { } } } + + @Test + fun `read Arrow Feather reference`() { + val featherFile = File("src/test/resources/test.feather") + val expected = DataFrame.readArrowFeather(featherFile) + + DataFrame.readSource(featherFile.path) shouldBe expected + DataFrame.readSource(Path(featherFile.path)) shouldBe expected + DataFrame.readSource(featherFile) shouldBe expected + DataFrame.readSource( + Path(featherFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = ArrowFeatherNEW.Options() + + DataFrame.readSource(featherFile.path, options) shouldBe expected + DataFrame.readSource(featherFile, options) shouldBe expected + } + + @Test + fun `read Arrow Feather in memory`() { + val featherFile = File("src/test/resources/test.feather") + val expected = DataFrame.readArrowFeather(featherFile) + val options = ArrowFeatherNEW.Options() + + // ByteArray, InputStream, SeekableByteChannel all need options to disambiguate (no extension). 
+ DataFrame.readSource(featherFile.readBytes(), options) shouldBe expected + DataFrame.readSource(featherFile.inputStream(), options) shouldBe expected + java.nio.file.Files.newByteChannel(featherFile.toPath()).use { channel -> + DataFrame.readSource(channel, options) shouldBe expected + } + } + + @Test + fun `read Arrow IPC reference`() { + val ipcFile = File("src/test/resources/test.arrow") + val expected = DataFrame.readArrowIPC(ipcFile) + + DataFrame.readSource(ipcFile.path) shouldBe expected + DataFrame.readSource(Path(ipcFile.path)) shouldBe expected + DataFrame.readSource(ipcFile) shouldBe expected + DataFrame.readSource( + Path(ipcFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = ArrowIPC.Options() + DataFrame.readSource(ipcFile, options) shouldBe expected + } + + @Test + fun `read Parquet reference`() { + val parquetFile = File("src/test/resources/test.parquet") + val expected = DataFrame.readParquet(parquetFile) + + DataFrame.readSource(parquetFile.path) shouldBe expected + DataFrame.readSource(Path(parquetFile.path)) shouldBe expected + DataFrame.readSource(parquetFile) shouldBe expected + DataFrame.readSource( + Path(parquetFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = Parquet.Options() + DataFrame.readSource(parquetFile, options) shouldBe expected + } } diff --git a/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt b/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt index 5cf884d73d..63161add45 100644 --- a/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt +++ b/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt @@ -18,6 +18,9 @@ import java.nio.channels.ReadableByteChannel import java.nio.channels.SeekableByteChannel import java.nio.file.Files import java.nio.file.Path +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf 
+import kotlin.reflect.typeOf public class ArrowFeather : SupportedDataFrameFormat { override fun readDataFrame(stream: InputStream, header: List): AnyFrame = @@ -36,6 +39,247 @@ public class ArrowFeather : SupportedDataFrameFormat { DefaultReadArrowMethod(pathRepresentation) } +/** + * [DataFrameReadSource] for [Arrow Feather files][DataFrame.readArrowFeather] (random-access IPC format). + * + * Supported source types: + * - References: [URL], [Path], [File] + * - In-memory: [SeekableByteChannel], [ByteArray], [InputStream], [ArrowReader] + * + * Default-accepts the `.feather` extension. To read with no extension hint (e.g., an [InputStream]) pass + * an [Options] instance to disambiguate from text formats. + */ +public class ArrowFeatherNEW : DataFrameReadSource { + + public data class Options(val nullability: NullabilityOptions = NullabilityOptions.Infer) : DataFrameReadOptions + + public companion object { + public val SUPPORTED_TYPES: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) + + internal const val EXTENSION: String = "feather" + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is Options) return false + if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false + return SUPPORTED_TYPES.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrame<*>? { + val opts = (options ?: Options()) as Options + val kType = sourceInfo.kType + + // ArrowReader is exclusive; check before more general types. + if (kType.isArrowSubTypeOf()) { + return (source as? ArrowReader)?.let { DataFrame.readArrow(it, opts.nullability) } + } + + val url: URL? = when { + kType.isArrowSubTypeOf() -> source as? URL + kType.isArrowSubTypeOf() -> (source as? 
Path)?.toUri()?.toURL() + kType.isArrowSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return DataFrame.readArrowFeather(url, opts.nullability) + } + + return when { + kType.isArrowSubTypeOf() -> + (source as? SeekableByteChannel)?.let { + DataFrame.readArrowFeather(it, nullability = opts.nullability) + } + + kType.isArrowSubTypeOf() -> + (source as? ByteArray)?.let { DataFrame.readArrowFeather(it, opts.nullability) } + + kType.isArrowSubTypeOf() -> + (source as? InputStream)?.let { DataFrame.readArrowFeather(it, opts.nullability) } + + else -> null + } + } + + override val testOrder: Int = 60_000 + + override fun toString(): String = "ArrowFeather" +} + +/** + * [DataFrameReadSource] for [Arrow IPC streaming files][DataFrame.readArrowIPC]. + * + * Supported source types: + * - References: [URL], [Path], [File] + * - In-memory: [InputStream], [ByteArray], [ReadableByteChannel], [ArrowReader] + * + * There's no widely-standardized extension for IPC streaming files (`.arrow` is most common but is also + * used for random-access Feather), so this format accepts the `.arrow` extension. If your `.arrow` file is + * actually random-access (Feather), prefer [ArrowFeatherNEW] — both formats will match `.arrow`, but + * [ArrowFeatherNEW] runs first by [testOrder] and a Feather read of a streaming-format file will throw, + * letting the framework fall through to [ArrowIPC]. 
+ */ +public class ArrowIPC : DataFrameReadSource { + + public data class Options( + val allocator: RootAllocator = Allocator.ROOT, + val nullability: NullabilityOptions = NullabilityOptions.Infer, + ) : DataFrameReadOptions + + public companion object { + public val SUPPORTED_TYPES: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) + + internal const val EXTENSION: String = "arrow" + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is Options) return false + if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false + return SUPPORTED_TYPES.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrame<*>? { + val opts = (options ?: Options()) as Options + val kType = sourceInfo.kType + + if (kType.isArrowSubTypeOf()) { + return (source as? ArrowReader)?.let { DataFrame.readArrow(it, opts.nullability) } + } + + val url: URL? = when { + kType.isArrowSubTypeOf() -> source as? URL + kType.isArrowSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isArrowSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return DataFrame.readArrowIPC(url, opts.nullability) + } + + return when { + kType.isArrowSubTypeOf() -> + (source as? ReadableByteChannel)?.let { + DataFrame.readArrowIPC(it, allocator = opts.allocator, nullability = opts.nullability) + } + + kType.isArrowSubTypeOf() -> + (source as? ByteArray)?.let { DataFrame.readArrowIPC(it, opts.nullability) } + + kType.isArrowSubTypeOf() -> + (source as? InputStream)?.let { DataFrame.readArrowIPC(it, opts.nullability) } + + else -> null + } + } + + // Runs after ArrowFeatherNEW so that `.feather` files get the random-access reader first. 
+ // Both accept `.arrow`; if Feather reading throws on an IPC streaming file the framework falls + // through to here. + override val testOrder: Int = 60_100 + + override fun toString(): String = "ArrowIPC" +} + +/** + * [DataFrameReadSource] for Apache Parquet files (read via Arrow Dataset). + * + * Arrow Dataset only consumes URIs, so only reference-style sources are supported: + * - References: [URL], [Path], [File] + * + * TODO? Multi-file Parquet datasets (vararg in [DataFrame.readParquet]) aren't covered by this single-source API; + * use [DataFrame.readParquet] directly for those. + */ +public class Parquet : DataFrameReadSource { + + public data class Options( + val nullability: NullabilityOptions = NullabilityOptions.Infer, + val batchSize: Long = ARROW_PARQUET_DEFAULT_BATCH_SIZE, + ) : DataFrameReadOptions + + public companion object { + public val SUPPORTED_TYPES: Set = + setOf(typeOf(), typeOf(), typeOf()) + + internal const val EXTENSION: String = "parquet" + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is Options) return false + if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false + return SUPPORTED_TYPES.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrame<*>? { + val opts = (options ?: Options()) as Options + val kType = sourceInfo.kType + return when { + kType.isArrowSubTypeOf() -> + (source as? URL)?.let { + DataFrame.readParquet( + it, + nullability = opts.nullability, + batchSize = opts.batchSize, + ) + } + + kType.isArrowSubTypeOf() -> + (source as? Path)?.let { + DataFrame.readParquet( + it, + nullability = opts.nullability, + batchSize = opts.batchSize, + ) + } + + kType.isArrowSubTypeOf() -> + (source as? 
File)?.let { + DataFrame.readParquet( + it, + nullability = opts.nullability, + batchSize = opts.batchSize, + ) + } + + else -> null + } + } + + override val testOrder: Int = 60_500 + + override fun toString(): String = "Parquet" +} + +private inline fun KType.isArrowSubTypeOf(): Boolean = this.isSubtypeOf(typeOf()) + private const val READ_ARROW_FEATHER = "readArrowFeather" internal const val ARROW_PARQUET_DEFAULT_BATCH_SIZE = 32768L diff --git a/dataframe-arrow/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource b/dataframe-arrow/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource new file mode 100644 index 0000000000..ef8466f22f --- /dev/null +++ b/dataframe-arrow/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource @@ -0,0 +1,3 @@ +org.jetbrains.kotlinx.dataframe.io.ArrowIPC +org.jetbrains.kotlinx.dataframe.io.ArrowFeatherNEW +org.jetbrains.kotlinx.dataframe.io.Parquet diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt index b168062a49..cba2897fcb 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt @@ -58,7 +58,7 @@ public class Csv : DataFrameReadSource { ) : DataFrameReadOptions public companion object { - public val supportedTypes: Set = + public val SUPPORTED_TYPES: Set = setOf(typeOf(), typeOf(), typeOf(), typeOf(), typeOf()) internal const val EXTENSION: String = "csv" @@ -69,7 +69,7 @@ public class Csv : DataFrameReadSource { if (options != null && options !is Options) return false if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false if (sourceInfo.mimeType?.lowercase()?.equals(MIME_TYPE) == false) return false - return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } + return 
SUPPORTED_TYPES.any { sourceInfo.kType.isSubtypeOf(it) } } override fun readDataFrameOrNull( diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt index fa605e43a0..fdf80ada21 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt @@ -58,7 +58,7 @@ public class Tsv : DataFrameReadSource { ) : DataFrameReadOptions public companion object { - public val supportedTypes: Set = + public val SUPPORTED_TYPES: Set = setOf(typeOf(), typeOf(), typeOf(), typeOf(), typeOf()) internal const val EXTENSION: String = "tsv" @@ -69,7 +69,7 @@ public class Tsv : DataFrameReadSource { if (options != null && options !is Options) return false if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false if (sourceInfo.mimeType?.lowercase()?.equals(MIME_TYPE) == false) return false - return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } + return SUPPORTED_TYPES.any { sourceInfo.kType.isSubtypeOf(it) } } override fun readDataFrameOrNull( diff --git a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt index 8830c0db55..56119ee7e4 100644 --- a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt +++ b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt @@ -81,12 +81,11 @@ public class ExcelNEW : DataFrameReadSource { ) : DataFrameReadOptions public companion object { - public val supportedTypes: Set = + public val SUPPORTED_TYPES: Set = setOf( typeOf(), typeOf(), typeOf(), - typeOf(), typeOf(), typeOf(), typeOf(), @@ -105,7 +104,7 @@ public class ExcelNEW : DataFrameReadSource { if (ext != null && ext !in EXTENSIONS) return false val mime = sourceInfo.mimeType?.lowercase() if (mime != null && mime !in 
MIME_TYPES) return false - return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } + return SUPPORTED_TYPES.any { sourceInfo.kType.isSubtypeOf(it) } } override fun readDataFrameOrNull( diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt index 1d29bc328e..78948a5514 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt @@ -81,7 +81,7 @@ public class Jdbc2 : DataFrameReadSource { ) : DataFrameReadOptions public companion object { - public val supportedTypes: Set = + public val SUPPORTED_TYPES: Set = setOf( typeOf(), typeOf(), @@ -92,7 +92,7 @@ public class Jdbc2 : DataFrameReadSource { override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { if (options != null && options !is Options) return false - return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } + return SUPPORTED_TYPES.any { sourceInfo.kType.isSubtypeOf(it) } } override fun readDataFrameOrNull( diff --git a/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt b/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt index 679e728ce6..1d24f32abc 100644 --- a/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt +++ b/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt @@ -46,7 +46,7 @@ public class Json : DataFrameReadSource { ) : DataFrameReadOptions public companion object { - public val supportedTypes: Set = + public val SUPPORTED_TYPES: Set = setOf( typeOf(), typeOf(), @@ -61,7 +61,7 @@ public class Json : DataFrameReadSource { if (options != null && options !is Options) return false if (sourceInfo.extension?.lowercase()?.equals("json") == false) return false if (sourceInfo.mimeType?.lowercase()?.equals("application/json") == false) return false - 
return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } + return SUPPORTED_TYPES.any { sourceInfo.kType.isSubtypeOf(it) } } @OptIn(ExperimentalSerializationApi::class) From 36b722b82fa18bad19428c5d75e89b453b6e57ac Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Mon, 18 May 2026 14:15:39 +0200 Subject: [PATCH 08/14] moved supportedType to `DataFrameReadSource` so we could use it later in converters/parsers --- .../jetbrains/kotlinx/dataframe/io/guess2.kt | 11 +++ .../kotlinx/dataframe/io/arrowReading.kt | 92 +++++++++---------- .../org/jetbrains/kotlinx/dataframe/io/csv.kt | 8 +- .../org/jetbrains/kotlinx/dataframe/io/tsv.kt | 8 +- .../jetbrains/kotlinx/dataframe/io/xlsx.kt | 24 ++--- .../jetbrains/kotlinx/dataframe/io/Jdbc.kt | 18 ++-- .../jetbrains/kotlinx/dataframe/io/json.kt | 22 ++--- 7 files changed, 96 insertions(+), 87 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt index 7725f1204e..7e1c5ff818 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt @@ -21,6 +21,17 @@ import kotlin.reflect.typeOf public interface DataFrameReadOptions public interface DataFrameReadSource { + /** + * The set of source [KType]s this format knows how to read. The framework uses this in the default + * [acceptsSource] implementation, and overriding `acceptsSource` implementations should still consult it + * so that adding a new supported type only requires updating this set. + * + * Note: a `String` *reference* (path/URL) is normalized to a [URL] by `readSourceImpl` before any format + * is invoked, so only include `String` here when raw text content is a legitimate input (e.g., JSON/CSV + * text). For binary formats, leave `String` out. 
+ */ + public val supportedTypes: Set + public fun readDataFrameOrNull( source: Any, sourceInfo: DataSourceInfo, diff --git a/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt b/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt index 63161add45..be208cb93e 100644 --- a/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt +++ b/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt @@ -53,25 +53,25 @@ public class ArrowFeatherNEW : DataFrameReadSource { public data class Options(val nullability: NullabilityOptions = NullabilityOptions.Infer) : DataFrameReadOptions - public companion object { - public val SUPPORTED_TYPES: Set = - setOf( - typeOf(), - typeOf(), - typeOf(), - typeOf(), - typeOf(), - typeOf(), - typeOf(), - ) + override val supportedTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) + public companion object { internal const val EXTENSION: String = "feather" } override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { if (options != null && options !is Options) return false if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false - return SUPPORTED_TYPES.any { sourceInfo.kType.isSubtypeOf(it) } + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } } override fun readDataFrameOrNull( @@ -83,14 +83,14 @@ public class ArrowFeatherNEW : DataFrameReadSource { val kType = sourceInfo.kType // ArrowReader is exclusive; check before more general types. - if (kType.isArrowSubTypeOf()) { + if (kType.isSubTypeOf()) { return (source as? ArrowReader)?.let { DataFrame.readArrow(it, opts.nullability) } } val url: URL? = when { - kType.isArrowSubTypeOf() -> source as? URL - kType.isArrowSubTypeOf() -> (source as? Path)?.toUri()?.toURL() - kType.isArrowSubTypeOf() -> (source as? 
File)?.toPath()?.toUri()?.toURL() + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() else -> null } if (url != null) { @@ -98,15 +98,15 @@ public class ArrowFeatherNEW : DataFrameReadSource { } return when { - kType.isArrowSubTypeOf() -> + kType.isSubTypeOf() -> (source as? SeekableByteChannel)?.let { DataFrame.readArrowFeather(it, nullability = opts.nullability) } - kType.isArrowSubTypeOf() -> + kType.isSubTypeOf() -> (source as? ByteArray)?.let { DataFrame.readArrowFeather(it, opts.nullability) } - kType.isArrowSubTypeOf() -> + kType.isSubTypeOf() -> (source as? InputStream)?.let { DataFrame.readArrowFeather(it, opts.nullability) } else -> null @@ -138,25 +138,25 @@ public class ArrowIPC : DataFrameReadSource { val nullability: NullabilityOptions = NullabilityOptions.Infer, ) : DataFrameReadOptions - public companion object { - public val SUPPORTED_TYPES: Set = - setOf( - typeOf(), - typeOf(), - typeOf(), - typeOf(), - typeOf(), - typeOf(), - typeOf(), - ) + override val supportedTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) + public companion object { internal const val EXTENSION: String = "arrow" } override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { if (options != null && options !is Options) return false if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false - return SUPPORTED_TYPES.any { sourceInfo.kType.isSubtypeOf(it) } + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } } override fun readDataFrameOrNull( @@ -167,14 +167,14 @@ public class ArrowIPC : DataFrameReadSource { val opts = (options ?: Options()) as Options val kType = sourceInfo.kType - if (kType.isArrowSubTypeOf()) { + if (kType.isSubTypeOf()) { return (source as? 
ArrowReader)?.let { DataFrame.readArrow(it, opts.nullability) } } val url: URL? = when { - kType.isArrowSubTypeOf() -> source as? URL - kType.isArrowSubTypeOf() -> (source as? Path)?.toUri()?.toURL() - kType.isArrowSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() else -> null } if (url != null) { @@ -182,15 +182,15 @@ public class ArrowIPC : DataFrameReadSource { } return when { - kType.isArrowSubTypeOf() -> + kType.isSubTypeOf() -> (source as? ReadableByteChannel)?.let { DataFrame.readArrowIPC(it, allocator = opts.allocator, nullability = opts.nullability) } - kType.isArrowSubTypeOf() -> + kType.isSubTypeOf() -> (source as? ByteArray)?.let { DataFrame.readArrowIPC(it, opts.nullability) } - kType.isArrowSubTypeOf() -> + kType.isSubTypeOf() -> (source as? InputStream)?.let { DataFrame.readArrowIPC(it, opts.nullability) } else -> null @@ -221,17 +221,17 @@ public class Parquet : DataFrameReadSource { val batchSize: Long = ARROW_PARQUET_DEFAULT_BATCH_SIZE, ) : DataFrameReadOptions - public companion object { - public val SUPPORTED_TYPES: Set = - setOf(typeOf(), typeOf(), typeOf()) + override val supportedTypes: Set = + setOf(typeOf(), typeOf(), typeOf()) + public companion object { internal const val EXTENSION: String = "parquet" } override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { if (options != null && options !is Options) return false if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false - return SUPPORTED_TYPES.any { sourceInfo.kType.isSubtypeOf(it) } + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } } override fun readDataFrameOrNull( @@ -242,7 +242,7 @@ public class Parquet : DataFrameReadSource { val opts = (options ?: Options()) as Options val kType = sourceInfo.kType return when { - 
kType.isArrowSubTypeOf() -> + kType.isSubTypeOf() -> (source as? URL)?.let { DataFrame.readParquet( it, @@ -251,7 +251,7 @@ public class Parquet : DataFrameReadSource { ) } - kType.isArrowSubTypeOf() -> + kType.isSubTypeOf() -> (source as? Path)?.let { DataFrame.readParquet( it, @@ -260,7 +260,7 @@ public class Parquet : DataFrameReadSource { ) } - kType.isArrowSubTypeOf() -> + kType.isSubTypeOf() -> (source as? File)?.let { DataFrame.readParquet( it, @@ -278,7 +278,7 @@ public class Parquet : DataFrameReadSource { override fun toString(): String = "Parquet" } -private inline fun KType.isArrowSubTypeOf(): Boolean = this.isSubtypeOf(typeOf()) +private inline fun KType.isSubTypeOf(): Boolean = this.isSubtypeOf(typeOf()) private const val READ_ARROW_FEATHER = "readArrowFeather" diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt index cba2897fcb..e9fd8c71f0 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt @@ -57,10 +57,10 @@ public class Csv : DataFrameReadSource { val parseParallel: Boolean = DelimParams.PARSE_PARALLEL, ) : DataFrameReadOptions - public companion object { - public val SUPPORTED_TYPES: Set = - setOf(typeOf(), typeOf(), typeOf(), typeOf(), typeOf()) + override val supportedTypes: Set = + setOf(typeOf(), typeOf(), typeOf(), typeOf(), typeOf()) + public companion object { internal const val EXTENSION: String = "csv" internal const val MIME_TYPE: String = "text/csv" } @@ -69,7 +69,7 @@ public class Csv : DataFrameReadSource { if (options != null && options !is Options) return false if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false if (sourceInfo.mimeType?.lowercase()?.equals(MIME_TYPE) == false) return false - return SUPPORTED_TYPES.any { sourceInfo.kType.isSubtypeOf(it) } + return supportedTypes.any { 
sourceInfo.kType.isSubtypeOf(it) } } override fun readDataFrameOrNull( diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt index fdf80ada21..48ae4aca50 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt @@ -57,10 +57,10 @@ public class Tsv : DataFrameReadSource { val parseParallel: Boolean = DelimParams.PARSE_PARALLEL, ) : DataFrameReadOptions - public companion object { - public val SUPPORTED_TYPES: Set = - setOf(typeOf(), typeOf(), typeOf(), typeOf(), typeOf()) + override val supportedTypes: Set = + setOf(typeOf(), typeOf(), typeOf(), typeOf(), typeOf()) + public companion object { internal const val EXTENSION: String = "tsv" internal const val MIME_TYPE: String = "text/tab-separated-values" } @@ -69,7 +69,7 @@ public class Tsv : DataFrameReadSource { if (options != null && options !is Options) return false if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false if (sourceInfo.mimeType?.lowercase()?.equals(MIME_TYPE) == false) return false - return SUPPORTED_TYPES.any { sourceInfo.kType.isSubtypeOf(it) } + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } } override fun readDataFrameOrNull( diff --git a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt index 56119ee7e4..dcb683319d 100644 --- a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt +++ b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt @@ -80,17 +80,19 @@ public class ExcelNEW : DataFrameReadSource { val parseEmptyAsNull: Boolean = true, ) : DataFrameReadOptions - public companion object { - public val SUPPORTED_TYPES: Set = - setOf( - typeOf(), - typeOf(), - typeOf(), - typeOf(), - typeOf(), - 
typeOf(), - ) + // String reference paths are normalized to URL by readSourceImpl, so no String entry here; + // Excel is binary, so raw String content isn't a meaningful input either. + override val supportedTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) + public companion object { internal val EXTENSIONS: Set = setOf("xls", "xlsx") internal val MIME_TYPES: Set = setOf( "application/vnd.ms-excel", @@ -104,7 +106,7 @@ public class ExcelNEW : DataFrameReadSource { if (ext != null && ext !in EXTENSIONS) return false val mime = sourceInfo.mimeType?.lowercase() if (mime != null && mime !in MIME_TYPES) return false - return SUPPORTED_TYPES.any { sourceInfo.kType.isSubtypeOf(it) } + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } } override fun readDataFrameOrNull( diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt index 78948a5514..b78be745f1 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt @@ -80,19 +80,17 @@ public class Jdbc2 : DataFrameReadSource { val resultSetConnection: Connection? 
= null, ) : DataFrameReadOptions - public companion object { - public val SUPPORTED_TYPES: Set = - setOf( - typeOf(), - typeOf(), - typeOf(), - typeOf(), - ) - } + override val supportedTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { if (options != null && options !is Options) return false - return SUPPORTED_TYPES.any { sourceInfo.kType.isSubtypeOf(it) } + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } } override fun readDataFrameOrNull( diff --git a/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt b/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt index 1d24f32abc..431fdbe427 100644 --- a/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt +++ b/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt @@ -45,23 +45,21 @@ public class Json : DataFrameReadSource { val unifyNumbers: Boolean = true, ) : DataFrameReadOptions - public companion object { - public val SUPPORTED_TYPES: Set = - setOf( - typeOf(), - typeOf(), - typeOf(), - typeOf(), - typeOf(), - typeOf(), - ) - } + override val supportedTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { if (options != null && options !is Options) return false if (sourceInfo.extension?.lowercase()?.equals("json") == false) return false if (sourceInfo.mimeType?.lowercase()?.equals("application/json") == false) return false - return SUPPORTED_TYPES.any { sourceInfo.kType.isSubtypeOf(it) } + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } } @OptIn(ExperimentalSerializationApi::class) From a8ce71221b4a7d6468c7aaf3e135feb97de291a6 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Mon, 18 May 2026 15:04:23 +0200 Subject: [PATCH 09/14] 
DataFrameReadSource openapi support --- core/build.gradle.kts | 1 + .../kotlinx/dataframe/api/generateCode.kt | 2 + .../jetbrains/kotlinx/dataframe/io/guess2.kt | 50 ++++++- .../jetbrains/kotlinx/dataframe/io/Guess2.kt | 45 +++++++ .../jetbrains/kotlinx/dataframe/io/OpenApi.kt | 127 ++++++++++++++++++ 5 files changed, 224 insertions(+), 1 deletion(-) diff --git a/core/build.gradle.kts b/core/build.gradle.kts index 962ac51863..410332d914 100644 --- a/core/build.gradle.kts +++ b/core/build.gradle.kts @@ -90,6 +90,7 @@ dependencies { // testImplementation(projects.dataframeGeo) testImplementation(projects.dataframeJdbc) testImplementation(libs.h2db) + testImplementation(projects.dataframeOpenapiGenerator) } // Configure testJava16 dependencies to extend from test diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/generateCode.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/generateCode.kt index 5137cf200c..4ad31062af 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/generateCode.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/generateCode.kt @@ -297,6 +297,8 @@ public val NameNormalizer.Companion.default: NameNormalizer get() = NameNormaliz @RequiredByIntellijPlugin public value class CodeString(public val value: String) { override fun toString(): String = value + + public companion object } @PublishedApi diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt index 7e1c5ff818..a86a0a2d88 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt @@ -2,6 +2,8 @@ package org.jetbrains.kotlinx.dataframe.io import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.api.CodeString +import org.jetbrains.kotlinx.dataframe.api.generateInterfaces 
import org.jetbrains.kotlinx.dataframe.api.schema import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import java.io.ByteArrayInputStream @@ -12,7 +14,6 @@ import java.net.URI import java.net.URL import java.nio.file.Path import java.util.ServiceLoader -import kotlin.io.extension import kotlin.io.path.extension import kotlin.reflect.KType import kotlin.reflect.full.withNullability @@ -51,6 +52,15 @@ public interface DataFrameReadSource { options: DataFrameReadOptions? = null, ): DataFrameSchema? = readDataFrameOrNull(source, sourceInfo, options)?.schema() + public fun readDataSchemaCodeOrNull( + source: Any, + sourceInfo: DataSourceInfo, + name: String, + options: DataFrameReadOptions? = null, + ): CodeString? = + readDataFrameSchemaOrNull(source, sourceInfo, options) + ?.generateInterfaces(name) + public fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean // `DataFrame.Companion.read` methods uses this to sort list of all supported formats in ascending order (-1, 2, 10) @@ -220,6 +230,44 @@ public inline fun DataFrameSchema.Companion.readSource( options: DataFrameReadOptions? = null, ): DataFrameSchema = readSource(source = source, type = typeOf(), options = options) +/** + * Code-generation counterpart of [DataFrame.Companion.readSource]: dispatches through every registered + * [DataFrameReadSource] and returns a [CodeString] containing the generated `@DataSchema` interface + * declarations (plus enums/typealiases for formats like OpenAPI). The [name] is the marker name used for + * the top-level generated interface. + * + * The default implementation in [DataFrameReadSource.readDataSchemaCodeOrNull] runs + * [DataFrameSchema.generateInterfaces] on the format's [DataFrameReadSource.readDataFrameSchemaOrNull] + * result; formats that produce richer code (OpenAPI markers, enums, typealiases) override the method + * directly. 
+ */ +public fun CodeString.Companion.readSource( + source: Any, + type: KType, + name: String, + options: DataFrameReadOptions? = null, +): CodeString = + readSourceImpl( + source = source, + sourceInfo = DataSourceInfo( + kType = type.withNullability(false), + extension = source.extensionOrNull(), + mimeType = null, // TODO, Apache Tika? + ), + options = options, + formats = newSupportedFormats, + resultKind = "CodeString", + readOrNull = { src, info, opts -> + readDataSchemaCodeOrNull(src, info, name, opts) + }, + ) + +public inline fun CodeString.Companion.readSource( + source: R, + name: String, + options: DataFrameReadOptions? = null, +): CodeString = readSource(source = source, type = typeOf(), name = name, options = options) + internal fun Any.extensionOrNull(): String? = when (this) { is Path -> extension diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt index 70534d7771..fe479194d9 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt @@ -1,10 +1,12 @@ package org.jetbrains.kotlinx.dataframe.io import io.kotest.matchers.shouldBe +import io.kotest.matchers.string.shouldContain import kotlinx.serialization.json.Json import kotlinx.serialization.json.JsonElement import org.apache.poi.ss.usermodel.WorkbookFactory import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.api.CodeString import org.jetbrains.kotlinx.dataframe.api.schema import org.jetbrains.kotlinx.dataframe.io.db.H2 import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema @@ -432,4 +434,47 @@ class Guess2 { val options = Parquet.Options() DataFrame.readSource(parquetFile, options) shouldBe expected } + + @Test + fun `read OpenAPI yaml as code`() { + val openApiFile = File("src/test/resources/petstore.yaml") + + // The reference call from the existing helper, used as 
the ground truth. + val expected = readOpenApiAsString( + openApiAsString = openApiFile.readText(), + name = "Petstore", + extensionProperties = false, + generateHelperCompanionObject = false, + ) + + // String path / File / Path / URL all route through readSourceImpl to OpenApi2. + CodeString.readSource(openApiFile.path, name = "Petstore").value shouldBe expected + CodeString.readSource(openApiFile, name = "Petstore").value shouldBe expected + CodeString.readSource(Path(openApiFile.path), name = "Petstore").value shouldBe expected + CodeString.readSource( + Path(openApiFile.path).absolute().normalize().toUri().toURL(), + name = "Petstore", + ).value shouldBe expected + + // String content path (raw spec text) also works. + CodeString.readSource(openApiFile.readText(), name = "Petstore").value shouldBe expected + } + + @Test + fun `OpenAPI does not steal plain JSON DataFrame reads`() { + // A regular JSON file (not an OpenAPI spec) still goes to Json, even though OpenApi2 runs first. + // OpenApi2.readDataSchemaCodeOrNull returns null for non-OpenAPI content, but more importantly + // OpenApi2.readDataFrameOrNull is overridden to always return null, so DataFrame reads fall through. + val expected = DataFrame.readJson("../data/participants.json") + DataFrame.readSource(File("../data/participants.json")) shouldBe expected + } + + @Test + fun `default DataSchema code generation works for JSON via interface default`() { + // The interface default reads the schema and calls generateInterfaces — exercise it on a JSON file. + val jsonFile = File("../data/participants.json") + val schemaCode = CodeString.readSource(jsonFile, name = "Participants") + // The output is non-empty and includes the marker name.
+ schemaCode.value shouldContain "Participants" + } } diff --git a/dataframe-openapi-generator/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/OpenApi.kt b/dataframe-openapi-generator/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/OpenApi.kt index d3650fa791..012caa049c 100644 --- a/dataframe-openapi-generator/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/OpenApi.kt +++ b/dataframe-openapi-generator/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/OpenApi.kt @@ -1,10 +1,137 @@ package org.jetbrains.kotlinx.dataframe.io +import io.swagger.v3.parser.core.models.AuthorizationValue +import io.swagger.v3.parser.core.models.ParseOptions +import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.annotations.DataSchema +import org.jetbrains.kotlinx.dataframe.api.CodeString import org.jetbrains.kotlinx.dataframe.codeGen.Code import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod +import org.jetbrains.kotlinx.dataframe.codeGen.MarkerVisibility +import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import java.io.File import java.io.InputStream +import java.net.URL +import java.nio.file.Path +import kotlin.io.path.readText +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf +import kotlin.reflect.typeOf + +/** + * [DataFrameReadSource] for OpenAPI specifications. + * + * OpenAPI doesn't produce a `DataFrame` or a single `DataFrameSchema` — its output is a multi-marker code + * blob (interfaces + enums + typealiases). Only [readDataSchemaCodeOrNull] does real work; the DataFrame + * and Schema methods are explicitly overridden to return `null`, so calling + * `DataFrame.readSource(openapiFile)` falls through to JSON, while `CodeString.readSource(openapiFile, name)`
+ * + * `.yaml`/`.yml` files are unambiguously OpenAPI; `.json` files are disambiguated at read time by + * [isOpenApiStr] returning null early when the JSON isn't actually an OpenAPI spec, letting the framework + * fall through to the JSON format for plain data. + */ +public class OpenApi2 : DataFrameReadSource { + + public data class Options( + val auth: List? = null, + val parseOptions: ParseOptions? = null, + val extensionProperties: Boolean = false, + val generateHelperCompanionObject: Boolean = false, + val visibility: MarkerVisibility = MarkerVisibility.IMPLICIT_PUBLIC, + ) : DataFrameReadOptions + + override val supportedTypes: Set = + setOf(typeOf(), typeOf(), typeOf(), typeOf(), typeOf()) + + public companion object { + internal val EXTENSIONS: Set = setOf("yaml", "yml", "json") + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is Options) return false + val ext = sourceInfo.extension?.lowercase() + if (ext != null && ext !in EXTENSIONS) return false + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + // OpenAPI doesn't produce a DataFrame. + override fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrame<*>? = null + + // ...nor a single DataFrameSchema, it can produce enums, typealiases, etc. + // so it only supports readDataSchemaCodeOrNull() + override fun readDataFrameSchemaOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrameSchema? = null + + override fun readDataSchemaCodeOrNull( + source: Any, + sourceInfo: DataSourceInfo, + name: String, + options: DataFrameReadOptions?, + ): CodeString? { + val opts = (options ?: Options()) as Options + val kType = sourceInfo.kType + + // Resolve to OpenAPI-spec text, returning null if the content isn't OpenAPI. 
+ val text: String = when { + kType.isSubtypeOf(typeOf()) -> { + val url = (source as? URL) ?: return null + if (!isOpenApi(url)) return null + url.readText() + } + + kType.isSubtypeOf(typeOf()) -> { + val path = (source as? Path) ?: return null + if (!isOpenApi(path)) return null + path.readText() + } + + kType.isSubtypeOf(typeOf()) -> { + val file = (source as? File) ?: return null + if (!isOpenApi(file.toPath())) return null + file.readText() + } + + kType.isSubtypeOf(typeOf()) -> { + val text = (source as? String) ?: return null + if (!isOpenApiStr(text)) return null + text + } + + kType.isSubtypeOf(typeOf()) -> { + val text = (source as? InputStream)?.bufferedReader()?.readText() ?: return null + if (!isOpenApiStr(text)) return null + text + } + + else -> return null + } + + return CodeString( + readOpenApiAsString( + openApiAsString = text, + name = name, + auth = opts.auth, + options = opts.parseOptions, + extensionProperties = opts.extensionProperties, + generateHelperCompanionObject = opts.generateHelperCompanionObject, + visibility = opts.visibility, + ), + ) + } + + // Run before Json (10_000) so .json files get the OpenAPI content check first. + override val testOrder: Int = 9_000 + + override fun toString(): String = "OpenApi" +} /** * Allows for OpenApi type schemas to be converted to [DataSchema] interfaces. 
From b3aa890b522bf78392da280fb09ea3597239bf3d Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Mon, 18 May 2026 20:51:00 +0200 Subject: [PATCH 10/14] DataRow.readSource function --- .../jetbrains/kotlinx/dataframe/io/guess2.kt | 24 ++++++++ .../jetbrains/kotlinx/dataframe/io/Guess2.kt | 60 +++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt index a86a0a2d88..141f950e29 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt @@ -1,10 +1,13 @@ package org.jetbrains.kotlinx.dataframe.io import org.jetbrains.kotlinx.dataframe.AnyFrame +import org.jetbrains.kotlinx.dataframe.AnyRow import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.api.CodeString import org.jetbrains.kotlinx.dataframe.api.generateInterfaces import org.jetbrains.kotlinx.dataframe.api.schema +import org.jetbrains.kotlinx.dataframe.api.single import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import java.io.ByteArrayInputStream import java.io.File @@ -181,6 +184,27 @@ public fun DataFrame.Companion.readSource(source: Any, type: KType, options: Dat readOrNull = DataFrameReadSource::readDataFrameOrNull, ) +public inline fun DataRow.Companion.readSource( + source: R, + options: DataFrameReadOptions? = null, +): AnyRow = readSource(source = source, type = typeOf(), options = options) + +public fun DataRow.Companion.readSource(source: Any, type: KType, options: DataFrameReadOptions? = null): AnyRow = + readSourceImpl( + source = source, + sourceInfo = DataSourceInfo( + kType = type.withNullability(false), + extension = source.extensionOrNull(), + mimeType = null, // TODO, Apache Tika? 
+ ), + options = options, + formats = newSupportedFormats, + resultKind = "DataRow", + readOrNull = { source, sourceInfo, options -> + readDataFrameOrNull(source, sourceInfo, options)?.single() + }, + ) + public inline fun DataFrame.Companion.readSource( source: R, options: DataFrameReadOptions? = null, diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt index fe479194d9..b3890ae66c 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt @@ -6,8 +6,10 @@ import kotlinx.serialization.json.Json import kotlinx.serialization.json.JsonElement import org.apache.poi.ss.usermodel.WorkbookFactory import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.api.CodeString import org.jetbrains.kotlinx.dataframe.api.schema +import org.jetbrains.kotlinx.dataframe.api.single import org.jetbrains.kotlinx.dataframe.io.db.H2 import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import org.junit.Test @@ -477,4 +479,62 @@ class Guess2 { // The output is non-empty and includes the marker name. schemaCode.value shouldContain "Participants" } + + // region DataRow.readSource — single-row inputs across formats + + @Test + fun `read DataRow from CSV string`() { + val csvText = "a,b,c\n1,2,3" + val expected = DataFrame.readCsvStr(csvText).single() + DataRow.readSource(csvText, Csv.Options()) shouldBe expected + } + + @Test + fun `read DataRow from TSV string`() { + val tsvText = "a\tb\tc\n1\t2\t3" + val expected = DataFrame.readTsvStr(tsvText).single() + DataRow.readSource(tsvText, Tsv.Options()) shouldBe expected + } + + @Test + fun `read DataRow from JSON string`() { + // A single-element JSON array yields a one-row DataFrame. 
+ val jsonText = """[{"a": 1, "b": 2}]""" + val expected = DataFrame.readJsonStr(jsonText).single() + DataRow.readSource(jsonText) shouldBe expected + } + + @Test + fun `read DataRow from single-row XLSX file`() { + // sample2.xlsx has exactly one data row. + val xlsxFile = File("src/test/resources/sample2.xlsx") + val expected = DataFrame.readExcel(xlsxFile).single() + DataRow.readSource(xlsxFile) shouldBe expected + } + + @Test + fun `read DataRow from JDBC with single-row query`() { + val url = h2Url("guess2_datarow") + DriverManager.getConnection(url).use { conn -> + seed(conn) + val query = "SELECT * FROM Customer WHERE id = 1" + val expected = DataFrame.readSqlQuery(conn, query).single() + DataRow.readSource(conn, Jdbc2.Options(sqlQueryOrTableName = query)) shouldBe expected + } + } + + @Test + fun `read DataRow throws when source has multiple rows`() { + // movies.csv has many rows — DataRow.single() should fail, surfaced as the framework's + // "Unknown DataRow source" since the exception is caught and converted. 
+ val movies = File("../data/movies.csv") + try { + DataRow.readSource(movies) + error("Expected DataRow.readSource to fail on a multi-row CSV") + } catch (_: IllegalArgumentException) { + // expected + } + } + + // endregion } From 7b759f6d3d9e3d673cb378822ee3fdeff3a1898e Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Mon, 18 May 2026 21:43:37 +0200 Subject: [PATCH 11/14] put readSource functionality in convert operation --- .../kotlinx/dataframe/impl/api/convert.kt | 56 ++++++++++- .../jetbrains/kotlinx/dataframe/io/guess2.kt | 85 ++++++++++++---- .../jetbrains/kotlinx/dataframe/io/Guess2.kt | 96 +++++++++++++++++++ 3 files changed, 211 insertions(+), 26 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt index fbaafd0370..f5c65ea63b 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt @@ -22,6 +22,7 @@ import kotlinx.datetime.toStdlibInstant import org.jetbrains.kotlinx.dataframe.AnyCol import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.RowColumnExpression import org.jetbrains.kotlinx.dataframe.RowValueExpression import org.jetbrains.kotlinx.dataframe.api.Convert @@ -30,7 +31,6 @@ import org.jetbrains.kotlinx.dataframe.api.Infer import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.api.asColumn import org.jetbrains.kotlinx.dataframe.api.isValueColumn -import org.jetbrains.kotlinx.dataframe.api.mapIndexed import org.jetbrains.kotlinx.dataframe.api.name import org.jetbrains.kotlinx.dataframe.columns.values import org.jetbrains.kotlinx.dataframe.dataTypes.IFRAME @@ -41,13 +41,14 @@ import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException 
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConverterNotFoundException import org.jetbrains.kotlinx.dataframe.impl.columns.newColumn import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType -import org.jetbrains.kotlinx.dataframe.impl.isSubtypeWithNullabilityOf +import org.jetbrains.kotlinx.dataframe.io.dataFrameReadSourceByType +import org.jetbrains.kotlinx.dataframe.io.readSource import org.jetbrains.kotlinx.dataframe.path +import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import org.jetbrains.kotlinx.dataframe.type import java.math.BigDecimal import java.math.BigInteger import java.net.URL -import java.util.Locale import kotlin.math.roundToInt import kotlin.math.roundToLong import kotlin.reflect.KType @@ -65,6 +66,7 @@ import kotlin.time.toJavaInstant import kotlin.time.toKotlinDuration import kotlin.time.toKotlinInstant import kotlin.toBigDecimal +import kotlinx.datetime.Instant as DeprecatedInstant import java.time.Duration as JavaDuration import java.time.Instant as JavaInstant import java.time.LocalDate as JavaLocalDate @@ -73,7 +75,6 @@ import java.time.LocalTime as JavaLocalTime import kotlin.time.Instant as StdlibInstant import kotlin.toBigDecimal as toBigDecimalKotlin import kotlin.toBigInteger as toBigIntegerKotlin -import kotlinx.datetime.Instant as DeprecatedInstant @PublishedApi internal fun Convert.withRowCellImpl( @@ -195,16 +196,61 @@ internal inline fun convert(crossinline converter: (T) -> Any?): TypeConvert private enum class DummyEnum +private val dataFrameReadSourceSupportedClasses by lazy { + dataFrameReadSourceByType.keys.map { it.jvmErasure }.toSet() +} + @Suppress("UNCHECKED_CAST") internal fun createConverter(from: KType, to: KType, options: ParserOptions? = null): TypeConverter? 
{ - if (from.arguments.isNotEmpty() || to.arguments.isNotEmpty()) return null if (from.isMarkedNullable) { val res = createConverter(from.withNullability(false), to, options) ?: return null return { res(it) } } val fromClass = from.jvmErasure val toClass = to.jvmErasure + + // readSource-backed conversions handle target types with type arguments (e.g. `DataFrame<*>`, + // `DataRow<*>`), so they must run before the generic-arguments early-exit below. + if (dataFrameReadSourceByType.any { from.isSubtypeOf(it.key) }) { + val readSources = dataFrameReadSourceByType.entries + .first { from.isSubtypeOf(it.key) }.value + + when (toClass) { + DataFrame::class -> + return convert { source -> + DataFrame.readSource( + source = source, + type = from, + options = null, + formats = readSources, + ) + } + + DataRow::class -> + return convert { source -> + DataRow.readSource( + source = source, + type = from, + options = null, + formats = readSources, + ) + } + + DataFrameSchema::class -> + return convert { source -> + DataFrameSchema.readSource( + source = source, + type = from, + options = null, + formats = readSources, + ) + } + } + } + return when { + from.arguments.isNotEmpty() || to.arguments.isNotEmpty() -> null + fromClass == toClass -> TypeConverterIdentity // kotlin.time.Duration is a value class, diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt index 141f950e29..f004e96939 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt @@ -89,6 +89,7 @@ public data class DataSourceInfo( * resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource * to be detected here. 
*/ +@PublishedApi internal val newSupportedFormats: List by lazy { ServiceLoader.load(DataFrameReadSource::class.java) .toList() @@ -96,6 +97,28 @@ internal val newSupportedFormats: List by lazy { .sortedBy { it.testOrder } } +internal val dataFrameReadSourceByType: Map> by lazy { + buildMap> { + newSupportedFormats.forEach { format -> + format.supportedTypes.forEach { type -> + getOrPut(type) { mutableListOf() }.let { + if (format !in it) it += format + } + + // special String -> URL case + if (type == typeOf()) { + getOrPut(typeOf()) { mutableListOf() }.let { + if (format !in it) it += format + } + } + } + } + values.forEach { + it.sortBy { it.testOrder } + } + } +} + /** * Shared dispatch loop for [readDataFrameImpl] and [readDataFrameSchemaImpl]: handles String→URL * normalization, InputStream buffering, sorted iteration, and error aggregation. The per-format read @@ -170,7 +193,12 @@ internal fun readSourceImpl( * entries in `guess.kt` that use the older [SupportedDataFrameFormat] system. Once the legacy entries are * retired, this can be renamed to `read`. */ -public fun DataFrame.Companion.readSource(source: Any, type: KType, options: DataFrameReadOptions? = null): AnyFrame = +public fun DataFrame.Companion.readSource( + source: Any, + type: KType, + options: DataFrameReadOptions? = null, + formats: List = newSupportedFormats, +): AnyFrame = readSourceImpl( source = source, sourceInfo = DataSourceInfo( @@ -179,7 +207,7 @@ public fun DataFrame.Companion.readSource(source: Any, type: KType, options: Dat mimeType = null, // TODO, Apache Tika? ), options = options, - formats = newSupportedFormats, + formats = formats, resultKind = "DataFrame", readOrNull = DataFrameReadSource::readDataFrameOrNull, ) @@ -187,9 +215,15 @@ public fun DataFrame.Companion.readSource(source: Any, type: KType, options: Dat public inline fun DataRow.Companion.readSource( source: R, options: DataFrameReadOptions? 
= null, -): AnyRow = readSource(source = source, type = typeOf(), options = options) + formats: List = newSupportedFormats, +): AnyRow = readSource(source = source, type = typeOf(), options = options, formats = formats) -public fun DataRow.Companion.readSource(source: Any, type: KType, options: DataFrameReadOptions? = null): AnyRow = +public fun DataRow.Companion.readSource( + source: Any, + type: KType, + options: DataFrameReadOptions? = null, + formats: List = newSupportedFormats, +): AnyRow = readSourceImpl( source = source, sourceInfo = DataSourceInfo( @@ -198,7 +232,7 @@ public fun DataRow.Companion.readSource(source: Any, type: KType, options: DataF mimeType = null, // TODO, Apache Tika? ), options = options, - formats = newSupportedFormats, + formats = formats, resultKind = "DataRow", readOrNull = { source, sourceInfo, options -> readDataFrameOrNull(source, sourceInfo, options)?.single() @@ -208,7 +242,14 @@ public fun DataRow.Companion.readSource(source: Any, type: KType, options: DataF public inline fun DataFrame.Companion.readSource( source: R, options: DataFrameReadOptions? = null, -): AnyFrame = readSource(source = source, type = typeOf(), options = options) + formats: List = newSupportedFormats, +): AnyFrame = + readSource( + source = source, + type = typeOf(), + options = options, + formats = formats, + ) /** * Schema-only counterpart of [DataFrame.Companion.readSource]: dispatches through every registered @@ -220,6 +261,7 @@ public fun DataFrameSchema.Companion.readSource( source: Any, type: KType, options: DataFrameReadOptions? = null, + formats: List = newSupportedFormats, ): DataFrameSchema = readSourceImpl( source = source, @@ -229,31 +271,23 @@ public fun DataFrameSchema.Companion.readSource( mimeType = null, // TODO, Apache Tika? 
), options = options, - formats = newSupportedFormats, + formats = formats, resultKind = "DataFrameSchema", readOrNull = DataFrameReadSource::readDataFrameSchemaOrNull, ) -internal fun readDataFrameSchemaImpl( - source: Any, - sourceInfo: DataSourceInfo, +public inline fun DataFrameSchema.Companion.readSource( + source: R, options: DataFrameReadOptions? = null, formats: List = newSupportedFormats, ): DataFrameSchema = - readSourceImpl( + readSource( source = source, - sourceInfo = sourceInfo, + type = typeOf(), options = options, formats = formats, - resultKind = "DataFrameSchema", - readOrNull = DataFrameReadSource::readDataFrameSchemaOrNull, ) -public inline fun DataFrameSchema.Companion.readSource( - source: R, - options: DataFrameReadOptions? = null, -): DataFrameSchema = readSource(source = source, type = typeOf(), options = options) - /** * Code-generation counterpart of [DataFrame.Companion.readSource]: dispatches through every registered * [DataFrameReadSource] and returns a [CodeString] containing the generated `@DataSchema` interface @@ -270,6 +304,7 @@ public fun CodeString.Companion.readSource( type: KType, name: String, options: DataFrameReadOptions? = null, + formats: List = newSupportedFormats, ): CodeString = readSourceImpl( source = source, @@ -279,7 +314,7 @@ public fun CodeString.Companion.readSource( mimeType = null, // TODO, Apache Tika? ), options = options, - formats = newSupportedFormats, + formats = formats, resultKind = "CodeString", readOrNull = { src, info, opts -> readDataSchemaCodeOrNull(src, info, name, opts) @@ -290,7 +325,15 @@ public inline fun CodeString.Companion.readSource( source: R, name: String, options: DataFrameReadOptions? 
= null, -): CodeString = readSource(source = source, type = typeOf(), name = name, options = options) + formats: List = newSupportedFormats, +): CodeString = + readSource( + source = source, + type = typeOf(), + name = name, + options = options, + formats = formats, + ) internal fun Any.extensionOrNull(): String? = when (this) { diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt index b3890ae66c..990ccefd17 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt @@ -8,8 +8,13 @@ import org.apache.poi.ss.usermodel.WorkbookFactory import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.api.CodeString +import org.jetbrains.kotlinx.dataframe.api.columnOf +import org.jetbrains.kotlinx.dataframe.api.convert +import org.jetbrains.kotlinx.dataframe.api.dataFrameOf +import org.jetbrains.kotlinx.dataframe.api.named import org.jetbrains.kotlinx.dataframe.api.schema import org.jetbrains.kotlinx.dataframe.api.single +import org.jetbrains.kotlinx.dataframe.api.toDataFrame import org.jetbrains.kotlinx.dataframe.io.db.H2 import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import org.junit.Test @@ -537,4 +542,95 @@ class Guess2 { } // endregion + + // region convert API integration — convert { col }.to() + // + // Frame columns are typed by their schema, so each column being converted must contain sources of the + // same shape. Mixing, say, a CSV-shaped source and a JSON-shaped source in the same column would yield + // a FrameColumn with no coherent single schema — these tests keep each column homogeneous and put + // differently-shaped sources into separate columns. 
+ + @Test + fun `convert column of CSV files to DataFrame`() { + // Two cells, both pointing at the same CSV → uniform shape in the resulting FrameColumn. + val csvFile = File("../data/movies.csv") + val df = dataFrameOf("source")(csvFile, csvFile) + + val converted = df.convert("source").to>() + + val expected = DataFrame.readCsv(csvFile) + converted["source"][0] shouldBe expected + converted["source"][1] shouldBe expected + } + + @Test + fun `convert column of CSV files to DataFrameSchema`() { + val csvFile = File("../data/movies.csv") + val df = dataFrameOf("source")(csvFile, csvFile) + + val converted = df.convert("source").to() + + val expected = DataFrame.readCsv(csvFile).schema() + converted["source"][0] shouldBe expected + converted["source"][1] shouldBe expected + } + + @Test + fun `convert column of single-row XLSX files to DataRow`() { + // sample2.xlsx has exactly one data row, so .to>() works for each cell. + val xlsxFile = File("src/test/resources/sample2.xlsx") + val df = dataFrameOf("source")(xlsxFile, xlsxFile) + + val converted = df.convert("source").to>() + + val expected = DataFrame.readExcel(xlsxFile).single() + converted["source"][0] shouldBe expected + converted["source"][1] shouldBe expected + } + + @Test + fun `convert column of String content to DataFrame`() { + // Multiple parallel JSON content strings (same shape) → uniform FrameColumn. + val text = """[{"a": 1, "b": 2}]""" + val df = dataFrameOf("source")(text, text) + + val converted = df.convert("source").to>() + + val expected = DataFrame.readJsonStr(text) + converted["source"][0] shouldBe expected + converted["source"][1] shouldBe expected + } + + @Test + fun `convert two homogeneous source columns at once`() { + // Each column is internally uniform: csvCol has CSV-shaped cells, jsonCol has JSON-shaped cells. + // The result is two FrameColumns, each with its own coherent schema. 
+ val csvFile = File("../data/movies.csv") + val jsonFile = File("../data/participants.json") + val df = dataFrameOf("csvCol", "jsonCol")(csvFile, jsonFile, csvFile, jsonFile) + + val converted = df.convert("csvCol", "jsonCol").to>() + + val expectedCsv = DataFrame.readCsv(csvFile) + val expectedJson = DataFrame.readJson(jsonFile) + converted["csvCol"][0] shouldBe expectedCsv + converted["csvCol"][1] shouldBe expectedCsv + converted["jsonCol"][0] shouldBe expectedJson + converted["jsonCol"][1] shouldBe expectedJson + } + + @Test + fun `convert column of URLs to DataFrame`() { + // Two URLs pointing at the same JSON file → uniform schema in the FrameColumn. + val jsonUrl = File("../data/participants.json").toURI().toURL() + val urls = columnOf(jsonUrl, jsonUrl) named "source" + val df = urls.toDataFrame() + + val converted = df.convert("source").to>() + val expected = DataFrame.readJson(jsonUrl) + converted["source"][0] shouldBe expected + converted["source"][1] shouldBe expected + } + + // endregion } From 1931297742a78f091bc53b075f9b0d1326bf1862 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Tue, 19 May 2026 14:51:37 +0200 Subject: [PATCH 12/14] using apache tika to sniff mime types --- core/build.gradle.kts | 1 + .../kotlinx/dataframe/impl/api/convert.kt | 2 +- .../jetbrains/kotlinx/dataframe/io/guess2.kt | 94 ++++++++++++++----- .../jetbrains/kotlinx/dataframe/io/Guess2.kt | 8 ++ .../kotlinx/dataframe/io/arrowReading.kt | 5 + .../org/jetbrains/kotlinx/dataframe/io/csv.kt | 12 ++- .../org/jetbrains/kotlinx/dataframe/io/tsv.kt | 12 ++- .../jetbrains/kotlinx/dataframe/io/xlsx.kt | 2 + .../jetbrains/kotlinx/dataframe/io/json.kt | 18 +++- .../jetbrains/kotlinx/dataframe/io/OpenApi.kt | 15 +++ gradle/libs.versions.toml | 2 + 11 files changed, 133 insertions(+), 38 deletions(-) diff --git a/core/build.gradle.kts b/core/build.gradle.kts index 410332d914..07bdd2f173 100644 --- a/core/build.gradle.kts +++ b/core/build.gradle.kts @@ -59,6 +59,7 @@ dependencies { 
implementation(libs.commonsIo) implementation(libs.fastDoubleParser) + implementation(libs.tika) api(libs.kotlin.datetimeJvm) implementation(libs.kotlinpoet) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt index f5c65ea63b..74a7e6564a 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt @@ -66,7 +66,6 @@ import kotlin.time.toJavaInstant import kotlin.time.toKotlinDuration import kotlin.time.toKotlinInstant import kotlin.toBigDecimal -import kotlinx.datetime.Instant as DeprecatedInstant import java.time.Duration as JavaDuration import java.time.Instant as JavaInstant import java.time.LocalDate as JavaLocalDate @@ -75,6 +74,7 @@ import java.time.LocalTime as JavaLocalTime import kotlin.time.Instant as StdlibInstant import kotlin.toBigDecimal as toBigDecimalKotlin import kotlin.toBigInteger as toBigIntegerKotlin +import kotlinx.datetime.Instant as DeprecatedInstant @PublishedApi internal fun Convert.withRowCellImpl( diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt index f004e96939..858cee018a 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt @@ -1,5 +1,10 @@ package org.jetbrains.kotlinx.dataframe.io +import org.apache.tika.detect.DefaultDetector +import org.apache.tika.io.TikaInputStream +import org.apache.tika.metadata.Metadata +import org.apache.tika.metadata.TikaCoreProperties +import org.apache.tika.mime.MediaType import org.jetbrains.kotlinx.dataframe.AnyFrame import org.jetbrains.kotlinx.dataframe.AnyRow import org.jetbrains.kotlinx.dataframe.DataFrame @@ -12,12 +17,14 @@ import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import 
java.io.ByteArrayInputStream import java.io.File import java.io.FileNotFoundException +import java.io.IOException import java.io.InputStream import java.net.URI import java.net.URL import java.nio.file.Path import java.util.ServiceLoader import kotlin.io.path.extension +import kotlin.io.path.name import kotlin.reflect.KType import kotlin.reflect.full.withNullability import kotlin.reflect.typeOf @@ -80,9 +87,14 @@ public interface DataFrameReadSource { public data class DataSourceInfo( public val kType: KType, public val extension: String? = null, - // TODO, Apache Tika? public val mimeType: String? = null, -) +) { + init { + if (mimeType != null) { + println() + } + } +} /** * NOTE: Needs to have fully qualified name in @@ -129,7 +141,7 @@ internal val dataFrameReadSourceByType: Map> by */ internal fun readSourceImpl( source: Any, - sourceInfo: DataSourceInfo, + sourceType: KType, options: DataFrameReadOptions?, formats: List, resultKind: String, @@ -144,7 +156,7 @@ internal fun readSourceImpl( if (url != null) { return readSourceImpl( source = url, - sourceInfo = sourceInfo.copy(kType = typeOf()), + sourceType = typeOf(), options = options, formats = formats, resultKind = resultKind, @@ -166,6 +178,12 @@ internal fun readSourceImpl( else -> source } + val sourceInfo = DataSourceInfo( + kType = sourceType, + extension = getSource().extensionOrNull(), + mimeType = getSource().mimeTypeOrNull(), + ) + val tries = mutableMapOf() formats.sortedBy { it.testOrder }.forEach { if (!it.acceptsSource(sourceInfo, options)) return@forEach @@ -201,11 +219,7 @@ public fun DataFrame.Companion.readSource( ): AnyFrame = readSourceImpl( source = source, - sourceInfo = DataSourceInfo( - kType = type.withNullability(false), - extension = source.extensionOrNull(), - mimeType = null, // TODO, Apache Tika? 
- ), + sourceType = type.withNullability(false), options = options, formats = formats, resultKind = "DataFrame", @@ -226,11 +240,7 @@ public fun DataRow.Companion.readSource( ): AnyRow = readSourceImpl( source = source, - sourceInfo = DataSourceInfo( - kType = type.withNullability(false), - extension = source.extensionOrNull(), - mimeType = null, // TODO, Apache Tika? - ), + sourceType = type.withNullability(false), options = options, formats = formats, resultKind = "DataRow", @@ -265,11 +275,7 @@ public fun DataFrameSchema.Companion.readSource( ): DataFrameSchema = readSourceImpl( source = source, - sourceInfo = DataSourceInfo( - kType = type.withNullability(false), - extension = source.extensionOrNull(), - mimeType = null, // TODO, Apache Tika? - ), + sourceType = type.withNullability(false), options = options, formats = formats, resultKind = "DataFrameSchema", @@ -308,11 +314,7 @@ public fun CodeString.Companion.readSource( ): CodeString = readSourceImpl( source = source, - sourceInfo = DataSourceInfo( - kType = type.withNullability(false), - extension = source.extensionOrNull(), - mimeType = null, // TODO, Apache Tika? - ), + sourceType = type.withNullability(false), options = options, formats = formats, resultKind = "CodeString", @@ -335,6 +337,48 @@ public inline fun CodeString.Companion.readSource( formats = formats, ) +private val tikaDetector by lazy { DefaultDetector() } + +internal fun Any.mimeTypeOrNull(): String? 
{ + val inputStream = try { + when (this) { + is Path -> TikaInputStream.get(this) + + is File -> + @Suppress("DEPRECATION") + TikaInputStream.get(this) + + is URL -> TikaInputStream.get(this) + + is InputStream -> TikaInputStream.get(this) + + is ByteArray -> TikaInputStream.get(this) + + else -> null + } + } catch (_: IOException) { + null + } ?: return null + + val metadata = Metadata().apply { + if (inputStream.hasFile()) { + add(TikaCoreProperties.RESOURCE_NAME_KEY, inputStream.path.name) + } + } + return try { + val detected = tikaDetector.detect(inputStream, metadata) + return when { + detected == MediaType.OCTET_STREAM -> null + detected == MediaType.TEXT_PLAIN -> null + detected == MediaType.EMPTY -> null + detected.toString().isEmpty() -> null + else -> detected.toString() + } + } catch (_: IOException) { + null + } +} + internal fun Any.extensionOrNull(): String? = when (this) { is Path -> extension @@ -350,7 +394,7 @@ internal fun Any.extensionOrNull(): String? = } else -> null - } + }?.lowercase() /** * Non-throwing variant of [asUrl]: returns the [URL] iff [string] is a recognized URL (`http`/`https`/`ftp`) diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt index 990ccefd17..d4b9503a95 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt @@ -243,13 +243,21 @@ class Guess2 { // DataSource — opens a fresh connection each call (DataSource.readDataFrame closes it via `use`). val dataSource = object : DataSource { override fun getConnection() = DriverManager.getConnection(url) + override fun getConnection(u: String?, p: String?) = DriverManager.getConnection(url) + override fun getLogWriter() = null + override fun setLogWriter(out: java.io.PrintWriter?) 
{} + override fun setLoginTimeout(seconds: Int) {} + override fun getLoginTimeout() = 0 + override fun getParentLogger() = throw UnsupportedOperationException() + override fun unwrap(iface: Class?): T = throw UnsupportedOperationException() + override fun isWrapperFor(iface: Class<*>?) = false } DataFrame.readSource(dataSource, tableOpts) shouldBe expected diff --git a/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt b/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt index be208cb93e..bc4f1b78df 100644 --- a/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt +++ b/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt @@ -226,11 +226,16 @@ public class Parquet : DataFrameReadSource { public companion object { internal const val EXTENSION: String = "parquet" + internal val MIME_TYPES = setOf( + "application/x-parquet", + "application/parquet", + ) } override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { if (options != null && options !is Options) return false if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false + if (sourceInfo.mimeType != null && sourceInfo.mimeType !in MIME_TYPES) return false return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } } diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt index e9fd8c71f0..4e77e63d50 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt @@ -61,14 +61,18 @@ public class Csv : DataFrameReadSource { setOf(typeOf(), typeOf(), typeOf(), typeOf(), typeOf()) public companion object { - internal const val EXTENSION: String = "csv" - internal const val MIME_TYPE: String = "text/csv" + internal val EXTENSIONS = 
setOf("csv", "zip", "gz") + internal val MIME_TYPES = setOf( + "text/csv", + "application/zip", + "application/gzip", + ) } override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { if (options != null && options !is Options) return false - if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false - if (sourceInfo.mimeType?.lowercase()?.equals(MIME_TYPE) == false) return false + if (sourceInfo.extension != null && sourceInfo.extension !in EXTENSIONS) return false + if (sourceInfo.mimeType != null && sourceInfo.mimeType !in MIME_TYPES) return false return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } } diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt index 48ae4aca50..96e026ab69 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt @@ -61,14 +61,18 @@ public class Tsv : DataFrameReadSource { setOf(typeOf(), typeOf(), typeOf(), typeOf(), typeOf()) public companion object { - internal const val EXTENSION: String = "tsv" - internal const val MIME_TYPE: String = "text/tab-separated-values" + internal val EXTENSIONS = setOf("tsv", "zip", "gz") + internal val MIME_TYPE = setOf( + "text/tab-separated-values", + "application/zip", + "application/gzip", + ) } override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { if (options != null && options !is Options) return false - if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false - if (sourceInfo.mimeType?.lowercase()?.equals(MIME_TYPE) == false) return false + if (sourceInfo.extension != null && sourceInfo.extension !in EXTENSIONS) return false + if (sourceInfo.mimeType != null && sourceInfo.mimeType !in MIME_TYPE) return false return supportedTypes.any { 
sourceInfo.kType.isSubtypeOf(it) } } diff --git a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt index dcb683319d..543b30b333 100644 --- a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt +++ b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt @@ -97,6 +97,8 @@ public class ExcelNEW : DataFrameReadSource { internal val MIME_TYPES: Set = setOf( "application/vnd.ms-excel", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/x-tika-ooxml", + "application/x-tika-msoffice", ) } diff --git a/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt b/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt index 431fdbe427..7a1e9f8532 100644 --- a/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt +++ b/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt @@ -55,10 +55,20 @@ public class Json : DataFrameReadSource { typeOf(), ) + public companion object { + internal const val EXTENSION = "json" + internal val MIME_TYPES = setOf( + "application/json", + "application/x-json", + "text/json", + "text/x-json", + ) + } + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { if (options != null && options !is Options) return false - if (sourceInfo.extension?.lowercase()?.equals("json") == false) return false - if (sourceInfo.mimeType?.lowercase()?.equals("application/json") == false) return false + if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false + if (sourceInfo.mimeType != null && sourceInfo.mimeType !in MIME_TYPES) return false return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } } @@ -121,8 +131,8 @@ public class Json : DataFrameReadSource { trim().let { it.isEmpty() || !( - it.startsWith('{') && it.endsWith('}') || - 
it.startsWith('[') && it.endsWith(']') + (it.startsWith('{') && it.endsWith('}')) || + (it.startsWith('[') && it.endsWith(']')) ) } } diff --git a/dataframe-openapi-generator/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/OpenApi.kt b/dataframe-openapi-generator/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/OpenApi.kt index 012caa049c..ef7e087916 100644 --- a/dataframe-openapi-generator/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/OpenApi.kt +++ b/dataframe-openapi-generator/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/OpenApi.kt @@ -46,12 +46,27 @@ public class OpenApi2 : DataFrameReadSource { public companion object { internal val EXTENSIONS: Set = setOf("yaml", "yml", "json") + internal val MIME_TYPES = setOf( + "application/vnd.oai.openapi", + "application/vnd.oai.openapi+json", + "application/vnd.oai.openapi.yaml", + "application/vnd.oai.openapi+yaml", + "text/x-yaml", + "text/yaml", + "application/x-yaml", + "application/yaml", + "application/x-json", + "application/json", + "text/x-json", + "text/json", + ) } override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { if (options != null && options !is Options) return false val ext = sourceInfo.extension?.lowercase() if (ext != null && ext !in EXTENSIONS) return false + if (sourceInfo.mimeType != null && sourceInfo.mimeType !in MIME_TYPES) return false return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } } diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index ed04b6e669..5f5cd5aa86 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -34,6 +34,7 @@ commonsCsv = "1.14.1" commonsCompress = "1.28.0" commonsIo = "2.21.0" commonsStatistics = "1.2" +tika = "3.3.0" serialization = "1.11.0" poi = "5.5.1" mariadb = "3.5.8" @@ -100,6 +101,7 @@ commonsCsv = { group = "org.apache.commons", name = "commons-csv", version.ref = commonsCompress = { group = "org.apache.commons", name = "commons-compress", version.ref 
= "commonsCompress" } commonsIo = { group = "commons-io", name = "commons-io", version.ref = "commonsIo" } commonsStatisticsDescriptive = { group = "org.apache.commons", name = "commons-statistics-descriptive", version.ref = "commonsStatistics" } +tika = { group = "org.apache.tika", name = "tika-core", version.ref = "tika" } # Serialization serialization-core = { group = "org.jetbrains.kotlinx", name = "kotlinx-serialization-core", version.ref = "serialization" } From 0033315fa696e88db691f951e535cfda2e791e46 Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Tue, 19 May 2026 14:58:48 +0200 Subject: [PATCH 13/14] api dump --- core/api/core.api | 47 ++++++++ dataframe-arrow/api/dataframe-arrow.api | 88 ++++++++++++++ dataframe-csv/api/dataframe-csv.api | 108 ++++++++++++++++++ dataframe-excel/api/dataframe-excel.api | 41 +++++++ dataframe-jdbc/api/dataframe-jdbc.api | 36 ++++++ dataframe-json/api/dataframe-json.api | 34 ++++++ .../api/dataframe-openapi-generator.api | 36 ++++++ 7 files changed, 390 insertions(+) diff --git a/core/api/core.api b/core/api/core.api index 790ce8fa29..6ff4138858 100644 --- a/core/api/core.api +++ b/core/api/core.api @@ -663,6 +663,7 @@ public final class org/jetbrains/kotlinx/dataframe/api/ChunkedKt { } public final class org/jetbrains/kotlinx/dataframe/api/CodeString { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/api/CodeString$Companion; public static final synthetic fun box-impl (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/api/CodeString; public static fun constructor-impl (Ljava/lang/String;)Ljava/lang/String; public fun equals (Ljava/lang/Object;)Z @@ -676,6 +677,9 @@ public final class org/jetbrains/kotlinx/dataframe/api/CodeString { public final synthetic fun unbox-impl ()Ljava/lang/String; } +public final class org/jetbrains/kotlinx/dataframe/api/CodeString$Companion { +} + public abstract interface class org/jetbrains/kotlinx/dataframe/api/ColColumnsSelectionDsl { public fun col 
(Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/columns/ColumnAccessor; public fun col (Ljava/lang/String;I)Lorg/jetbrains/kotlinx/dataframe/columns/SingleColumn; @@ -6028,6 +6032,37 @@ public final class org/jetbrains/kotlinx/dataframe/io/DataFrameHtmlData$Companio public static synthetic fun tableDefinitions$default (Lorg/jetbrains/kotlinx/dataframe/io/DataFrameHtmlData$Companion;ZZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/DataFrameHtmlData; } +public abstract interface class org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { +} + +public abstract interface class org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public abstract fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public abstract fun getSupportedTypes ()Ljava/util/Set; + public abstract fun getTestOrder ()I + public abstract fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public static synthetic fun readDataFrameOrNull$default (Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadSource;Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public static synthetic fun readDataFrameSchemaOrNull$default (Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadSource;Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun 
readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public static synthetic fun readDataSchemaCodeOrNull-myXLQ2E$default (Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadSource;Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;ILjava/lang/Object;)Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/DataSourceInfo { + public fun (Lkotlin/reflect/KType;Ljava/lang/String;Ljava/lang/String;)V + public synthetic fun (Lkotlin/reflect/KType;Ljava/lang/String;Ljava/lang/String;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Lkotlin/reflect/KType; + public final fun component2 ()Ljava/lang/String; + public final fun component3 ()Ljava/lang/String; + public final fun copy (Lkotlin/reflect/KType;Ljava/lang/String;Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lkotlin/reflect/KType;Ljava/lang/String;Ljava/lang/String;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo; + public fun equals (Ljava/lang/Object;)Z + public final fun getExtension ()Ljava/lang/String; + public final fun getKType ()Lkotlin/reflect/KType; + public final fun getMimeType ()Ljava/lang/String; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/DisplayConfiguration { public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/DisplayConfiguration$Companion; public synthetic fun (Ljava/lang/Integer;Ljava/lang/Integer;ILkotlin/jvm/functions/Function3;Ljava/lang/String;ZZZZZILkotlin/jvm/internal/DefaultConstructorMarker;)V @@ -6078,6 +6113,18 @@ public final class 
org/jetbrains/kotlinx/dataframe/io/DisplayConfiguration$Compa public final fun getDEFAULT ()Lorg/jetbrains/kotlinx/dataframe/io/DisplayConfiguration; } +public final class org/jetbrains/kotlinx/dataframe/io/Guess2Kt { + public static final fun getNewSupportedFormats ()Ljava/util/List; + public static final fun readSource (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public static final fun readSource (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataRow; + public static final fun readSource (Lorg/jetbrains/kotlinx/dataframe/api/CodeString$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;)Ljava/lang/String; + public static final fun readSource (Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public static synthetic fun readSource$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public static synthetic fun readSource$default (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataRow; + public static synthetic fun readSource$default 
(Lorg/jetbrains/kotlinx/dataframe/api/CodeString$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;ILjava/lang/Object;)Ljava/lang/String; + public static synthetic fun readSource$default (Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; +} + public final class org/jetbrains/kotlinx/dataframe/io/GuessKt { public static final fun read (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/File;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; public static final fun read (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/String;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; diff --git a/dataframe-arrow/api/dataframe-arrow.api b/dataframe-arrow/api/dataframe-arrow.api index c3f0a80c3b..4a098bd002 100644 --- a/dataframe-arrow/api/dataframe-arrow.api +++ b/dataframe-arrow/api/dataframe-arrow.api @@ -9,6 +9,64 @@ public final class org/jetbrains/kotlinx/dataframe/io/ArrowFeather : org/jetbrai public fun readDataFrame (Ljava/nio/file/Path;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; } +public final class org/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull 
(Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)V + public synthetic fun (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public final fun copy (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$Options; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$Options;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getNullability ()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/ArrowIPC : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion 
Lorg/jetbrains/kotlinx/dataframe/io/ArrowIPC$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/ArrowIPC$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/ArrowIPC$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)V + public synthetic fun (Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Lorg/apache/arrow/memory/RootAllocator; + public final fun component2 ()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public final fun copy (Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowIPC$Options; + public static synthetic fun copy$default 
(Lorg/jetbrains/kotlinx/dataframe/io/ArrowIPC$Options;Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowIPC$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getAllocator ()Lorg/apache/arrow/memory/RootAllocator; + public final fun getNullability ()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/ArrowReadingKt { public static final fun readArrow (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Lorg/apache/arrow/vector/ipc/ArrowReader;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; public static synthetic fun readArrow$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Lorg/apache/arrow/vector/ipc/ArrowReader;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; @@ -323,3 +381,33 @@ public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$Widenin public fun toString ()Ljava/lang/String; } +public final class org/jetbrains/kotlinx/dataframe/io/Parquet : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/Parquet$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull 
(Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/Parquet$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/Parquet$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;J)V + public synthetic fun (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;JILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public final fun component2 ()J + public final fun copy (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;J)Lorg/jetbrains/kotlinx/dataframe/io/Parquet$Options; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/Parquet$Options;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;JILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/Parquet$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getBatchSize ()J + public final fun getNullability ()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + diff --git a/dataframe-csv/api/dataframe-csv.api b/dataframe-csv/api/dataframe-csv.api index a9a964783c..d8c4ca4133 100644 --- a/dataframe-csv/api/dataframe-csv.api +++ b/dataframe-csv/api/dataframe-csv.api @@ -1,3 +1,57 @@ +public final class org/jetbrains/kotlinx/dataframe/io/Csv : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion 
Lorg/jetbrains/kotlinx/dataframe/io/Csv$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/Csv$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/Csv$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZ)V + public synthetic fun (CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()C + public final fun component10 ()Z + public final fun component11 ()C + public final fun component12 ()Z + public final fun component13 ()Z + public final fun component14 ()Z + public final fun component2 ()Ljava/util/List; + public final fun component3 ()Ljava/nio/charset/Charset; + public final fun component4 ()Ljava/util/Map; + public final fun component5 ()J + public final fun component6 ()Ljava/lang/Long; 
+ public final fun component7 ()Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public final fun component8 ()Z + public final fun component9 ()Z + public final fun copy (CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZ)Lorg/jetbrains/kotlinx/dataframe/io/Csv$Options; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/Csv$Options;CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/Csv$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getAllowMissingColumns ()Z + public final fun getCharset ()Ljava/nio/charset/Charset; + public final fun getColTypes ()Ljava/util/Map; + public final fun getDelimiter ()C + public final fun getHeader ()Ljava/util/List; + public final fun getIgnoreEmptyLines ()Z + public final fun getIgnoreExcessColumns ()Z + public final fun getIgnoreSurroundingSpaces ()Z + public final fun getParseParallel ()Z + public final fun getParserOptions ()Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public final fun getQuote ()C + public final fun getReadLines ()Ljava/lang/Long; + public final fun getSkipLines ()J + public final fun getTrimInsideQuoted ()Z + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/CsvDeephaven : org/jetbrains/kotlinx/dataframe/io/SupportedDataFrameFormat { public fun ()V public fun (C)V @@ -121,6 +175,60 @@ public final class org/jetbrains/kotlinx/dataframe/io/ToTsvStrKt { public static synthetic fun toTsvStr$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;ZCLjava/lang/Character;Lorg/jetbrains/kotlinx/dataframe/io/QuoteMode;Ljava/lang/Character;Ljava/lang/Character;Ljava/util/List;Ljava/lang/String;ILjava/lang/Object;)Ljava/lang/String; } +public final class 
org/jetbrains/kotlinx/dataframe/io/Tsv : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/Tsv$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/Tsv$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/Tsv$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZ)V + public synthetic fun (CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()C + public final fun component10 ()Z + public final fun component11 ()C + public final fun component12 ()Z + public final fun component13 ()Z + public final fun component14 ()Z + public final fun component2 ()Ljava/util/List; + public final fun component3 
()Ljava/nio/charset/Charset; + public final fun component4 ()Ljava/util/Map; + public final fun component5 ()J + public final fun component6 ()Ljava/lang/Long; + public final fun component7 ()Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public final fun component8 ()Z + public final fun component9 ()Z + public final fun copy (CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZ)Lorg/jetbrains/kotlinx/dataframe/io/Tsv$Options; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/Tsv$Options;CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/Tsv$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getAllowMissingColumns ()Z + public final fun getCharset ()Ljava/nio/charset/Charset; + public final fun getColTypes ()Ljava/util/Map; + public final fun getDelimiter ()C + public final fun getHeader ()Ljava/util/List; + public final fun getIgnoreEmptyLines ()Z + public final fun getIgnoreExcessColumns ()Z + public final fun getIgnoreSurroundingSpaces ()Z + public final fun getParseParallel ()Z + public final fun getParserOptions ()Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public final fun getQuote ()C + public final fun getReadLines ()Ljava/lang/Long; + public final fun getSkipLines ()J + public final fun getTrimInsideQuoted ()Z + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/TsvDeephaven : org/jetbrains/kotlinx/dataframe/io/SupportedDataFrameFormat { public fun ()V public fun (C)V diff --git a/dataframe-excel/api/dataframe-excel.api b/dataframe-excel/api/dataframe-excel.api index 06541cf3c9..0320ffc31d 100644 --- a/dataframe-excel/api/dataframe-excel.api +++ b/dataframe-excel/api/dataframe-excel.api @@ -9,6 +9,47 
@@ public final class org/jetbrains/kotlinx/dataframe/io/Excel : org/jetbrains/kotl public fun readDataFrame (Ljava/nio/file/Path;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; } +public final class org/jetbrains/kotlinx/dataframe/io/ExcelNEW : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/ExcelNEW$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/ExcelNEW$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/ExcelNEW$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public synthetic fun (Ljava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy;ZZILkotlin/jvm/internal/DefaultConstructorMarker;)V + public synthetic fun (Ljava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy;ZZLkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 
()Ljava/lang/String; + public final fun component2 ()I + public final fun component3 ()Ljava/lang/String; + public final fun component4-358K8uM ()Ljava/lang/String; + public final fun component5 ()Ljava/lang/Integer; + public final fun component6 ()Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy; + public final fun component7 ()Z + public final fun component8 ()Z + public final fun copy-vOPuZIo (Ljava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy;ZZ)Lorg/jetbrains/kotlinx/dataframe/io/ExcelNEW$Options; + public static synthetic fun copy-vOPuZIo$default (Lorg/jetbrains/kotlinx/dataframe/io/ExcelNEW$Options;Ljava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy;ZZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ExcelNEW$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getColumns ()Ljava/lang/String; + public final fun getFirstRowIsHeader ()Z + public final fun getNameRepairStrategy ()Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy; + public final fun getParseEmptyAsNull ()Z + public final fun getRowsCount ()Ljava/lang/Integer; + public final fun getSheetName ()Ljava/lang/String; + public final fun getSkipRows ()I + public final fun getStringColumns-358K8uM ()Ljava/lang/String; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/FormattingOptions { public fun (Ljava/lang/String;Lorg/apache/poi/ss/usermodel/DataFormatter;)V public synthetic fun (Ljava/lang/String;Lorg/apache/poi/ss/usermodel/DataFormatter;ILkotlin/jvm/internal/DefaultConstructorMarker;)V diff --git a/dataframe-jdbc/api/dataframe-jdbc.api b/dataframe-jdbc/api/dataframe-jdbc.api index 43b8f1cf82..4b3c3fc4e3 100644 --- a/dataframe-jdbc/api/dataframe-jdbc.api +++ b/dataframe-jdbc/api/dataframe-jdbc.api @@ -25,6 +25,42 @@ public final class 
org/jetbrains/kotlinx/dataframe/io/Jdbc : org/jetbrains/kotli public fun readDataFrame (Ljava/nio/file/Path;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; } +public final class org/jetbrains/kotlinx/dataframe/io/Jdbc2 : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/Jdbc2$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (Ljava/lang/String;Ljava/lang/Integer;ZLorg/jetbrains/kotlinx/dataframe/io/db/DbType;ZLkotlin/jvm/functions/Function1;Ljava/sql/Connection;)V + public synthetic fun (Ljava/lang/String;Ljava/lang/Integer;ZLorg/jetbrains/kotlinx/dataframe/io/db/DbType;ZLkotlin/jvm/functions/Function1;Ljava/sql/Connection;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Ljava/lang/String; + public final fun component2 ()Ljava/lang/Integer; + public final fun component3 ()Z + public final fun component4 ()Lorg/jetbrains/kotlinx/dataframe/io/db/DbType; + public final fun component5 ()Z + public final fun 
component6 ()Lkotlin/jvm/functions/Function1; + public final fun component7 ()Ljava/sql/Connection; + public final fun copy (Ljava/lang/String;Ljava/lang/Integer;ZLorg/jetbrains/kotlinx/dataframe/io/db/DbType;ZLkotlin/jvm/functions/Function1;Ljava/sql/Connection;)Lorg/jetbrains/kotlinx/dataframe/io/Jdbc2$Options; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/Jdbc2$Options;Ljava/lang/String;Ljava/lang/Integer;ZLorg/jetbrains/kotlinx/dataframe/io/db/DbType;ZLkotlin/jvm/functions/Function1;Ljava/sql/Connection;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/Jdbc2$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getConfigureStatement ()Lkotlin/jvm/functions/Function1; + public final fun getDbType ()Lorg/jetbrains/kotlinx/dataframe/io/db/DbType; + public final fun getInferNullability ()Z + public final fun getLimit ()Ljava/lang/Integer; + public final fun getResultSetConnection ()Ljava/sql/Connection; + public final fun getSqlQueryOrTableName ()Ljava/lang/String; + public final fun getStrictValidation ()Z + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/JdbcSchemaKt { public static final fun buildCodeForDB (Ljava/net/URL;Ljava/lang/String;)Ljava/lang/String; public static final fun getDatabaseCodeGenReader (Lorg/jetbrains/kotlinx/dataframe/codeGen/CodeGenerator$Companion;)Lkotlin/jvm/functions/Function2; diff --git a/dataframe-json/api/dataframe-json.api b/dataframe-json/api/dataframe-json.api index 368faf4a12..46885ef8e5 100644 --- a/dataframe-json/api/dataframe-json.api +++ b/dataframe-json/api/dataframe-json.api @@ -40,6 +40,40 @@ public final class org/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic : jav public static fun values ()[Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic; } +public final class org/jetbrains/kotlinx/dataframe/io/Json : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public 
static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/Json$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/Json$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/Json$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Ljava/util/List;Z)V + public synthetic fun (Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Ljava/util/List;ZILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Ljava/util/List; + public final fun component2 ()Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic; + public final fun component3 ()Ljava/util/List; + public final fun component4 ()Z + public final fun copy (Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Ljava/util/List;Z)Lorg/jetbrains/kotlinx/dataframe/io/Json$Options; + public static synthetic fun copy$default 
(Lorg/jetbrains/kotlinx/dataframe/io/Json$Options;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Ljava/util/List;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/Json$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getHeader ()Ljava/util/List; + public final fun getKeyValuePaths ()Ljava/util/List; + public final fun getTypeClashTactic ()Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic; + public final fun getUnifyNumbers ()Z + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/JsonKt { public static final fun readJson (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/File;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame; public static final fun readJson (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/InputStream;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame; diff --git a/dataframe-openapi-generator/api/dataframe-openapi-generator.api b/dataframe-openapi-generator/api/dataframe-openapi-generator.api index cc65f36f8e..76583b5641 100644 --- a/dataframe-openapi-generator/api/dataframe-openapi-generator.api +++ b/dataframe-openapi-generator/api/dataframe-openapi-generator.api @@ -19,6 +19,42 @@ public final class org/jetbrains/kotlinx/dataframe/io/OpenApi : org/jetbrains/ko public static synthetic fun readCodeForGeneration$default (Lorg/jetbrains/kotlinx/dataframe/io/OpenApi;Ljava/lang/String;Ljava/lang/String;ZZILjava/lang/Object;)Ljava/lang/String; } +public final class org/jetbrains/kotlinx/dataframe/io/OpenApi2 : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/OpenApi2$Companion; + public fun ()V + public fun acceptsSource 
(Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/OpenApi2$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/OpenApi2$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;)V + public synthetic fun (Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Ljava/util/List; + public final fun component2 ()Lio/swagger/v3/parser/core/models/ParseOptions; + public final fun component3 ()Z + public final fun component4 ()Z + public final fun component5 ()Lorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility; + public final fun copy (Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;)Lorg/jetbrains/kotlinx/dataframe/io/OpenApi2$Options; + public static synthetic fun copy$default 
(Lorg/jetbrains/kotlinx/dataframe/io/OpenApi2$Options;Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/OpenApi2$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getAuth ()Ljava/util/List; + public final fun getExtensionProperties ()Z + public final fun getGenerateHelperCompanionObject ()Z + public final fun getParseOptions ()Lio/swagger/v3/parser/core/models/ParseOptions; + public final fun getVisibility ()Lorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/ReadOpenapiKt { public static final fun readOpenApi (Ljava/lang/String;Ljava/lang/String;Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;)Ljava/lang/String; public static synthetic fun readOpenApi$default (Ljava/lang/String;Ljava/lang/String;Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;ILjava/lang/Object;)Ljava/lang/String; From aa2bd1b16299fa534f29789480f3bd19776f17ed Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Tue, 19 May 2026 21:28:48 +0200 Subject: [PATCH 14/14] tests for parsing json columns to check behavior still matches. Added parseToDataFrameReadSource parser option. 
--- core/api/core.api | 15 +- .../kotlinx/dataframe/api/ParserOptions.kt | 12 +- .../kotlinx/dataframe/impl/api/convert.kt | 80 +++---- .../kotlinx/dataframe/impl/api/parse.kt | 96 +++++--- .../jetbrains/kotlinx/dataframe/io/guess2.kt | 45 ++-- .../kotlinx/dataframe/io/ParserTests.kt | 4 +- .../kotlinx/dataframe/impl/io/readDelim.kt | 10 +- .../org/jetbrains/kotlinx/dataframe/io/csv.kt | 3 + .../org/jetbrains/kotlinx/dataframe/io/tsv.kt | 3 + .../kotlinx/dataframe/io/db/DuckDb.kt | 3 +- .../dataframe/io/ParseJsonColumnTests.kt | 215 ++++++++++++++++++ 11 files changed, 383 insertions(+), 103 deletions(-) create mode 100644 dataframe-json/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParseJsonColumnTests.kt diff --git a/core/api/core.api b/core/api/core.api index 6ff4138858..ffeec4d531 100644 --- a/core/api/core.api +++ b/core/api/core.api @@ -2475,6 +2475,7 @@ public abstract interface class org/jetbrains/kotlinx/dataframe/api/GlobalParser public abstract fun getNulls ()Ljava/util/Set; public abstract fun getParseExperimentalInstant ()Z public abstract fun getParseExperimentalUuid ()Z + public abstract fun getParseToDataFrameReadSource ()Z public abstract fun getSkipTypes ()Ljava/util/Set; public abstract fun getUseFastDoubleParser ()Z public abstract fun resetToDefault ()V @@ -2482,6 +2483,7 @@ public abstract interface class org/jetbrains/kotlinx/dataframe/api/GlobalParser public abstract fun setLocale (Ljava/util/Locale;)V public abstract fun setParseExperimentalInstant (Z)V public abstract fun setParseExperimentalUuid (Z)V + public abstract fun setParseToDataFrameReadSource (Z)V public abstract fun setUseFastDoubleParser (Z)V } @@ -3622,18 +3624,19 @@ public final class org/jetbrains/kotlinx/dataframe/api/ParserOptions { public synthetic fun (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ILkotlin/jvm/internal/DefaultConstructorMarker;)V public synthetic fun 
(Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;)V public synthetic fun (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;ILkotlin/jvm/internal/DefaultConstructorMarker;)V - public fun (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)V - public synthetic fun (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILkotlin/jvm/internal/DefaultConstructorMarker;)V - public fun (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)V - public synthetic fun (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILkotlin/jvm/internal/DefaultConstructorMarker;)V - public final fun copy (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/lang/Iterable;Ljava/lang/Iterable;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; - public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/lang/Iterable;Ljava/lang/Iterable;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public fun (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)V + public synthetic fun 
(Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public fun (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)V + public synthetic fun (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun copy (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/lang/Iterable;Ljava/lang/Iterable;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/lang/Iterable;Ljava/lang/Iterable;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; public fun equals (Ljava/lang/Object;)Z public final fun getDateTime ()Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions; public final fun getLocale ()Ljava/util/Locale; public final fun getNullStrings ()Ljava/util/Set; public final fun getParseExperimentalInstant ()Ljava/lang/Boolean; public final fun getParseExperimentalUuid ()Ljava/lang/Boolean; + public final fun getParseToDataFrameReadSource ()Ljava/lang/Boolean; public final fun getSkipTypes ()Ljava/util/Set; public final fun getUseFastDoubleParser ()Ljava/lang/Boolean; public fun hashCode ()I diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ParserOptions.kt 
b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ParserOptions.kt index b77de07bab..abad2f2631 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ParserOptions.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ParserOptions.kt @@ -291,6 +291,8 @@ public interface GlobalParserOptions { * @see [addJavaDateTimePattern] */ public var dateTimeLibrary: ParseDateTimeLibrary? + + public var parseToDataFrameReadSource: Boolean } /** @include [GlobalParserOptions] */ @@ -410,6 +412,7 @@ public class ParserOptions( public val useFastDoubleParser: Boolean? = null, public val parseExperimentalUuid: Boolean? = null, public val parseExperimentalInstant: Boolean? = null, + public val parseToDataFrameReadSource: Boolean? = null, ) { public fun copy( locale: Locale? = this.locale, @@ -419,6 +422,7 @@ public class ParserOptions( useFastDoubleParser: Boolean? = this.useFastDoubleParser, parseExperimentalUuid: Boolean? = this.parseExperimentalUuid, parseExperimentalInstant: Boolean? = this.parseExperimentalInstant, + parseToDataFrameReadSource: Boolean? 
= this.parseToDataFrameReadSource, ): ParserOptions = ParserOptions( locale = locale, @@ -428,6 +432,7 @@ public class ParserOptions( useFastDoubleParser = useFastDoubleParser, parseExperimentalUuid = parseExperimentalUuid, parseExperimentalInstant = parseExperimentalInstant, + parseToDataFrameReadSource = parseToDataFrameReadSource, ) override fun equals(other: Any?): Boolean { @@ -439,6 +444,7 @@ public class ParserOptions( if (useFastDoubleParser != other.useFastDoubleParser) return false if (parseExperimentalUuid != other.parseExperimentalUuid) return false if (parseExperimentalInstant != other.parseExperimentalInstant) return false + if (parseToDataFrameReadSource != other.parseToDataFrameReadSource) return false if (locale != other.locale) return false if (dateTime != other.dateTime) return false if (nullStrings != other.nullStrings) return false @@ -451,6 +457,8 @@ public class ParserOptions( var result = useFastDoubleParser?.hashCode() ?: 0 result = 31 * result + (parseExperimentalUuid?.hashCode() ?: 0) result = 31 * result + (parseExperimentalInstant?.hashCode() ?: 0) + result = 31 * result + (parseToDataFrameReadSource?.hashCode() ?: 0) + result = 31 * result + (parseExperimentalInstant?.hashCode() ?: 0) result = 31 * result + (locale?.hashCode() ?: 0) result = 31 * result + (dateTime?.hashCode() ?: 0) result = 31 * result + (nullStrings?.hashCode() ?: 0) @@ -459,7 +467,7 @@ public class ParserOptions( } override fun toString(): String = - "ParserOptions(locale=$locale, dateTimeParserOptions=$dateTime, nullStrings=$nullStrings, skipTypes=$skipTypes, useFastDoubleParser=$useFastDoubleParser, parseExperimentalUuid=$parseExperimentalUuid, parseExperimentalInstant=$parseExperimentalInstant)" + "ParserOptions(locale=$locale, dateTimeParserOptions=$dateTime, nullStrings=$nullStrings, skipTypes=$skipTypes, useFastDoubleParser=$useFastDoubleParser, parseExperimentalUuid=$parseExperimentalUuid, parseExperimentalInstant=$parseExperimentalInstant, 
parseToDataFrameReadSource=$parseToDataFrameReadSource)" // region deprecated constructors @@ -518,6 +526,7 @@ public class ParserOptions( useFastDoubleParser: Boolean? = null, parseExperimentalUuid: Boolean? = null, parseExperimentalInstant: Boolean? = null, + parseToDataFrameReadSource: Boolean? = null, ) : this( locale = locale, dateTime = 0.run { @@ -535,6 +544,7 @@ public class ParserOptions( useFastDoubleParser = useFastDoubleParser, parseExperimentalUuid = parseExperimentalUuid, parseExperimentalInstant = parseExperimentalInstant, + parseToDataFrameReadSource = parseToDataFrameReadSource, ) // endregion } diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt index 74a7e6564a..c734a6b101 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt @@ -196,10 +196,6 @@ internal inline fun convert(crossinline converter: (T) -> Any?): TypeConvert private enum class DummyEnum -private val dataFrameReadSourceSupportedClasses by lazy { - dataFrameReadSourceByType.keys.map { it.jvmErasure }.toSet() -} - @Suppress("UNCHECKED_CAST") internal fun createConverter(from: KType, to: KType, options: ParserOptions? = null): TypeConverter? { if (from.isMarkedNullable) { @@ -209,50 +205,46 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? = n val fromClass = from.jvmErasure val toClass = to.jvmErasure - // readSource-backed conversions handle target types with type arguments (e.g. `DataFrame<*>`, - // `DataRow<*>`), so they must run before the generic-arguments early-exit below. 
- if (dataFrameReadSourceByType.any { from.isSubtypeOf(it.key) }) { - val readSources = dataFrameReadSourceByType.entries - .first { from.isSubtypeOf(it.key) }.value - - when (toClass) { - DataFrame::class -> - return convert { source -> - DataFrame.readSource( - source = source, - type = from, - options = null, - formats = readSources, - ) - } - - DataRow::class -> - return convert { source -> - DataRow.readSource( - source = source, - type = from, - options = null, - formats = readSources, - ) - } - - DataFrameSchema::class -> - return convert { source -> - DataFrameSchema.readSource( - source = source, - type = from, - options = null, - formats = readSources, - ) - } - } + // early exit when we encounter types with generics (except DataFrame and DataRow), which we don't support + if (from.arguments.isNotEmpty() || + (to.arguments.isNotEmpty() && toClass !in setOf(DataFrame::class, DataRow::class)) + ) { + return null } - return when { - from.arguments.isNotEmpty() || to.arguments.isNotEmpty() -> null + val fromTypeInDfReadSources = + dataFrameReadSourceByType.keys.any { from.isSubtypeOf(it) } || from == typeOf() + return when { fromClass == toClass -> TypeConverterIdentity + fromTypeInDfReadSources && toClass == DataFrame::class -> + convert { source -> + DataFrame.readSource( + source = source, + type = from, + options = null, + ) + } + + fromTypeInDfReadSources && toClass == DataRow::class -> + convert { source -> + DataRow.readSource( + source = source, + type = from, + options = null, + ) + } + + fromTypeInDfReadSources && toClass == DataFrameSchema::class -> + convert { source -> + DataFrameSchema.readSource( + source = source, + type = from, + options = null, + ) + } + // kotlin.time.Duration is a value class, // so it must be handled before the generic toClass.isValue / fromClass.isValue branches. toClass == Duration::class -> when (fromClass) { @@ -277,7 +269,7 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? 
= n val underlyingType = constructor.parameters.single().type val converter = getConverter(from, underlyingType) ?: throw TypeConverterNotFoundException(from, underlyingType, null) - return convert { + convert { val converted = converter(it) if (converted == null && !underlyingType.isMarkedNullable) { throw TypeConversionException(it, from, underlyingType, null) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt index 2d654e6024..77cbdcce35 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt @@ -39,6 +39,7 @@ import org.jetbrains.kotlinx.dataframe.api.isFrameColumn import org.jetbrains.kotlinx.dataframe.api.isSubtypeOf import org.jetbrains.kotlinx.dataframe.api.map import org.jetbrains.kotlinx.dataframe.api.parser +import org.jetbrains.kotlinx.dataframe.api.singleOrNull import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion import org.jetbrains.kotlinx.dataframe.columns.size import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException @@ -50,7 +51,10 @@ import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser import org.jetbrains.kotlinx.dataframe.impl.javaDurationCanParse import org.jetbrains.kotlinx.dataframe.impl.lazyMapOf +import org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource import org.jetbrains.kotlinx.dataframe.io.isUrl +import org.jetbrains.kotlinx.dataframe.io.newSupportedFormats +import org.jetbrains.kotlinx.dataframe.io.readSourceImpl import org.jetbrains.kotlinx.dataframe.values import java.math.BigDecimal import java.math.BigInteger @@ -334,6 +338,8 @@ internal object Parsers : GlobalParserOptions { override var dateTimeLibrary: ParseDateTimeLibrary? 
= null + override var parseToDataFrameReadSource by Delegates.notNull() + override fun resetToDefault() { customGlobalJavaFormatters.values.forEach { it.clear() } nullStrings.clear() @@ -343,6 +349,8 @@ internal object Parsers : GlobalParserOptions { useFastDoubleParser = true parseExperimentalUuid = false parseExperimentalInstant = true + // disabled by default, because it can be very heavy + parseToDataFrameReadSource = false _locale = null dateTimeLibrary = null nullStrings.addAll(listOf("null", "NULL", "NA", "N/A")) @@ -941,40 +949,76 @@ internal object Parsers : GlobalParserOptions { stringParser { it.toBigIntegerOrNull() }, // BigDecimal stringParser { it.toBigDecimalOrNull() }, - // JSON array as DataFrame<*> - stringParser(catch = true) { - val trimmed = it.trim() - if (trimmed.startsWith("[") && trimmed.endsWith("]")) { - if (readJsonStrAnyFrame == null) { - logger.warn { - "parse() encountered a string that looks like a JSON array, but the dataframe-json dependency was not detected. Skipping for now." - } - null - } else { - readJsonStrAnyFrame!!(trimmed) + // Char + stringParser { it.singleOrNull() }, + stringParserWithOptions { options, isConverter -> + if (options?.parseToDataFrameReadSource ?: this.parseToDataFrameReadSource) { + parseBy { + readSourceImpl( + source = it, + sourceType = typeOf(), + options = null, + formats = newSupportedFormats, + resultKind = "DataRow", + doStringToUrlConversion = isConverter, + readOrNull = { source, sourceInfo, options -> + readDataFrameOrNull(source, sourceInfo, options)?.singleOrNull() + }, + ).getOrNull() } } else { - null + SKIP_PARSER } }, - // JSON object as DataRow<*> - stringParser(catch = true) { - val trimmed = it.trim() - if (trimmed.startsWith("{") && trimmed.endsWith("}")) { - if (readJsonStrAnyRow == null) { - logger.warn { - "parse() encountered a string that looks like a JSON object, but the dataframe-json dependency was not detected. Skipping for now." 
- } - null - } else { - readJsonStrAnyRow!!(trimmed) + stringParserWithOptions { options, isConverter -> + if (options?.parseToDataFrameReadSource ?: this.parseToDataFrameReadSource) { + parseBy { + readSourceImpl( + source = it, + sourceType = typeOf(), + options = null, + formats = newSupportedFormats, + resultKind = "DataFrame", + doStringToUrlConversion = isConverter, + readOrNull = DataFrameReadSource::readDataFrameOrNull, + ).getOrNull() } } else { - null + SKIP_PARSER } }, - // Char - stringParser { it.singleOrNull() }, +// // JSON array as DataFrame<*> +// stringParser(catch = true) { +// val trimmed = it.trim() +// if (trimmed.startsWith("[") && trimmed.endsWith("]")) { +// if (readJsonStrAnyFrame == null) { +// logger.warn { +// "parse() encountered a string that looks like a JSON array, but the dataframe-json dependency was not detected. Skipping for now." +// } +// null +// } else { +// readJsonStrAnyFrame!!(trimmed) +// } +// } else { +// null +// } +// }, +// // JSON object as DataRow<*> +// stringParser(catch = true) { +// val trimmed = it.trim() +// if (trimmed.startsWith("{") && trimmed.endsWith("}")) { +// if (readJsonStrAnyRow == null) { +// logger.warn { +// "parse() encountered a string that looks like a JSON object, but the dataframe-json dependency was not detected. Skipping for now." +// } +// null +// } else { +// readJsonStrAnyRow!!(trimmed) +// } +// } else { +// null +// } +// }, // No parser found, return as String // must be last in the list of parsers to return original unparsed string stringParser { it }, diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt index 858cee018a..7f92575aa0 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt @@ -88,13 +88,7 @@ public data class DataSourceInfo( public val kType: KType, public val extension: String? 
= null, public val mimeType: String? = null, -) { - init { - if (mimeType != null) { - println() - } - } -} +) /** * NOTE: Needs to have fully qualified name in @@ -116,13 +110,6 @@ internal val dataFrameReadSourceByType: Map> by getOrPut(type) { mutableListOf() }.let { if (format !in it) it += format } - - // special String -> URL case - if (type == typeOf()) { - getOrPut(typeOf()) { mutableListOf() }.let { - if (format !in it) it += format - } - } } } values.forEach { @@ -145,13 +132,14 @@ internal fun readSourceImpl( options: DataFrameReadOptions?, formats: List, resultKind: String, + doStringToUrlConversion: Boolean, readOrNull: DataFrameReadSource.( source: Any, sourceInfo: DataSourceInfo, options: DataFrameReadOptions?, ) -> T?, -): T { - if (source is String) { +): Result { + if (doStringToUrlConversion && source is String) { val url = asUrlOrNull(source) if (url != null) { return readSourceImpl( @@ -160,6 +148,7 @@ internal fun readSourceImpl( options = options, formats = formats, resultKind = resultKind, + doStringToUrlConversion = true, readOrNull = readOrNull, ) } @@ -189,14 +178,20 @@ internal fun readSourceImpl( if (!it.acceptsSource(sourceInfo, options)) return@forEach try { val result = it.readOrNull(getSource(), sourceInfo, options) - if (result != null) return result + if (result != null) return Result.success(result) + + val name = it::class.simpleName!! + tries[name] = Exception("$name returned null.") } catch (e: FileNotFoundException) { - throw e + // fail early. File not found means the reference is broken. + return Result.failure(exception = e) } catch (e: Exception) { tries[it::class.simpleName!!] 
= e } } - throw IllegalArgumentException("Unknown $resultKind source $source, $sourceInfo; Tried $tries") + return Result.failure( + exception = IllegalArgumentException("Unknown $resultKind source $source, $sourceInfo; Tried $tries"), + ) } /** @@ -223,8 +218,9 @@ public fun DataFrame.Companion.readSource( options = options, formats = formats, resultKind = "DataFrame", + doStringToUrlConversion = true, readOrNull = DataFrameReadSource::readDataFrameOrNull, - ) + ).getOrThrow() public inline fun DataRow.Companion.readSource( source: R, @@ -244,10 +240,11 @@ public fun DataRow.Companion.readSource( options = options, formats = formats, resultKind = "DataRow", + doStringToUrlConversion = true, readOrNull = { source, sourceInfo, options -> readDataFrameOrNull(source, sourceInfo, options)?.single() }, - ) + ).getOrThrow() public inline fun DataFrame.Companion.readSource( source: R, @@ -279,8 +276,9 @@ public fun DataFrameSchema.Companion.readSource( options = options, formats = formats, resultKind = "DataFrameSchema", + doStringToUrlConversion = true, readOrNull = DataFrameReadSource::readDataFrameSchemaOrNull, - ) + ).getOrThrow() public inline fun DataFrameSchema.Companion.readSource( source: R, @@ -318,10 +316,11 @@ public fun CodeString.Companion.readSource( options = options, formats = formats, resultKind = "CodeString", + doStringToUrlConversion = true, readOrNull = { src, info, opts -> readDataSchemaCodeOrNull(src, info, name, opts) }, - ) + ).getOrThrow() public inline fun CodeString.Companion.readSource( source: R, diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt index 5900956fcf..3ba90f39d1 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt @@ -464,7 +464,9 @@ class ParserTests { @Test fun `Mixing null and json`() { val col by 
columnOf("[\"str\"]", "[]", "null") - val parsed = col.parse() + val parsed = col.parse( + ParserOptions(parseToDataFrameReadSource = true), + ) parsed.type() shouldBe typeOf() parsed.kind() shouldBe ColumnKind.Frame require(parsed.isFrameColumn()) diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt index 34bf7aab0d..cd9230b610 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt @@ -231,8 +231,16 @@ private fun CsvReader.ResultColumn.toDataColumn( val givenSkipTypes = parserOptions?.skipTypes ?: DataFrame.parser.skipTypes // no need to check for types that Deephaven already parses, skip those too val adjustedSkipTypes = givenSkipTypes + typesDeephavenAlreadyParses + + val parseDfReadSource = parserOptions?.parseToDataFrameReadSource + ?: DataFrame.parser.parseToDataFrameReadSource.takeIf { it } // only take if adjusted to 'true' + ?: true + val adjustedParserOptions = (parserOptions ?: ParserOptions()) - .copy(skipTypes = adjustedSkipTypes) + .copy( + skipTypes = adjustedSkipTypes, + parseToDataFrameReadSource = parseDfReadSource, + ) column.tryParse(adjustedParserOptions) } diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt index 4e77e63d50..57cdb342da 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt @@ -134,6 +134,9 @@ public class Csv : DataFrameReadSource { kType.isSubTypeOf() -> (source as? 
String)?.let { text -> + // early fail + if (opts.delimiter !in text) return null + DataFrame.readCsvStr( text = text, delimiter = opts.delimiter, diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt index 96e026ab69..49f3723a5e 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt @@ -134,6 +134,9 @@ public class Tsv : DataFrameReadSource { kType.isSubTypeOf() -> (source as? String)?.let { text -> + // early fail + if (opts.delimiter !in text) return null + DataFrame.readTsvStr( text = text, delimiter = opts.delimiter, diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt index e454dec406..80b54229b3 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt @@ -43,6 +43,7 @@ import org.duckdb.JsonNode import org.jetbrains.kotlinx.dataframe.AnyRow import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.api.Infer +import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.api.asColumnGroup import org.jetbrains.kotlinx.dataframe.api.asDataColumn import org.jetbrains.kotlinx.dataframe.api.inferType @@ -158,7 +159,7 @@ public object DuckDb : AdvancedDbType("duckdb") { .withColumnBuilder(targetSchema = null) { name, values, inferNullability -> values .toColumn(name, if (inferNullability) Infer.Nulls else Infer.None) - .tryParse() + .tryParse(ParserOptions(parseToDataFrameReadSource = true)) .inferType() } diff --git a/dataframe-json/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParseJsonColumnTests.kt 
b/dataframe-json/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParseJsonColumnTests.kt new file mode 100644 index 0000000000..1a6f3430bc --- /dev/null +++ b/dataframe-json/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParseJsonColumnTests.kt @@ -0,0 +1,215 @@ +package org.jetbrains.kotlinx.dataframe.io + +import io.kotest.matchers.shouldBe +import org.intellij.lang.annotations.Language +import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.DataRow +import org.jetbrains.kotlinx.dataframe.api.asColumnGroup +import org.jetbrains.kotlinx.dataframe.api.asFrameColumn +import org.jetbrains.kotlinx.dataframe.api.columnOf +import org.jetbrains.kotlinx.dataframe.api.dataFrameOf +import org.jetbrains.kotlinx.dataframe.api.getColumnGroup +import org.jetbrains.kotlinx.dataframe.api.getFrameColumn +import org.jetbrains.kotlinx.dataframe.api.isColumnGroup +import org.jetbrains.kotlinx.dataframe.api.isFrameColumn +import org.jetbrains.kotlinx.dataframe.api.parse +import org.jetbrains.kotlinx.dataframe.api.parser +import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup +import org.jetbrains.kotlinx.dataframe.columns.FrameColumn +import org.junit.jupiter.api.AfterAll +import org.junit.jupiter.api.BeforeAll +import kotlin.reflect.typeOf +import kotlin.test.Test + +/** + * Tests that strings containing JSON in a [String] column can be parsed via [parse], + * where JSON arrays become [DataFrame]s (forming a [FrameColumn]) + * and JSON objects become [DataRow]s (forming a [ColumnGroup]). 
+ */ +class ParseJsonColumnTests { + + companion object { + @[BeforeAll JvmStatic] + fun `setup ParserOptions`() { + DataFrame.parser.parseToDataFrameReadSource = true + } + + @[AfterAll JvmStatic] + fun `reset ParserOptions`() { + DataFrame.parser.resetToDefault() + } + } + + @Test + fun `parse column of json arrays into FrameColumn`() { + @Language("json") + val a = """[1, 2, 3]""" + + @Language("json") + val b = """[4, 5, 6]""" + + val col = columnOf(a, b) + val parsed = col.parse() + + parsed.isFrameColumn() shouldBe true + val frameCol = parsed.asFrameColumn() + frameCol.size() shouldBe 2 + frameCol[0]["value"].values().toList() shouldBe listOf(1, 2, 3) + frameCol[1]["value"].values().toList() shouldBe listOf(4, 5, 6) + } + + @Test + fun `parse column of json objects into ColumnGroup`() { + @Language("json") + val a = """{"x": 1, "y": "a"}""" + + @Language("json") + val b = """{"x": 2, "y": "b"}""" + + val col = columnOf(a, b) + val parsed = col.parse() + + parsed.isColumnGroup() shouldBe true + val group = parsed.asColumnGroup() + group.columnsCount() shouldBe 2 + group["x"].type() shouldBe typeOf() + group["y"].type() shouldBe typeOf() + group["x"].values().toList() shouldBe listOf(1, 2) + group["y"].values().toList() shouldBe listOf("a", "b") + } + + @Test + fun `parse dataframe column of json arrays into FrameColumn`() { + @Language("json") + val a = """[10, 20]""" + + @Language("json") + val b = """[30, 40, 50]""" + + val df = dataFrameOf("data")(a, b) + val parsed = df.parse("data") + + parsed.rowsCount() shouldBe 2 + parsed["data"].isFrameColumn() shouldBe true + val frameCol = parsed.getFrameColumn("data") + frameCol[0]["value"].values().toList() shouldBe listOf(10, 20) + frameCol[1]["value"].values().toList() shouldBe listOf(30, 40, 50) + } + + @Test + fun `parse dataframe column of json objects into ColumnGroup`() { + @Language("json") + val a = """{"name": "Alice", "age": 30}""" + + @Language("json") + val b = """{"name": "Bob", "age": 25}""" + + 
val df = dataFrameOf("person")(a, b) + val parsed = df.parse("person") + + parsed.rowsCount() shouldBe 2 + parsed["person"].isColumnGroup() shouldBe true + val group = parsed.getColumnGroup("person") + group.columnsCount() shouldBe 2 + group["name"].values().toList() shouldBe listOf("Alice", "Bob") + group["age"].values().toList() shouldBe listOf(30, 25) + group["name"].type() shouldBe typeOf() + group["age"].type() shouldBe typeOf() + } + + @Test + fun `parse column of json arrays of objects`() { + @Language("json") + val a = """[{"k": 1}, {"k": 2}]""" + + @Language("json") + val b = """[{"k": 3}, {"k": 4}, {"k": 5}]""" + + val parsed = columnOf(a, b).parse() + + parsed.isFrameColumn() shouldBe true + val frameCol = parsed.asFrameColumn() + frameCol.size() shouldBe 2 + frameCol[0]["k"].values().toList() shouldBe listOf(1, 2) + frameCol[1]["k"].values().toList() shouldBe listOf(3, 4, 5) + } + + @Test + fun `parse column of nested json objects`() { + @Language("json") + val a = """{"outer": {"inner": 1}}""" + + @Language("json") + val b = """{"outer": {"inner": 2}}""" + + val parsed = columnOf(a, b).parse() + + parsed.isColumnGroup() shouldBe true + val outer = parsed.asColumnGroup().getColumnGroup("outer") + outer["inner"].type() shouldBe typeOf() + outer["inner"].values().toList() shouldBe listOf(1, 2) + } + + @Test + fun `parse column of json objects containing arrays`() { + @Language("json") + val a = """{"name": "list1", "values": [1, 2, 3]}""" + + @Language("json") + val b = """{"name": "list2", "values": [4, 5]}""" + + val parsed = columnOf(a, b).parse() + + parsed.isColumnGroup() shouldBe true + val group = parsed.asColumnGroup() + group["name"].values().toList() shouldBe listOf("list1", "list2") + group["values"].type() shouldBe typeOf>() + group["values"].values().toList() shouldBe listOf(listOf(1, 2, 3), listOf(4, 5)) + } + + @Test + fun `parse column of json arrays with whitespace`() { + val col = columnOf(" [1, 2, 3] ", "\n[4, 5]\t") + val parsed = 
col.parse() + + parsed.isFrameColumn() shouldBe true + val frameCol = parsed.asFrameColumn() + frameCol.size() shouldBe 2 + frameCol[0]["value"].values().toList() shouldBe listOf(1, 2, 3) + frameCol[1]["value"].values().toList() shouldBe listOf(4, 5) + } + + @Test + fun `parse dataframe with multiple json columns`() { + @Language("json") + val obj1 = """{"a": 1}""" + + @Language("json") + val obj2 = """{"a": 2}""" + + @Language("json") + val arr1 = """[1, 2]""" + + @Language("json") + val arr2 = """[3, 4]""" + + val df = dataFrameOf("obj", "arr")( + obj1, + arr1, + obj2, + arr2, + ) + val parsed = df.parse() + + parsed.rowsCount() shouldBe 2 + parsed["obj"].isColumnGroup() shouldBe true + parsed["arr"].isFrameColumn() shouldBe true + + val objGroup = parsed.getColumnGroup("obj") + objGroup["a"].values().toList() shouldBe listOf(1, 2) + + val arrFrame = parsed.getFrameColumn("arr") + arrFrame[0]["value"].values().toList() shouldBe listOf(1, 2) + arrFrame[1]["value"].values().toList() shouldBe listOf(3, 4) + } +}