diff --git a/core/api/core.api b/core/api/core.api index 790ce8fa29..ffeec4d531 100644 --- a/core/api/core.api +++ b/core/api/core.api @@ -663,6 +663,7 @@ public final class org/jetbrains/kotlinx/dataframe/api/ChunkedKt { } public final class org/jetbrains/kotlinx/dataframe/api/CodeString { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/api/CodeString$Companion; public static final synthetic fun box-impl (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/api/CodeString; public static fun constructor-impl (Ljava/lang/String;)Ljava/lang/String; public fun equals (Ljava/lang/Object;)Z @@ -676,6 +677,9 @@ public final class org/jetbrains/kotlinx/dataframe/api/CodeString { public final synthetic fun unbox-impl ()Ljava/lang/String; } +public final class org/jetbrains/kotlinx/dataframe/api/CodeString$Companion { +} + public abstract interface class org/jetbrains/kotlinx/dataframe/api/ColColumnsSelectionDsl { public fun col (Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/columns/ColumnAccessor; public fun col (Ljava/lang/String;I)Lorg/jetbrains/kotlinx/dataframe/columns/SingleColumn; @@ -2471,6 +2475,7 @@ public abstract interface class org/jetbrains/kotlinx/dataframe/api/GlobalParser public abstract fun getNulls ()Ljava/util/Set; public abstract fun getParseExperimentalInstant ()Z public abstract fun getParseExperimentalUuid ()Z + public abstract fun getParseToDataFrameReadSource ()Z public abstract fun getSkipTypes ()Ljava/util/Set; public abstract fun getUseFastDoubleParser ()Z public abstract fun resetToDefault ()V @@ -2478,6 +2483,7 @@ public abstract interface class org/jetbrains/kotlinx/dataframe/api/GlobalParser public abstract fun setLocale (Ljava/util/Locale;)V public abstract fun setParseExperimentalInstant (Z)V public abstract fun setParseExperimentalUuid (Z)V + public abstract fun setParseToDataFrameReadSource (Z)V public abstract fun setUseFastDoubleParser (Z)V } @@ -3618,18 +3624,19 @@ public final class 
org/jetbrains/kotlinx/dataframe/api/ParserOptions { public synthetic fun (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;ILkotlin/jvm/internal/DefaultConstructorMarker;)V public synthetic fun (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;)V public synthetic fun (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;ILkotlin/jvm/internal/DefaultConstructorMarker;)V - public fun (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)V - public synthetic fun (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILkotlin/jvm/internal/DefaultConstructorMarker;)V - public fun (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)V - public synthetic fun (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILkotlin/jvm/internal/DefaultConstructorMarker;)V - public final fun copy (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/lang/Iterable;Ljava/lang/Iterable;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; - public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/lang/Iterable;Ljava/lang/Iterable;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public fun 
(Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)V + public synthetic fun (Ljava/util/Locale;Ljava/time/format/DateTimeFormatter;Ljava/lang/String;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public fun (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)V + public synthetic fun (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/util/Set;Ljava/util/Set;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun copy (Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/lang/Iterable;Ljava/lang/Iterable;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;Ljava/util/Locale;Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions;Ljava/lang/Iterable;Ljava/lang/Iterable;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;Ljava/lang/Boolean;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; public fun equals (Ljava/lang/Object;)Z public final fun getDateTime ()Lorg/jetbrains/kotlinx/dataframe/api/DateTimeParserOptions; public final fun getLocale ()Ljava/util/Locale; public final fun getNullStrings ()Ljava/util/Set; public final fun getParseExperimentalInstant ()Ljava/lang/Boolean; public final fun getParseExperimentalUuid ()Ljava/lang/Boolean; + public final fun getParseToDataFrameReadSource ()Ljava/lang/Boolean; public final fun getSkipTypes 
()Ljava/util/Set; public final fun getUseFastDoubleParser ()Ljava/lang/Boolean; public fun hashCode ()I @@ -6028,6 +6035,37 @@ public final class org/jetbrains/kotlinx/dataframe/io/DataFrameHtmlData$Companio public static synthetic fun tableDefinitions$default (Lorg/jetbrains/kotlinx/dataframe/io/DataFrameHtmlData$Companion;ZZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/DataFrameHtmlData; } +public abstract interface class org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { +} + +public abstract interface class org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public abstract fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public abstract fun getSupportedTypes ()Ljava/util/Set; + public abstract fun getTestOrder ()I + public abstract fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public static synthetic fun readDataFrameOrNull$default (Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadSource;Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public static synthetic fun readDataFrameSchemaOrNull$default (Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadSource;Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E 
(Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public static synthetic fun readDataSchemaCodeOrNull-myXLQ2E$default (Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadSource;Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;ILjava/lang/Object;)Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/DataSourceInfo { + public fun (Lkotlin/reflect/KType;Ljava/lang/String;Ljava/lang/String;)V + public synthetic fun (Lkotlin/reflect/KType;Ljava/lang/String;Ljava/lang/String;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Lkotlin/reflect/KType; + public final fun component2 ()Ljava/lang/String; + public final fun component3 ()Ljava/lang/String; + public final fun copy (Lkotlin/reflect/KType;Ljava/lang/String;Ljava/lang/String;)Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lkotlin/reflect/KType;Ljava/lang/String;Ljava/lang/String;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo; + public fun equals (Ljava/lang/Object;)Z + public final fun getExtension ()Ljava/lang/String; + public final fun getKType ()Lkotlin/reflect/KType; + public final fun getMimeType ()Ljava/lang/String; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/DisplayConfiguration { public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/DisplayConfiguration$Companion; public synthetic fun (Ljava/lang/Integer;Ljava/lang/Integer;ILkotlin/jvm/functions/Function3;Ljava/lang/String;ZZZZZILkotlin/jvm/internal/DefaultConstructorMarker;)V @@ -6078,6 +6116,18 @@ public final class 
org/jetbrains/kotlinx/dataframe/io/DisplayConfiguration$Compa public final fun getDEFAULT ()Lorg/jetbrains/kotlinx/dataframe/io/DisplayConfiguration; } +public final class org/jetbrains/kotlinx/dataframe/io/Guess2Kt { + public static final fun getNewSupportedFormats ()Ljava/util/List; + public static final fun readSource (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public static final fun readSource (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataRow; + public static final fun readSource (Lorg/jetbrains/kotlinx/dataframe/api/CodeString$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;)Ljava/lang/String; + public static final fun readSource (Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public static synthetic fun readSource$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public static synthetic fun readSource$default (Lorg/jetbrains/kotlinx/dataframe/DataRow$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataRow; + public static synthetic fun readSource$default 
(Lorg/jetbrains/kotlinx/dataframe/api/CodeString$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;ILjava/lang/Object;)Ljava/lang/String; + public static synthetic fun readSource$default (Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema$Companion;Ljava/lang/Object;Lkotlin/reflect/KType;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;Ljava/util/List;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; +} + public final class org/jetbrains/kotlinx/dataframe/io/GuessKt { public static final fun read (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/File;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; public static final fun read (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/lang/String;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; diff --git a/core/build.gradle.kts b/core/build.gradle.kts index 60e39459a5..07bdd2f173 100644 --- a/core/build.gradle.kts +++ b/core/build.gradle.kts @@ -59,6 +59,7 @@ dependencies { implementation(libs.commonsIo) implementation(libs.fastDoubleParser) + implementation(libs.tika) api(libs.kotlin.datetimeJvm) implementation(libs.kotlinpoet) @@ -76,7 +77,6 @@ dependencies { testImplementation(libs.kotlin.scriptingJvm) testImplementation(libs.jsoup) testImplementation(libs.sl4jsimple) - testImplementation(projects.dataframeJson) testImplementation(libs.serialization.core) testImplementation(libs.serialization.json) @@ -85,6 +85,13 @@ dependencies { // for samples.api testImplementation(projects.dataframeCsv) + testImplementation(projects.dataframeJson) + testImplementation(projects.dataframeArrow) + testImplementation(projects.dataframeExcel) +// testImplementation(projects.dataframeGeo) + testImplementation(projects.dataframeJdbc) + testImplementation(libs.h2db) + testImplementation(projects.dataframeOpenapiGenerator) } // Configure testJava16 dependencies to 
extend from test
@@ -308,6 +315,8 @@ tasks.withType {
 
 tasks.test {
     maxHeapSize = "1g"
+    // Arrow's off-heap allocator needs deep reflection into java.nio.
+    jvmArgs("--add-opens", "java.base/java.nio=ALL-UNNAMED")
 }
 
 // Test task for Java 16+ language-specific tests
diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ParserOptions.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ParserOptions.kt
index b77de07bab..abad2f2631 100644
--- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ParserOptions.kt
+++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/ParserOptions.kt
@@ -291,6 +291,9 @@ public interface GlobalParserOptions {
      * @see [addJavaDateTimePattern]
      */
     public var dateTimeLibrary: ParseDateTimeLibrary?
+
+    /** Whether string parsing also tries the registered `DataFrameReadSource` formats; `false` after [resetToDefault]. */
+    public var parseToDataFrameReadSource: Boolean
 }
 
 /** @include [GlobalParserOptions] */
@@ -410,6 +413,7 @@ public class ParserOptions(
     public val useFastDoubleParser: Boolean? = null,
     public val parseExperimentalUuid: Boolean? = null,
     public val parseExperimentalInstant: Boolean? = null,
+    public val parseToDataFrameReadSource: Boolean? = null,
 ) {
     public fun copy(
         locale: Locale? = this.locale,
@@ -419,6 +423,7 @@ public class ParserOptions(
         useFastDoubleParser: Boolean? = this.useFastDoubleParser,
         parseExperimentalUuid: Boolean? = this.parseExperimentalUuid,
         parseExperimentalInstant: Boolean? = this.parseExperimentalInstant,
+        parseToDataFrameReadSource: Boolean? = this.parseToDataFrameReadSource,
     ): ParserOptions =
         ParserOptions(
             locale = locale,
@@ -428,6 +433,7 @@ public class ParserOptions(
             useFastDoubleParser = useFastDoubleParser,
             parseExperimentalUuid = parseExperimentalUuid,
             parseExperimentalInstant = parseExperimentalInstant,
+            parseToDataFrameReadSource = parseToDataFrameReadSource,
         )
 
     override fun equals(other: Any?): Boolean {
@@ -439,6 +445,7 @@ public class ParserOptions(
         if (useFastDoubleParser != other.useFastDoubleParser) return false
         if (parseExperimentalUuid != other.parseExperimentalUuid) return false
         if (parseExperimentalInstant != other.parseExperimentalInstant) return false
+        if (parseToDataFrameReadSource != other.parseToDataFrameReadSource) return false
         if (locale != other.locale) return false
         if (dateTime != other.dateTime) return false
         if (nullStrings != other.nullStrings) return false
@@ -451,6 +458,7 @@ public class ParserOptions(
         var result = useFastDoubleParser?.hashCode() ?: 0
         result = 31 * result + (parseExperimentalUuid?.hashCode() ?: 0)
         result = 31 * result + (parseExperimentalInstant?.hashCode() ?: 0)
+        result = 31 * result + (parseToDataFrameReadSource?.hashCode() ?: 0)
         result = 31 * result + (locale?.hashCode() ?: 0)
         result = 31 * result + (dateTime?.hashCode() ?: 0)
         result = 31 * result + (nullStrings?.hashCode() ?: 0)
@@ -459,7 +467,7 @@ public class ParserOptions(
     }
 
     override fun toString(): String =
-        "ParserOptions(locale=$locale, dateTimeParserOptions=$dateTime, nullStrings=$nullStrings, skipTypes=$skipTypes, useFastDoubleParser=$useFastDoubleParser, parseExperimentalUuid=$parseExperimentalUuid, parseExperimentalInstant=$parseExperimentalInstant)"
+        "ParserOptions(locale=$locale, dateTimeParserOptions=$dateTime, nullStrings=$nullStrings, skipTypes=$skipTypes, useFastDoubleParser=$useFastDoubleParser, parseExperimentalUuid=$parseExperimentalUuid, parseExperimentalInstant=$parseExperimentalInstant,
parseToDataFrameReadSource=$parseToDataFrameReadSource)" // region deprecated constructors @@ -518,6 +526,7 @@ public class ParserOptions( useFastDoubleParser: Boolean? = null, parseExperimentalUuid: Boolean? = null, parseExperimentalInstant: Boolean? = null, + parseToDataFrameReadSource: Boolean? = null, ) : this( locale = locale, dateTime = 0.run { @@ -535,6 +544,7 @@ public class ParserOptions( useFastDoubleParser = useFastDoubleParser, parseExperimentalUuid = parseExperimentalUuid, parseExperimentalInstant = parseExperimentalInstant, + parseToDataFrameReadSource = parseToDataFrameReadSource, ) // endregion } diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/generateCode.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/generateCode.kt index 5137cf200c..4ad31062af 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/generateCode.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/generateCode.kt @@ -297,6 +297,8 @@ public val NameNormalizer.Companion.default: NameNormalizer get() = NameNormaliz @RequiredByIntellijPlugin public value class CodeString(public val value: String) { override fun toString(): String = value + + public companion object } @PublishedApi diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt index fbaafd0370..c734a6b101 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt @@ -22,6 +22,7 @@ import kotlinx.datetime.toStdlibInstant import org.jetbrains.kotlinx.dataframe.AnyCol import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.DataRow import org.jetbrains.kotlinx.dataframe.RowColumnExpression import org.jetbrains.kotlinx.dataframe.RowValueExpression import 
org.jetbrains.kotlinx.dataframe.api.Convert
@@ -30,7 +31,6 @@ import org.jetbrains.kotlinx.dataframe.api.Infer
 import org.jetbrains.kotlinx.dataframe.api.ParserOptions
 import org.jetbrains.kotlinx.dataframe.api.asColumn
 import org.jetbrains.kotlinx.dataframe.api.isValueColumn
-import org.jetbrains.kotlinx.dataframe.api.mapIndexed
 import org.jetbrains.kotlinx.dataframe.api.name
 import org.jetbrains.kotlinx.dataframe.columns.values
 import org.jetbrains.kotlinx.dataframe.dataTypes.IFRAME
@@ -41,13 +41,14 @@ import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException
 import org.jetbrains.kotlinx.dataframe.exceptions.TypeConverterNotFoundException
 import org.jetbrains.kotlinx.dataframe.impl.columns.newColumn
 import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType
-import org.jetbrains.kotlinx.dataframe.impl.isSubtypeWithNullabilityOf
+import org.jetbrains.kotlinx.dataframe.io.dataFrameReadSourceByType
+import org.jetbrains.kotlinx.dataframe.io.readSource
 import org.jetbrains.kotlinx.dataframe.path
+import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema
 import org.jetbrains.kotlinx.dataframe.type
 import java.math.BigDecimal
 import java.math.BigInteger
 import java.net.URL
-import java.util.Locale
 import kotlin.math.roundToInt
 import kotlin.math.roundToLong
 import kotlin.reflect.KType
@@ -197,16 +198,57 @@ private enum class DummyEnum
 
 @Suppress("UNCHECKED_CAST")
 internal fun createConverter(from: KType, to: KType, options: ParserOptions? = null): TypeConverter? {
-    if (from.arguments.isNotEmpty() || to.arguments.isNotEmpty()) return null
     if (from.isMarkedNullable) {
         val res = createConverter(from.withNullability(false), to, options) ?: return null
         return { res(it) }
     }
     val fromClass = from.jvmErasure
     val toClass = to.jvmErasure
+
+    // early exit when we encounter types with generics (except DataFrame and DataRow), which we don't support
+    if (from.arguments.isNotEmpty() ||
+        (to.arguments.isNotEmpty() && toClass !in setOf(DataFrame::class, DataRow::class))
+    ) {
+        return null
+    }
+
+    // Evaluated lazily: touching `dataFrameReadSourceByType` loads every registered format through the
+    // ServiceLoader, which is wasted work unless the target is a DataFrame, DataRow or DataFrameSchema.
+    val fromTypeInDfReadSources by lazy(LazyThreadSafetyMode.NONE) {
+        dataFrameReadSourceByType.keys.any { from.isSubtypeOf(it) } || from == typeOf()
+    }
+
     return when {
         fromClass == toClass -> TypeConverterIdentity
 
+        // the cheap target-class check goes first so the lazy source lookup only runs when it can matter
+        toClass == DataFrame::class && fromTypeInDfReadSources ->
+            convert { source ->
+                DataFrame.readSource(
+                    source = source,
+                    type = from,
+                    options = null,
+                )
+            }
+
+        toClass == DataRow::class && fromTypeInDfReadSources ->
+            convert { source ->
+                DataRow.readSource(
+                    source = source,
+                    type = from,
+                    options = null,
+                )
+            }
+
+        toClass == DataFrameSchema::class && fromTypeInDfReadSources ->
+            convert { source ->
+                DataFrameSchema.readSource(
+                    source = source,
+                    type = from,
+                    options = null,
+                )
+            }
+
         // kotlin.time.Duration is a value class,
         // so it must be handled before the generic toClass.isValue / fromClass.isValue branches.
         toClass == Duration::class -> when (fromClass) {
@@ -231,7 +273,7 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions?
= n val underlyingType = constructor.parameters.single().type val converter = getConverter(from, underlyingType) ?: throw TypeConverterNotFoundException(from, underlyingType, null) - return convert { + convert { val converted = converter(it) if (converted == null && !underlyingType.isMarkedNullable) { throw TypeConversionException(it, from, underlyingType, null) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt index 2d654e6024..77cbdcce35 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt @@ -39,6 +39,7 @@ import org.jetbrains.kotlinx.dataframe.api.isFrameColumn import org.jetbrains.kotlinx.dataframe.api.isSubtypeOf import org.jetbrains.kotlinx.dataframe.api.map import org.jetbrains.kotlinx.dataframe.api.parser +import org.jetbrains.kotlinx.dataframe.api.singleOrNull import org.jetbrains.kotlinx.dataframe.columns.TypeSuggestion import org.jetbrains.kotlinx.dataframe.columns.size import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException @@ -50,7 +51,10 @@ import org.jetbrains.kotlinx.dataframe.impl.createStarProjectedType import org.jetbrains.kotlinx.dataframe.impl.io.FastDoubleParser import org.jetbrains.kotlinx.dataframe.impl.javaDurationCanParse import org.jetbrains.kotlinx.dataframe.impl.lazyMapOf +import org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource import org.jetbrains.kotlinx.dataframe.io.isUrl +import org.jetbrains.kotlinx.dataframe.io.newSupportedFormats +import org.jetbrains.kotlinx.dataframe.io.readSourceImpl import org.jetbrains.kotlinx.dataframe.values import java.math.BigDecimal import java.math.BigInteger @@ -334,6 +338,8 @@ internal object Parsers : GlobalParserOptions { override var dateTimeLibrary: ParseDateTimeLibrary? 
= null + override var parseToDataFrameReadSource by Delegates.notNull() + override fun resetToDefault() { customGlobalJavaFormatters.values.forEach { it.clear() } nullStrings.clear() @@ -343,6 +349,8 @@ internal object Parsers : GlobalParserOptions { useFastDoubleParser = true parseExperimentalUuid = false parseExperimentalInstant = true + // disabled by default, because it can be very heavy + parseToDataFrameReadSource = false _locale = null dateTimeLibrary = null nullStrings.addAll(listOf("null", "NULL", "NA", "N/A")) @@ -941,40 +949,76 @@ internal object Parsers : GlobalParserOptions { stringParser { it.toBigIntegerOrNull() }, // BigDecimal stringParser { it.toBigDecimalOrNull() }, - // JSON array as DataFrame<*> - stringParser(catch = true) { - val trimmed = it.trim() - if (trimmed.startsWith("[") && trimmed.endsWith("]")) { - if (readJsonStrAnyFrame == null) { - logger.warn { - "parse() encountered a string that looks like a JSON array, but the dataframe-json dependency was not detected. Skipping for now." - } - null - } else { - readJsonStrAnyFrame!!(trimmed) + // Char + stringParser { it.singleOrNull() }, + stringParserWithOptions { options, isConverter -> + if (options?.parseToDataFrameReadSource ?: this.parseToDataFrameReadSource) { + parseBy { + readSourceImpl( + source = it, + sourceType = typeOf(), + options = null, + formats = newSupportedFormats, + resultKind = "DataRow", + doStringToUrlConversion = isConverter, + readOrNull = { source, sourceInfo, options -> + readDataFrameOrNull(source, sourceInfo, options)?.singleOrNull() + }, + ).getOrNull() } } else { - null + SKIP_PARSER } }, - // JSON object as DataRow<*> - stringParser(catch = true) { - val trimmed = it.trim() - if (trimmed.startsWith("{") && trimmed.endsWith("}")) { - if (readJsonStrAnyRow == null) { - logger.warn { - "parse() encountered a string that looks like a JSON object, but the dataframe-json dependency was not detected. Skipping for now." 
- } - null - } else { - readJsonStrAnyRow!!(trimmed) + stringParserWithOptions { options, isConverter -> + if (options?.parseToDataFrameReadSource ?: this.parseToDataFrameReadSource) { + parseBy { + readSourceImpl( + source = it, + sourceType = typeOf(), + options = null, + formats = newSupportedFormats, + resultKind = "DataFrame", + doStringToUrlConversion = isConverter, + readOrNull = DataFrameReadSource::readDataFrameOrNull, + ).getOrNull() } } else { - null + SKIP_PARSER } }, - // Char - stringParser { it.singleOrNull() }, +// // JSON array as DataFrame<*> +// stringParser(catch = true) { +// val trimmed = it.trim() +// if (trimmed.startsWith("[") && trimmed.endsWith("]")) { +// if (readJsonStrAnyFrame == null) { +// logger.warn { +// "parse() encountered a string that looks like a JSON array, but the dataframe-json dependency was not detected. Skipping for now." +// } +// null +// } else { +// readJsonStrAnyFrame!!(trimmed) +// } +// } else { +// null +// } +// }, +// // JSON object as DataRow<*> +// stringParser(catch = true) { +// val trimmed = it.trim() +// if (trimmed.startsWith("{") && trimmed.endsWith("}")) { +// if (readJsonStrAnyRow == null) { +// logger.warn { +// "parse() encountered a string that looks like a JSON object, but the dataframe-json dependency was not detected. Skipping for now." 
+// } +// null +// } else { +// readJsonStrAnyRow!!(trimmed) +// } +// } else { +// null +// } +// }, // No parser found, return as String // must be last in the list of parsers to return original unparsed string stringParser { it }, diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt new file mode 100644 index 0000000000..7f92575aa0 --- /dev/null +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/guess2.kt @@ -0,0 +1,423 @@ +package org.jetbrains.kotlinx.dataframe.io + +import org.apache.tika.detect.DefaultDetector +import org.apache.tika.io.TikaInputStream +import org.apache.tika.metadata.Metadata +import org.apache.tika.metadata.TikaCoreProperties +import org.apache.tika.mime.MediaType +import org.jetbrains.kotlinx.dataframe.AnyFrame +import org.jetbrains.kotlinx.dataframe.AnyRow +import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.DataRow +import org.jetbrains.kotlinx.dataframe.api.CodeString +import org.jetbrains.kotlinx.dataframe.api.generateInterfaces +import org.jetbrains.kotlinx.dataframe.api.schema +import org.jetbrains.kotlinx.dataframe.api.single +import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema +import java.io.ByteArrayInputStream +import java.io.File +import java.io.FileNotFoundException +import java.io.IOException +import java.io.InputStream +import java.net.URI +import java.net.URL +import java.nio.file.Path +import java.util.ServiceLoader +import kotlin.io.path.extension +import kotlin.io.path.name +import kotlin.reflect.KType +import kotlin.reflect.full.withNullability +import kotlin.reflect.typeOf + +public interface DataFrameReadOptions + +public interface DataFrameReadSource { + /** + * The set of source [KType]s this format knows how to read. 
The framework uses this in the default + * [acceptsSource] implementation, and overriding `acceptsSource` implementations should still consult it + * so that adding a new supported type only requires updating this set. + * + * Note: a `String` *reference* (path/URL) is normalized to a [URL] by `readSourceImpl` before any format + * is invoked, so only include `String` here when raw text content is a legitimate input (e.g., JSON/CSV + * text). For binary formats, leave `String` out. + */ + public val supportedTypes: Set + + public fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions? = null, + ): DataFrame<*>? + + /** + * Read just the [DataFrameSchema] for [source]. + * + * The default implementation reads the full DataFrame and calls [DataFrame.schema]. Override when the + * source format can introspect types without materializing rows (e.g., JDBC metadata queries, Parquet/Arrow + * file footers, OpenAPI specs). + */ + public fun readDataFrameSchemaOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions? = null, + ): DataFrameSchema? = readDataFrameOrNull(source, sourceInfo, options)?.schema() + + public fun readDataSchemaCodeOrNull( + source: Any, + sourceInfo: DataSourceInfo, + name: String, + options: DataFrameReadOptions? = null, + ): CodeString? = + readDataFrameSchemaOrNull(source, sourceInfo, options) + ?.generateInterfaces(name) + + public fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean + + // `DataFrame.Companion.read` methods uses this to sort list of all supported formats in ascending order (-1, 2, 10) + // sorted list is used to test if any format can read given input + public val testOrder: Int +} + +/** + * Description of a source passed to [DataFrameReadSource]. 
Carries the static [kType] of the value and + * optional [extension]/[mimeType] hints, both of which may be `null` when the source is in-memory content + * with no reasonable file-extension/MIME interpretation (e.g., a raw [String], [InputStream], [java.sql.Connection], + * etc.). + */ +public data class DataSourceInfo( + public val kType: KType, + public val extension: String? = null, + public val mimeType: String? = null, +) + +/** + * NOTE: Needs to have fully qualified name in + * resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource + * to be detected here. + */ +@PublishedApi +internal val newSupportedFormats: List by lazy { + ServiceLoader.load(DataFrameReadSource::class.java) + .toList() + .distinct() + .sortedBy { it.testOrder } +} + +internal val dataFrameReadSourceByType: Map> by lazy { + buildMap> { + newSupportedFormats.forEach { format -> + format.supportedTypes.forEach { type -> + getOrPut(type) { mutableListOf() }.let { + if (format !in it) it += format + } + } + } + values.forEach { + it.sortBy { it.testOrder } + } + } +} + +/** + * Shared dispatch loop for [readDataFrameImpl] and [readDataFrameSchemaImpl]: handles String→URL + * normalization, InputStream buffering, sorted iteration, and error aggregation. The per-format read + * operation is supplied as [readOrNull]; [resultKind] is used only in the "unknown source" error message. + * + * @param [readOrNull] [DataFrameReadSource.readDataFrameOrNull] or [DataFrameReadSource.readDataFrameSchemaOrNull] + * Potentially, this could also return another type, like a GeoDataFrame. 
+ */
+internal fun <T : Any> readSourceImpl(
+    source: Any,
+    sourceType: KType,
+    options: DataFrameReadOptions?,
+    formats: List<DataFrameReadSource>,
+    resultKind: String,
+    doStringToUrlConversion: Boolean,
+    readOrNull: DataFrameReadSource.(
+        source: Any,
+        sourceInfo: DataSourceInfo,
+        options: DataFrameReadOptions?,
+    ) -> T?,
+): Result<T> {
+    // A String is first tried as a reference (existing file or recognized URL). When it resolves, dispatch
+    // restarts with the URL so formats can use its extension/MIME hints; otherwise the String is kept as
+    // raw in-memory content.
+    if (doStringToUrlConversion && source is String) {
+        val url = asUrlOrNull(source)
+        if (url != null) {
+            return readSourceImpl(
+                source = url,
+                sourceType = typeOf<URL>(),
+                options = options,
+                formats = formats,
+                resultKind = resultKind,
+                doStringToUrlConversion = true,
+                readOrNull = readOrNull,
+            )
+        }
+    }
+
+    // Some sources can only be read once, like InputStreams, so they are buffered on first use. Every
+    // consumer (hint detection and each format attempt) then gets a fresh stream over the same bytes.
+    var bufferedBytes: ByteArray? = null
+
+    fun getSource(): Any =
+        when (source) {
+            is InputStream -> {
+                val bytes = bufferedBytes ?: source.readBytes().also { bufferedBytes = it }
+                ByteArrayInputStream(bytes)
+            }
+
+            else -> source
+        }
+
+    // `simpleName` is null for anonymous and local classes; fall back to the JVM class name rather than
+    // failing the whole dispatch with a NullPointerException.
+    fun DataFrameReadSource.displayName(): String = this::class.simpleName ?: this::class.java.name
+
+    val sourceInfo = DataSourceInfo(
+        kType = sourceType,
+        extension = getSource().extensionOrNull(),
+        mimeType = getSource().mimeTypeOrNull(),
+    )
+
+    // Why each accepting format did not produce a result, keyed by format name.
+    val tries = mutableMapOf<String, Exception>()
+    for (format in formats.sortedBy { it.testOrder }) {
+        if (!format.acceptsSource(sourceInfo, options)) continue
+        val name = format.displayName()
+        try {
+            val result = format.readOrNull(getSource(), sourceInfo, options)
+            if (result != null) return Result.success(result)
+
+            tries[name] = Exception("$name returned null.")
+        } catch (e: FileNotFoundException) {
+            // fail early. File not found means the reference is broken.
+            return Result.failure(exception = e)
+        } catch (e: Exception) {
+            tries[name] = e
+        }
+    }
+
+    val failure = IllegalArgumentException("Unknown $resultKind source $source, $sourceInfo; Tried $tries")
+    // Keep the individual failures (with their stack traces) reachable from the aggregated error.
+    tries.values.forEach { failure.addSuppressed(it) }
+    return Result.failure(exception = failure)
+}
+
+/**
+ * Unified entry point for the [DataFrameReadSource] framework: passes [source] through every registered
+ * format until one reads it.
+ *
+ * For a [String] that points to an existing file or a recognized URL (`http://`, `https://`, `ftp://`),
+ * the source is normalized to a [URL] so the file-extension hint can be used to disambiguate formats. Any
+ * other [String] is treated as in-memory content (raw JSON/CSV/etc.).
+ *
+ * Named [readSource] rather than `read` to avoid shadowing the legacy `DataFrame.read(File/URL/Path/String, header)`
+ * entries in `guess.kt` that use the older [SupportedDataFrameFormat] system. Once the legacy entries are
+ * retired, this can be renamed to `read`.
+ */
+public fun DataFrame.Companion.readSource(
+    source: Any,
+    type: KType,
+    options: DataFrameReadOptions? = null,
+    formats: List<DataFrameReadSource> = newSupportedFormats,
+): AnyFrame =
+    readSourceImpl(
+        source = source,
+        sourceType = type.withNullability(false),
+        options = options,
+        formats = formats,
+        resultKind = "DataFrame",
+        doStringToUrlConversion = true,
+        readOrNull = DataFrameReadSource::readDataFrameOrNull,
+    ).getOrThrow()
+
+/**
+ * Reified convenience overload of the [KType]-based `DataRow.readSource`: the static type of [source] is
+ * captured with [typeOf] and forwarded.
+ */
+public inline fun <reified R : Any> DataRow.Companion.readSource(
+    source: R,
+    options: DataFrameReadOptions? = null,
+    formats: List<DataFrameReadSource> = newSupportedFormats,
+): AnyRow = readSource(source = source, type = typeOf<R>(), options = options, formats = formats)
+
+/**
+ * Single-row counterpart of `DataFrame.readSource`: each candidate format reads a DataFrame and the result is
+ * narrowed with `single()`. A format whose DataFrame does not contain exactly one row fails inside the dispatch
+ * loop like any other read error, so the next format is tried.
+ */
+public fun DataRow.Companion.readSource(
+    source: Any,
+    type: KType,
+    options: DataFrameReadOptions? = null,
+    formats: List<DataFrameReadSource> = newSupportedFormats,
+): AnyRow =
+    readSourceImpl(
+        source = source,
+        sourceType = type.withNullability(false),
+        options = options,
+        formats = formats,
+        resultKind = "DataRow",
+        doStringToUrlConversion = true,
+        // Distinct lambda parameter names: the dispatch loop may hand over a normalized/buffered source, and
+        // it must not be confused with (or shadow) this function's own `source`/`options`.
+        readOrNull = { src, info, opts ->
+            readDataFrameOrNull(src, info, opts)?.single()
+        },
+    ).getOrThrow()
+
+/**
+ * Reified convenience overload of the [KType]-based `DataFrame.readSource`: the static type of [source] is
+ * captured with [typeOf] and forwarded.
+ */
+public inline fun <reified R : Any> DataFrame.Companion.readSource(
+    source: R,
+    options: DataFrameReadOptions? = null,
+    formats: List<DataFrameReadSource> = newSupportedFormats,
+): AnyFrame =
+    readSource(
+        source = source,
+        type = typeOf<R>(),
+        options = options,
+        formats = formats,
+    )
+
+/**
+ * Schema-only counterpart of [DataFrame.Companion.readSource]: dispatches through every registered
+ * [DataFrameReadSource] and returns the resulting [DataFrameSchema] without materializing rows when the
+ * format supports it (e.g., JDBC). Formats with no fast schema path fall back to reading the full DataFrame
+ * and calling [DataFrame.schema].
+ */
+public fun DataFrameSchema.Companion.readSource(
+    source: Any,
+    type: KType,
+    options: DataFrameReadOptions? = null,
+    formats: List<DataFrameReadSource> = newSupportedFormats,
+): DataFrameSchema =
+    readSourceImpl(
+        source = source,
+        sourceType = type.withNullability(false),
+        options = options,
+        formats = formats,
+        resultKind = "DataFrameSchema",
+        doStringToUrlConversion = true,
+        readOrNull = DataFrameReadSource::readDataFrameSchemaOrNull,
+    ).getOrThrow()
+
+/**
+ * Reified convenience overload of the [KType]-based `DataFrameSchema.readSource`: the static type of [source]
+ * is captured with [typeOf] and forwarded.
+ */
+public inline fun <reified R : Any> DataFrameSchema.Companion.readSource(
+    source: R,
+    options: DataFrameReadOptions? = null,
+    formats: List<DataFrameReadSource> = newSupportedFormats,
+): DataFrameSchema =
+    readSource(
+        source = source,
+        type = typeOf<R>(),
+        options = options,
+        formats = formats,
+    )
+
+/**
+ * Code-generation counterpart of [DataFrame.Companion.readSource]: dispatches through every registered
+ * [DataFrameReadSource] and returns a [CodeString] containing the generated `@DataSchema` interface
+ * declarations (plus enums/typealiases for formats like OpenAPI). The [name] is the marker name used for
+ * the top-level generated interface.
+ *
+ * The default implementation in [DataFrameReadSource.readDataSchemaCodeOrNull] runs
+ * [DataFrameSchema.generateInterfaces] on the format's [DataFrameReadSource.readDataFrameSchemaOrNull]
+ * result; formats that produce richer code (OpenAPI markers, enums, typealiases) override the method
+ * directly.
+ */
+public fun CodeString.Companion.readSource(
+    source: Any,
+    type: KType,
+    name: String,
+    options: DataFrameReadOptions? = null,
+    formats: List<DataFrameReadSource> = newSupportedFormats,
+): CodeString =
+    readSourceImpl(
+        source = source,
+        sourceType = type.withNullability(false),
+        options = options,
+        formats = formats,
+        resultKind = "CodeString",
+        doStringToUrlConversion = true,
+        readOrNull = { src, info, opts ->
+            readDataSchemaCodeOrNull(src, info, name, opts)
+        },
+    ).getOrThrow()
+
+/**
+ * Reified convenience overload of the [KType]-based `CodeString.readSource`: the static type of [source] is
+ * captured with [typeOf] and forwarded.
+ */
+public inline fun <reified R : Any> CodeString.Companion.readSource(
+    source: R,
+    name: String,
+    options: DataFrameReadOptions? = null,
+    formats: List<DataFrameReadSource> = newSupportedFormats,
+): CodeString =
+    readSource(
+        source = source,
+        type = typeOf<R>(),
+        name = name,
+        options = options,
+        formats = formats,
+    )
+
+// Shared Tika detector, created on first use.
+private val tikaDetector by lazy { DefaultDetector() }
+
+/**
+ * Best-effort MIME type detection via Tika for [Path], [File], [URL], [InputStream] and [ByteArray] receivers.
+ *
+ * Returns `null` for any other receiver type, when the source cannot be opened or read, and when detection
+ * only yields an uninformative result (`application/octet-stream`, `text/plain`, or an empty type).
+ *
+ * A stream opened by this function is closed before returning. An [InputStream] receiver belongs to the
+ * caller and is left open.
+ */
+internal fun Any.mimeTypeOrNull(): String? {
+    // Closing a TikaInputStream also closes what it wraps, so only close what was opened here.
+    val ownsStream = this !is InputStream
+
+    val inputStream = try {
+        when (this) {
+            is Path -> TikaInputStream.get(this)
+
+            is File ->
+                @Suppress("DEPRECATION")
+                TikaInputStream.get(this)
+
+            is URL -> TikaInputStream.get(this)
+
+            is InputStream -> TikaInputStream.get(this)
+
+            is ByteArray -> TikaInputStream.get(this)
+
+            else -> null
+        }
+    } catch (_: IOException) {
+        null
+    } ?: return null
+
+    return try {
+        // The resource name lets Tika take the file name into account in addition to the content.
+        val metadata = Metadata().apply {
+            if (inputStream.hasFile()) {
+                add(TikaCoreProperties.RESOURCE_NAME_KEY, inputStream.path.name)
+            }
+        }
+        val detected = tikaDetector.detect(inputStream, metadata)
+        when {
+            detected == MediaType.OCTET_STREAM -> null
+            detected == MediaType.TEXT_PLAIN -> null
+            detected == MediaType.EMPTY -> null
+            detected.toString().isEmpty() -> null
+            else -> detected.toString()
+        }
+    } catch (_: IOException) {
+        null
+    } finally {
+        if (ownsStream) {
+            try {
+                inputStream.close()
+            } catch (_: IOException) {
+                // Best effort: detection has already finished, a failing close must not change the result.
+            }
+        }
+    }
+}
+
+/**
+ * Lower-cased file extension of a [Path], [File], [URL] or reference [String], or `null` when the receiver is
+ * of another type or has no extension.
+ *
+ * For a [URL] only the last path segment is inspected, so a dot in a directory name (`/v1.0/data`) or a
+ * segment without any dot does not produce a bogus extension.
+ */
+internal fun Any.extensionOrNull(): String? =
+    when (this) {
+        is Path -> extension
+
+        is File -> extension
+
+        is URL -> path.substringAfterLast('/').substringAfterLast('.', missingDelimiterValue = "")
+
+        is String -> try {
+            asUrl(this).extensionOrNull()
+        } catch (_: Exception) {
+            null
+        }
+
+        else -> null
+    }?.takeIf { it.isNotEmpty() }?.lowercase()
+
+/**
+ * Non-throwing variant of [asUrl]: returns the [URL] iff [string] is a recognized URL (`http`/`https`/`ftp`)
+ * or an existing file path. Used by [readSource] to decide whether a [String] should be treated as a reference
+ * or as raw content.
+ */
+internal fun asUrlOrNull(string: String): URL? =
+    try {
+        if (isUrl(string)) {
+            URI(string).toURL()
+        } else {
+            // `isFile` is only true for an existing regular file, so raw content falls through to null.
+            File(string).takeIf { it.isFile }?.toURI()?.toURL()
+        }
+    } catch (_: Exception) {
+        // Malformed URL, inaccessible path, etc.: treat the string as content rather than a reference.
+        null
+    }
diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt
new file mode 100644
index 0000000000..d4b9503a95
--- /dev/null
+++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/Guess2.kt
@@ -0,0 +1,644 @@
+package org.jetbrains.kotlinx.dataframe.io
+
+import io.kotest.matchers.shouldBe
+import io.kotest.matchers.string.shouldContain
+import kotlinx.serialization.json.Json
+import kotlinx.serialization.json.JsonElement
+import org.apache.poi.ss.usermodel.WorkbookFactory
+import org.jetbrains.kotlinx.dataframe.DataFrame
+import org.jetbrains.kotlinx.dataframe.DataRow
+import org.jetbrains.kotlinx.dataframe.api.CodeString
+import org.jetbrains.kotlinx.dataframe.api.columnOf
+import org.jetbrains.kotlinx.dataframe.api.convert
+import org.jetbrains.kotlinx.dataframe.api.dataFrameOf
+import org.jetbrains.kotlinx.dataframe.api.named
+import org.jetbrains.kotlinx.dataframe.api.schema
+import org.jetbrains.kotlinx.dataframe.api.single
+import org.jetbrains.kotlinx.dataframe.api.toDataFrame
+import 
org.jetbrains.kotlinx.dataframe.io.db.H2 +import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema +import org.junit.Test +import java.io.File +import java.sql.Connection +import java.sql.DriverManager +import javax.sql.DataSource +import kotlin.io.path.Path +import kotlin.io.path.absolute + +class Guess2 { + + @Test + fun `read JSON reference`() { + val expected = DataFrame.readJson("../data/participants.json") + + DataFrame.readSource("../data/participants.json") shouldBe expected + DataFrame.readSource(Path("../data/participants.json")) shouldBe expected + DataFrame.readSource(File("../data/participants.json")) shouldBe expected + DataFrame.readSource( + Path("../data/participants.json").absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = org.jetbrains.kotlinx.dataframe.io.Json.Options( + typeClashTactic = JSON.TypeClashTactic.ANY_COLUMNS, + ) + + DataFrame.readSource("../data/participants.json", options) shouldBe expected + DataFrame.readSource(Path("../data/participants.json"), options) shouldBe expected + DataFrame.readSource(File("../data/participants.json"), options) shouldBe expected + DataFrame.readSource( + Path("../data/participants.json").absolute().normalize().toUri().toURL(), + options, + ) shouldBe expected + } + + @Test + fun `read JSON in memory`() { + val expected = DataFrame.readJson("../data/participants.json") + + val file = File("../data/participants.json") + + DataFrame.readSource(file.readText()) shouldBe expected + DataFrame.readSource(file.inputStream()) shouldBe expected + DataFrame.readSource(Json.decodeFromString(file.readText())) shouldBe expected + + val options = org.jetbrains.kotlinx.dataframe.io.Json.Options( + typeClashTactic = JSON.TypeClashTactic.ANY_COLUMNS, + ) + + DataFrame.readSource(file.readText(), options) shouldBe expected + DataFrame.readSource(file.inputStream(), options) shouldBe expected + DataFrame.readSource(Json.decodeFromString(file.readText()), options) shouldBe expected + } + + 
@Test + fun `read CSV reference`() { + val csvPath = "../data/movies.csv" + val expected = DataFrame.readCsv(csvPath) + + DataFrame.readSource(csvPath) shouldBe expected + DataFrame.readSource(Path(csvPath)) shouldBe expected + DataFrame.readSource(File(csvPath)) shouldBe expected + DataFrame.readSource( + Path(csvPath).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = Csv.Options(delimiter = ',') + + DataFrame.readSource(csvPath, options) shouldBe expected + DataFrame.readSource(Path(csvPath), options) shouldBe expected + DataFrame.readSource(File(csvPath), options) shouldBe expected + DataFrame.readSource( + Path(csvPath).absolute().normalize().toUri().toURL(), + options, + ) shouldBe expected + } + + @Test + fun `read CSV in memory`() { + val file = File("../data/movies.csv") + val expected = DataFrame.readCsv(file) + + // String content has no extension hint, so we pin the format via options. + val options = Csv.Options(delimiter = ',') + + DataFrame.readSource(file.readText(), options) shouldBe expected + DataFrame.readSource(file.inputStream(), options) shouldBe expected + } + + @Test + fun `read TSV reference`() { + val tsvFile = File("src/test/resources/abc.tsv") + val expected = DataFrame.readTsv(tsvFile) + + DataFrame.readSource(tsvFile.path) shouldBe expected + DataFrame.readSource(Path(tsvFile.path)) shouldBe expected + DataFrame.readSource(tsvFile) shouldBe expected + DataFrame.readSource( + Path(tsvFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = Tsv.Options(delimiter = '\t') + + DataFrame.readSource(tsvFile.path, options) shouldBe expected + DataFrame.readSource(Path(tsvFile.path), options) shouldBe expected + DataFrame.readSource(tsvFile, options) shouldBe expected + DataFrame.readSource( + Path(tsvFile.path).absolute().normalize().toUri().toURL(), + options, + ) shouldBe expected + } + + @Test + fun `read TSV in memory`() { + val tsvFile = File("src/test/resources/abc.tsv") + 
val expected = DataFrame.readTsv(tsvFile) + val options = Tsv.Options(delimiter = '\t') + + // Binary/text without extension — options pin Tsv over Csv/Json/Xlsx. + DataFrame.readSource(tsvFile.readText(), options) shouldBe expected + DataFrame.readSource(tsvFile.inputStream(), options) shouldBe expected + } + + @Test + fun `read XLSX reference`() { + val xlsxFile = File("src/test/resources/sample2.xlsx") + val expected = DataFrame.readExcel(xlsxFile) + + DataFrame.readSource(xlsxFile.path) shouldBe expected + DataFrame.readSource(Path(xlsxFile.path)) shouldBe expected + DataFrame.readSource(xlsxFile) shouldBe expected + DataFrame.readSource( + Path(xlsxFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = ExcelNEW.Options(sheetName = "Sheet1") + + DataFrame.readSource(xlsxFile.path, options) shouldBe expected + DataFrame.readSource(Path(xlsxFile.path), options) shouldBe expected + DataFrame.readSource(xlsxFile, options) shouldBe expected + DataFrame.readSource( + Path(xlsxFile.path).absolute().normalize().toUri().toURL(), + options, + ) shouldBe expected + } + + @Test + fun `read XLS reference`() { + val xlsFile = File("src/test/resources/sample.xls") + val expected = DataFrame.readExcel(xlsFile) + + DataFrame.readSource(xlsFile.path) shouldBe expected + DataFrame.readSource(Path(xlsFile.path)) shouldBe expected + DataFrame.readSource(xlsFile) shouldBe expected + DataFrame.readSource( + Path(xlsFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + } + + @Test + fun `read XLSX in memory`() { + val xlsxFile = File("src/test/resources/sample2.xlsx") + val expected = DataFrame.readExcel(xlsxFile) + + // Workbook and Sheet are exclusive to ExcelNEW, so type-based dispatch works without options. 
+ WorkbookFactory.create(xlsxFile.inputStream()).use { wb -> + DataFrame.readSource(wb) shouldBe expected + DataFrame.readSource(wb.getSheetAt(0)) shouldBe expected + } + + val options = ExcelNEW.Options() + + // Binary streams have no extension and are accepted by every format, + // so options are needed to pin ExcelNEW for the InputStream variant. + DataFrame.readSource(xlsxFile.inputStream(), options) shouldBe expected + + WorkbookFactory.create(xlsxFile.inputStream()).use { wb -> + DataFrame.readSource(wb, options) shouldBe expected + DataFrame.readSource(wb.getSheetAt(0), options) shouldBe expected + } + } + + @Test + fun `read XLS in memory`() { + val xlsFile = File("src/test/resources/sample.xls") + val expected = DataFrame.readExcel(xlsFile) + + WorkbookFactory.create(xlsFile.inputStream()).use { wb -> + DataFrame.readSource(wb) shouldBe expected + } + DataFrame.readSource(xlsFile.inputStream()) shouldBe expected + WorkbookFactory.create(xlsFile.inputStream()).use { wb -> + DataFrame.readSource(wb) shouldBe expected + } + } + + private fun h2Url(name: String) = "jdbc:h2:mem:$name;DB_CLOSE_DELAY=-1" + + private fun seed(connection: Connection) { + connection.createStatement().use { st -> + st.execute("CREATE TABLE Customer (id INT, name VARCHAR(255), age INT)") + st.execute("INSERT INTO Customer (id, name, age) VALUES (1, 'John', 40), (2, 'Alice', 25), (3, 'Bob', 47)") + } + } + + @Test + fun `read JDBC in memory`() { + val url = h2Url("guess2_inmem") + DriverManager.getConnection(url).use { conn -> + seed(conn) + + val expected = DataFrame.readSqlTable(conn, "Customer") + val tableOpts = Jdbc2.Options(sqlQueryOrTableName = "Customer") + val queryOpts = Jdbc2.Options(sqlQueryOrTableName = "SELECT * FROM Customer") + + // Connection — exclusive type, but query/table name must come from options. + DataFrame.readSource(conn, tableOpts) shouldBe expected + DataFrame.readSource(conn, queryOpts) shouldBe expected + + // DbConnectionConfig as InMemory. 
+ val config = DbConnectionConfig(url = url) + DataFrame.readSource(config, tableOpts) shouldBe expected + DataFrame.readSource(config, queryOpts) shouldBe expected + + // DataSource — opens a fresh connection each call (DataSource.readDataFrame closes it via `use`). + val dataSource = object : DataSource { + override fun getConnection() = DriverManager.getConnection(url) + + override fun getConnection(u: String?, p: String?) = DriverManager.getConnection(url) + + override fun getLogWriter() = null + + override fun setLogWriter(out: java.io.PrintWriter?) {} + + override fun setLoginTimeout(seconds: Int) {} + + override fun getLoginTimeout() = 0 + + override fun getParentLogger() = throw UnsupportedOperationException() + + override fun unwrap(iface: Class?): T = throw UnsupportedOperationException() + + override fun isWrapperFor(iface: Class<*>?) = false + } + DataFrame.readSource(dataSource, tableOpts) shouldBe expected + + // ResultSet — no sqlQueryOrTableName needed; just dbType (or a Connection to derive it). 
+ conn.prepareStatement("SELECT * FROM Customer").use { ps -> + ps.executeQuery().use { rs -> + DataFrame.readSource( + rs, + Jdbc2.Options(dbType = H2()), + ) shouldBe expected + } + } + conn.prepareStatement("SELECT * FROM Customer").use { ps -> + ps.executeQuery().use { rs -> + DataFrame.readSource( + rs, + Jdbc2.Options(resultSetConnection = conn), + ) shouldBe expected + } + } + } + } + + @Test + fun `unified readSource auto-detects references vs content`() { + // String that points to an existing file → routed through URL → JSON wins on extension + val jsonExpected = DataFrame.readJson("../data/participants.json") + DataFrame.readSource("../data/participants.json") shouldBe jsonExpected + + // Same idea for CSV/XLSX + val csvExpected = DataFrame.readCsv("../data/movies.csv") + DataFrame.readSource("../data/movies.csv") shouldBe csvExpected + + val xlsxExpected = DataFrame.readExcel(File("src/test/resources/sample2.xlsx")) + DataFrame.readSource("src/test/resources/sample2.xlsx") shouldBe xlsxExpected + + // String that doesn't resolve to a file → treated as raw content (JSON content here) + val file = File("../data/participants.json") + DataFrame.readSource(file.readText()) shouldBe jsonExpected + + // Non-String types: still work, no special handling needed + DataFrame.readSource(file) shouldBe jsonExpected + DataFrame.readSource(Path("../data/participants.json")) shouldBe jsonExpected + } + + @Test + fun `read JDBC reference`() { + val url = h2Url("guess2_ref") + DriverManager.getConnection(url).use { conn -> seed(conn) } + + val config = DbConnectionConfig(url = url) + val expected = DataFrame.readSqlTable(config, "Customer") + val tableOpts = Jdbc2.Options(sqlQueryOrTableName = "Customer") + + DataFrame.readSource(config, tableOpts) shouldBe expected + DataFrame.readSource(config, Jdbc2.Options(sqlQueryOrTableName = "SELECT * FROM Customer")) shouldBe expected + } + + @Test + fun `read schema via default fallback (file-based formats)`() { + // JSON + val 
jsonExpected = DataFrame.readJson("../data/participants.json").schema() + DataFrameSchema.readSource( + File("../data/participants.json"), + ) shouldBe jsonExpected + DataFrameSchema.readSource( + "../data/participants.json", + ) shouldBe jsonExpected + + // CSV + val csvExpected = DataFrame.readCsv("../data/movies.csv").schema() + DataFrameSchema.readSource( + File("../data/movies.csv"), + ) shouldBe csvExpected + + // TSV + val tsvFile = File("src/test/resources/abc.tsv") + val tsvExpected = DataFrame.readTsv(tsvFile).schema() + DataFrameSchema.readSource(tsvFile) shouldBe tsvExpected + + // XLSX + val xlsxFile = File("src/test/resources/sample2.xlsx") + val xlsxExpected = DataFrame.readExcel(xlsxFile).schema() + DataFrameSchema.readSource(xlsxFile) shouldBe xlsxExpected + } + + @Test + fun `read JDBC schema via override`() { + val url = h2Url("guess2_schema") + DriverManager.getConnection(url).use { conn -> + seed(conn) + val expected = DataFrameSchema.readSqlTable(conn, "Customer") + val tableOpts = Jdbc2.Options(sqlQueryOrTableName = "Customer") + val queryOpts = Jdbc2.Options(sqlQueryOrTableName = "SELECT * FROM Customer") + + DataFrameSchema.readSource(conn, tableOpts) shouldBe expected + DataFrameSchema.readSource(conn, queryOpts) shouldBe expected + + val config = DbConnectionConfig(url = url) + DataFrameSchema.readSource(config, tableOpts) shouldBe expected + } + } + + @Test + fun `read JDBC schema from ResultSet does not advance cursor`() { + val url = h2Url("guess2_rs_schema") + DriverManager.getConnection(url).use { conn -> + seed(conn) + + conn.prepareStatement("SELECT * FROM Customer").use { ps -> + ps.executeQuery().use { rs -> + // Schema-from-ResultSet uses JDBC metadata only — no rows are fetched, so the + // cursor stays at "before first row". 
(And nullability comes from the column metadata, + // which is conservatively nullable for columns without NOT NULL constraints; this is + // why we don't compare against the data-inferred schema directly.) + val expected = DataFrameSchema.readResultSet( + conn.prepareStatement("SELECT * FROM Customer").executeQuery(), + H2(), + ) + val schema = DataFrameSchema.readSource(rs, Jdbc2.Options(dbType = H2())) + schema shouldBe expected + rs.isBeforeFirst shouldBe true + } + } + } + } + + @Test + fun `read Arrow Feather reference`() { + val featherFile = File("src/test/resources/test.feather") + val expected = DataFrame.readArrowFeather(featherFile) + + DataFrame.readSource(featherFile.path) shouldBe expected + DataFrame.readSource(Path(featherFile.path)) shouldBe expected + DataFrame.readSource(featherFile) shouldBe expected + DataFrame.readSource( + Path(featherFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = ArrowFeatherNEW.Options() + + DataFrame.readSource(featherFile.path, options) shouldBe expected + DataFrame.readSource(featherFile, options) shouldBe expected + } + + @Test + fun `read Arrow Feather in memory`() { + val featherFile = File("src/test/resources/test.feather") + val expected = DataFrame.readArrowFeather(featherFile) + val options = ArrowFeatherNEW.Options() + + // ByteArray, InputStream, SeekableByteChannel all need options to disambiguate (no extension). 
+ DataFrame.readSource(featherFile.readBytes(), options) shouldBe expected + DataFrame.readSource(featherFile.inputStream(), options) shouldBe expected + java.nio.file.Files.newByteChannel(featherFile.toPath()).use { channel -> + DataFrame.readSource(channel, options) shouldBe expected + } + } + + @Test + fun `read Arrow IPC reference`() { + val ipcFile = File("src/test/resources/test.arrow") + val expected = DataFrame.readArrowIPC(ipcFile) + + DataFrame.readSource(ipcFile.path) shouldBe expected + DataFrame.readSource(Path(ipcFile.path)) shouldBe expected + DataFrame.readSource(ipcFile) shouldBe expected + DataFrame.readSource( + Path(ipcFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = ArrowIPC.Options() + DataFrame.readSource(ipcFile, options) shouldBe expected + } + + @Test + fun `read Parquet reference`() { + val parquetFile = File("src/test/resources/test.parquet") + val expected = DataFrame.readParquet(parquetFile) + + DataFrame.readSource(parquetFile.path) shouldBe expected + DataFrame.readSource(Path(parquetFile.path)) shouldBe expected + DataFrame.readSource(parquetFile) shouldBe expected + DataFrame.readSource( + Path(parquetFile.path).absolute().normalize().toUri().toURL(), + ) shouldBe expected + + val options = Parquet.Options() + DataFrame.readSource(parquetFile, options) shouldBe expected + } + + @Test + fun `read OpenAPI yaml as code`() { + val openApiFile = File("src/test/resources/petstore.yaml") + + // The reference call from the existing helper, used as the ground truth. + val expected = readOpenApiAsString( + openApiAsString = openApiFile.readText(), + name = "Petstore", + extensionProperties = false, + generateHelperCompanionObject = false, + ) + + // String path / File / Path / URL all route through readSourceImpl to OpenApi2. 
+ CodeString.readSource(openApiFile.path, name = "Petstore").value shouldBe expected + CodeString.readSource(openApiFile, name = "Petstore").value shouldBe expected + CodeString.readSource(Path(openApiFile.path), name = "Petstore").value shouldBe expected + CodeString.readSource( + Path(openApiFile.path).absolute().normalize().toUri().toURL(), + name = "Petstore", + ).value shouldBe expected + + // String content path (raw spec text) also works. + CodeString.readSource(openApiFile.readText(), name = "Petstore").value shouldBe expected + } + + @Test + fun `OpenAPI does not steal plain JSON DataFrame reads`() { + // A regular JSON file (not an OpenAPI spec) still goes to Json, even though OpenApi2 runs first. + // OpenApi2.readDataSchemaCodeOrNull returns null for non-OpenAPI content, but more importantly + // OpenApi2.readDataFrameOrNull is the interface default (null), so DataFrame reads fall through. + val expected = DataFrame.readJson("../data/participants.json") + DataFrame.readSource(File("../data/participants.json")) shouldBe expected + } + + @Test + fun `default DataSchema code generation works for JSON via interface default`() { + // The interface default reads the schema and calls generateInterfaces — exercise it on a JSON file. + val jsonFile = File("../data/participants.json") + val schemaCode = CodeString.readSource(jsonFile, name = "Participants") + // The output is non-empty and includes the marker name. 
+ schemaCode.value shouldContain "Participants" + } + + // region DataRow.readSource — single-row inputs across formats + + @Test + fun `read DataRow from CSV string`() { + val csvText = "a,b,c\n1,2,3" + val expected = DataFrame.readCsvStr(csvText).single() + DataRow.readSource(csvText, Csv.Options()) shouldBe expected + } + + @Test + fun `read DataRow from TSV string`() { + val tsvText = "a\tb\tc\n1\t2\t3" + val expected = DataFrame.readTsvStr(tsvText).single() + DataRow.readSource(tsvText, Tsv.Options()) shouldBe expected + } + + @Test + fun `read DataRow from JSON string`() { + // A single-element JSON array yields a one-row DataFrame. + val jsonText = """[{"a": 1, "b": 2}]""" + val expected = DataFrame.readJsonStr(jsonText).single() + DataRow.readSource(jsonText) shouldBe expected + } + + @Test + fun `read DataRow from single-row XLSX file`() { + // sample2.xlsx has exactly one data row. + val xlsxFile = File("src/test/resources/sample2.xlsx") + val expected = DataFrame.readExcel(xlsxFile).single() + DataRow.readSource(xlsxFile) shouldBe expected + } + + @Test + fun `read DataRow from JDBC with single-row query`() { + val url = h2Url("guess2_datarow") + DriverManager.getConnection(url).use { conn -> + seed(conn) + val query = "SELECT * FROM Customer WHERE id = 1" + val expected = DataFrame.readSqlQuery(conn, query).single() + DataRow.readSource(conn, Jdbc2.Options(sqlQueryOrTableName = query)) shouldBe expected + } + } + + @Test + fun `read DataRow throws when source has multiple rows`() { + // movies.csv has many rows — DataRow.single() should fail, surfaced as the framework's + // "Unknown DataRow source" since the exception is caught and converted. 
+ val movies = File("../data/movies.csv") + try { + DataRow.readSource(movies) + error("Expected DataRow.readSource to fail on a multi-row CSV") + } catch (_: IllegalArgumentException) { + // expected + } + } + + // endregion + + // region convert API integration — convert { col }.to() + // + // Frame columns are typed by their schema, so each column being converted must contain sources of the + // same shape. Mixing, say, a CSV-shaped source and a JSON-shaped source in the same column would yield + // a FrameColumn with no coherent single schema — these tests keep each column homogeneous and put + // differently-shaped sources into separate columns. + + @Test + fun `convert column of CSV files to DataFrame`() { + // Two cells, both pointing at the same CSV → uniform shape in the resulting FrameColumn. + val csvFile = File("../data/movies.csv") + val df = dataFrameOf("source")(csvFile, csvFile) + + val converted = df.convert("source").to>() + + val expected = DataFrame.readCsv(csvFile) + converted["source"][0] shouldBe expected + converted["source"][1] shouldBe expected + } + + @Test + fun `convert column of CSV files to DataFrameSchema`() { + val csvFile = File("../data/movies.csv") + val df = dataFrameOf("source")(csvFile, csvFile) + + val converted = df.convert("source").to() + + val expected = DataFrame.readCsv(csvFile).schema() + converted["source"][0] shouldBe expected + converted["source"][1] shouldBe expected + } + + @Test + fun `convert column of single-row XLSX files to DataRow`() { + // sample2.xlsx has exactly one data row, so .to>() works for each cell. 
+ val xlsxFile = File("src/test/resources/sample2.xlsx") + val df = dataFrameOf("source")(xlsxFile, xlsxFile) + + val converted = df.convert("source").to>() + + val expected = DataFrame.readExcel(xlsxFile).single() + converted["source"][0] shouldBe expected + converted["source"][1] shouldBe expected + } + + @Test + fun `convert column of String content to DataFrame`() { + // Multiple parallel JSON content strings (same shape) → uniform FrameColumn. + val text = """[{"a": 1, "b": 2}]""" + val df = dataFrameOf("source")(text, text) + + val converted = df.convert("source").to>() + + val expected = DataFrame.readJsonStr(text) + converted["source"][0] shouldBe expected + converted["source"][1] shouldBe expected + } + + @Test + fun `convert two homogeneous source columns at once`() { + // Each column is internally uniform: csvCol has CSV-shaped cells, jsonCol has JSON-shaped cells. + // The result is two FrameColumns, each with its own coherent schema. + val csvFile = File("../data/movies.csv") + val jsonFile = File("../data/participants.json") + val df = dataFrameOf("csvCol", "jsonCol")(csvFile, jsonFile, csvFile, jsonFile) + + val converted = df.convert("csvCol", "jsonCol").to>() + + val expectedCsv = DataFrame.readCsv(csvFile) + val expectedJson = DataFrame.readJson(jsonFile) + converted["csvCol"][0] shouldBe expectedCsv + converted["csvCol"][1] shouldBe expectedCsv + converted["jsonCol"][0] shouldBe expectedJson + converted["jsonCol"][1] shouldBe expectedJson + } + + @Test + fun `convert column of URLs to DataFrame`() { + // Two URLs pointing at the same JSON file → uniform schema in the FrameColumn. 
+ val jsonUrl = File("../data/participants.json").toURI().toURL() + val urls = columnOf(jsonUrl, jsonUrl) named "source" + val df = urls.toDataFrame() + + val converted = df.convert("source").to>() + val expected = DataFrame.readJson(jsonUrl) + converted["source"][0] shouldBe expected + converted["source"][1] shouldBe expected + } + + // endregion +} diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt index 5900956fcf..3ba90f39d1 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt @@ -464,7 +464,9 @@ class ParserTests { @Test fun `Mixing null and json`() { val col by columnOf("[\"str\"]", "[]", "null") - val parsed = col.parse() + val parsed = col.parse( + ParserOptions(parseToDataFrameReadSource = true), + ) parsed.type() shouldBe typeOf() parsed.kind() shouldBe ColumnKind.Frame require(parsed.isFrameColumn()) diff --git a/dataframe-arrow/api/dataframe-arrow.api b/dataframe-arrow/api/dataframe-arrow.api index c3f0a80c3b..4a098bd002 100644 --- a/dataframe-arrow/api/dataframe-arrow.api +++ b/dataframe-arrow/api/dataframe-arrow.api @@ -9,6 +9,64 @@ public final class org/jetbrains/kotlinx/dataframe/io/ArrowFeather : org/jetbrai public fun readDataFrame (Ljava/nio/file/Path;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; } +public final class org/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull 
(Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)V + public synthetic fun (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public final fun copy (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$Options; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$Options;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowFeatherNEW$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getNullability ()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/ArrowIPC : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion 
Lorg/jetbrains/kotlinx/dataframe/io/ArrowIPC$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/ArrowIPC$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/ArrowIPC$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)V + public synthetic fun (Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Lorg/apache/arrow/memory/RootAllocator; + public final fun component2 ()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public final fun copy (Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowIPC$Options; + public static synthetic fun copy$default 
(Lorg/jetbrains/kotlinx/dataframe/io/ArrowIPC$Options;Lorg/apache/arrow/memory/RootAllocator;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ArrowIPC$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getAllocator ()Lorg/apache/arrow/memory/RootAllocator; + public final fun getNullability ()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/ArrowReadingKt { public static final fun readArrow (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Lorg/apache/arrow/vector/ipc/ArrowReader;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; public static synthetic fun readArrow$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Lorg/apache/arrow/vector/ipc/ArrowReader;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; @@ -323,3 +381,33 @@ public final class org/jetbrains/kotlinx/dataframe/io/ConvertingMismatch$Widenin public fun toString ()Ljava/lang/String; } +public final class org/jetbrains/kotlinx/dataframe/io/Parquet : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/Parquet$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull 
(Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/Parquet$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/Parquet$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;J)V + public synthetic fun (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;JILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public final fun component2 ()J + public final fun copy (Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;J)Lorg/jetbrains/kotlinx/dataframe/io/Parquet$Options; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/Parquet$Options;Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions;JILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/Parquet$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getBatchSize ()J + public final fun getNullability ()Lorg/jetbrains/kotlinx/dataframe/api/NullabilityOptions; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + diff --git a/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt b/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt index 5cf884d73d..bc4f1b78df 100644 --- a/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt +++ 
b/dataframe-arrow/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/arrowReading.kt @@ -18,6 +18,9 @@ import java.nio.channels.ReadableByteChannel import java.nio.channels.SeekableByteChannel import java.nio.file.Files import java.nio.file.Path +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf +import kotlin.reflect.typeOf public class ArrowFeather : SupportedDataFrameFormat { override fun readDataFrame(stream: InputStream, header: List): AnyFrame = @@ -36,6 +39,252 @@ public class ArrowFeather : SupportedDataFrameFormat { DefaultReadArrowMethod(pathRepresentation) } +/** + * [DataFrameReadSource] for [Arrow Feather files][DataFrame.readArrowFeather] (random-access IPC format). + * + * Supported source types: + * - References: [URL], [Path], [File] + * - In-memory: [SeekableByteChannel], [ByteArray], [InputStream], [ArrowReader] + * + * Default-accepts the `.feather` extension. To read with no extension hint (e.g., an [InputStream]) pass + * an [Options] instance to disambiguate from text formats. + */ +public class ArrowFeatherNEW : DataFrameReadSource { + + public data class Options(val nullability: NullabilityOptions = NullabilityOptions.Infer) : DataFrameReadOptions + + override val supportedTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) + + public companion object { + internal const val EXTENSION: String = "feather" + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is Options) return false + if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrame<*>? 
{ + val opts = (options ?: Options()) as Options + val kType = sourceInfo.kType + + // ArrowReader is exclusive; check before more general types. + if (kType.isSubTypeOf()) { + return (source as? ArrowReader)?.let { DataFrame.readArrow(it, opts.nullability) } + } + + val url: URL? = when { + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return DataFrame.readArrowFeather(url, opts.nullability) + } + + return when { + kType.isSubTypeOf() -> + (source as? SeekableByteChannel)?.let { + DataFrame.readArrowFeather(it, nullability = opts.nullability) + } + + kType.isSubTypeOf() -> + (source as? ByteArray)?.let { DataFrame.readArrowFeather(it, opts.nullability) } + + kType.isSubTypeOf() -> + (source as? InputStream)?.let { DataFrame.readArrowFeather(it, opts.nullability) } + + else -> null + } + } + + override val testOrder: Int = 60_000 + + override fun toString(): String = "ArrowFeather" +} + +/** + * [DataFrameReadSource] for [Arrow IPC streaming files][DataFrame.readArrowIPC]. + * + * Supported source types: + * - References: [URL], [Path], [File] + * - In-memory: [InputStream], [ByteArray], [ReadableByteChannel], [ArrowReader] + * + * There's no widely-standardized extension for IPC streaming files (`.arrow` is most common but is also + * used for random-access Feather), so this format accepts the `.arrow` extension. If your `.arrow` file is + * actually random-access (Feather), note that [ArrowFeatherNEW] will not pick it up: it only accepts the + * `.feather` extension. Both sources do accept input without an extension; for those [ArrowFeatherNEW] + * has the lower [testOrder] (60_000 vs. 60_100) and is presumably tried before [ArrowIPC].
+ */ +public class ArrowIPC : DataFrameReadSource { + + public data class Options( + val allocator: RootAllocator = Allocator.ROOT, + val nullability: NullabilityOptions = NullabilityOptions.Infer, + ) : DataFrameReadOptions + + override val supportedTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) + + public companion object { + internal const val EXTENSION: String = "arrow" + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is Options) return false + if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrame<*>? { + val opts = (options ?: Options()) as Options + val kType = sourceInfo.kType + + if (kType.isSubTypeOf()) { + return (source as? ArrowReader)?.let { DataFrame.readArrow(it, opts.nullability) } + } + + val url: URL? = when { + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return DataFrame.readArrowIPC(url, opts.nullability) + } + + return when { + kType.isSubTypeOf() -> + (source as? ReadableByteChannel)?.let { + DataFrame.readArrowIPC(it, allocator = opts.allocator, nullability = opts.nullability) + } + + kType.isSubTypeOf() -> + (source as? ByteArray)?.let { DataFrame.readArrowIPC(it, opts.nullability) } + + kType.isSubTypeOf() -> + (source as? InputStream)?.let { DataFrame.readArrowIPC(it, opts.nullability) } + + else -> null + } + } + + // Runs after ArrowFeatherNEW so that `.feather` files get the random-access reader first. 
+ // In practice the order only matters for sources without an extension: ArrowFeatherNEW accepts only + // `.feather` and this source only `.arrow`, so extension-bearing files never reach both. + override val testOrder: Int = 60_100 + + override fun toString(): String = "ArrowIPC" +} + +/** + * [DataFrameReadSource] for Apache Parquet files (read via Arrow Dataset). + * + * Arrow Dataset only consumes URIs, so only reference-style sources are supported: + * - References: [URL], [Path], [File] + * + * TODO? Multi-file Parquet datasets (vararg in [DataFrame.readParquet]) aren't covered by this single-source API; + * use [DataFrame.readParquet] directly for those. + */ +public class Parquet : DataFrameReadSource { + + public data class Options( + val nullability: NullabilityOptions = NullabilityOptions.Infer, + val batchSize: Long = ARROW_PARQUET_DEFAULT_BATCH_SIZE, + ) : DataFrameReadOptions + + override val supportedTypes: Set = + setOf(typeOf(), typeOf(), typeOf()) + + public companion object { + internal const val EXTENSION: String = "parquet" + internal val MIME_TYPES = setOf( + "application/x-parquet", + "application/parquet", + ) + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is Options) return false + if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false + if (sourceInfo.mimeType != null && sourceInfo.mimeType !in MIME_TYPES) return false + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrame<*>? { + val opts = (options ?: Options()) as Options + val kType = sourceInfo.kType + return when { + kType.isSubTypeOf() -> + (source as? URL)?.let { + DataFrame.readParquet( + it, + nullability = opts.nullability, + batchSize = opts.batchSize, + ) + } + + kType.isSubTypeOf() -> + (source as? Path)?.let { + DataFrame.readParquet( + it, + nullability = opts.nullability, + batchSize = opts.batchSize, + ) + } + + kType.isSubTypeOf() -> + (source as?
Path)?.let { + DataFrame.readParquet( + it, + nullability = opts.nullability, + batchSize = opts.batchSize, + ) + } + + kType.isSubTypeOf() -> + (source as? File)?.let { + DataFrame.readParquet( + it, + nullability = opts.nullability, + batchSize = opts.batchSize, + ) + } + + else -> null + } + } + + override val testOrder: Int = 60_500 + + override fun toString(): String = "Parquet" +} + +private inline fun KType.isSubTypeOf(): Boolean = this.isSubtypeOf(typeOf()) + private const val READ_ARROW_FEATHER = "readArrowFeather" internal const val ARROW_PARQUET_DEFAULT_BATCH_SIZE = 32768L diff --git a/dataframe-arrow/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource b/dataframe-arrow/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource new file mode 100644 index 0000000000..ef8466f22f --- /dev/null +++ b/dataframe-arrow/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource @@ -0,0 +1,3 @@ +org.jetbrains.kotlinx.dataframe.io.ArrowIPC +org.jetbrains.kotlinx.dataframe.io.ArrowFeatherNEW +org.jetbrains.kotlinx.dataframe.io.Parquet diff --git a/dataframe-csv/api/dataframe-csv.api b/dataframe-csv/api/dataframe-csv.api index a9a964783c..d8c4ca4133 100644 --- a/dataframe-csv/api/dataframe-csv.api +++ b/dataframe-csv/api/dataframe-csv.api @@ -1,3 +1,57 @@ +public final class org/jetbrains/kotlinx/dataframe/io/Csv : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/Csv$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull 
(Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/Csv$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/Csv$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZ)V + public synthetic fun (CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()C + public final fun component10 ()Z + public final fun component11 ()C + public final fun component12 ()Z + public final fun component13 ()Z + public final fun component14 ()Z + public final fun component2 ()Ljava/util/List; + public final fun component3 ()Ljava/nio/charset/Charset; + public final fun component4 ()Ljava/util/Map; + public final fun component5 ()J + public final fun component6 ()Ljava/lang/Long; + public final fun component7 ()Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public final fun component8 ()Z + public final fun component9 ()Z + public final fun copy 
(CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZ)Lorg/jetbrains/kotlinx/dataframe/io/Csv$Options; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/Csv$Options;CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/Csv$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getAllowMissingColumns ()Z + public final fun getCharset ()Ljava/nio/charset/Charset; + public final fun getColTypes ()Ljava/util/Map; + public final fun getDelimiter ()C + public final fun getHeader ()Ljava/util/List; + public final fun getIgnoreEmptyLines ()Z + public final fun getIgnoreExcessColumns ()Z + public final fun getIgnoreSurroundingSpaces ()Z + public final fun getParseParallel ()Z + public final fun getParserOptions ()Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public final fun getQuote ()C + public final fun getReadLines ()Ljava/lang/Long; + public final fun getSkipLines ()J + public final fun getTrimInsideQuoted ()Z + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/CsvDeephaven : org/jetbrains/kotlinx/dataframe/io/SupportedDataFrameFormat { public fun ()V public fun (C)V @@ -121,6 +175,60 @@ public final class org/jetbrains/kotlinx/dataframe/io/ToTsvStrKt { public static synthetic fun toTsvStr$default (Lorg/jetbrains/kotlinx/dataframe/DataFrame;ZCLjava/lang/Character;Lorg/jetbrains/kotlinx/dataframe/io/QuoteMode;Ljava/lang/Character;Ljava/lang/Character;Ljava/util/List;Ljava/lang/String;ILjava/lang/Object;)Ljava/lang/String; } +public final class org/jetbrains/kotlinx/dataframe/io/Tsv : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/Tsv$Companion; + 
public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/Tsv$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/Tsv$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZ)V + public synthetic fun (CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()C + public final fun component10 ()Z + public final fun component11 ()C + public final fun component12 ()Z + public final fun component13 ()Z + public final fun component14 ()Z + public final fun component2 ()Ljava/util/List; + public final fun component3 ()Ljava/nio/charset/Charset; + public final fun component4 ()Ljava/util/Map; + public final fun component5 ()J + public final fun component6 ()Ljava/lang/Long; + public final fun component7 
()Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public final fun component8 ()Z + public final fun component9 ()Z + public final fun copy (CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZ)Lorg/jetbrains/kotlinx/dataframe/io/Tsv$Options; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/Tsv$Options;CLjava/util/List;Ljava/nio/charset/Charset;Ljava/util/Map;JLjava/lang/Long;Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions;ZZZCZZZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/Tsv$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getAllowMissingColumns ()Z + public final fun getCharset ()Ljava/nio/charset/Charset; + public final fun getColTypes ()Ljava/util/Map; + public final fun getDelimiter ()C + public final fun getHeader ()Ljava/util/List; + public final fun getIgnoreEmptyLines ()Z + public final fun getIgnoreExcessColumns ()Z + public final fun getIgnoreSurroundingSpaces ()Z + public final fun getParseParallel ()Z + public final fun getParserOptions ()Lorg/jetbrains/kotlinx/dataframe/api/ParserOptions; + public final fun getQuote ()C + public final fun getReadLines ()Ljava/lang/Long; + public final fun getSkipLines ()J + public final fun getTrimInsideQuoted ()Z + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/TsvDeephaven : org/jetbrains/kotlinx/dataframe/io/SupportedDataFrameFormat { public fun ()V public fun (C)V diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt index 34bf7aab0d..cd9230b610 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/io/readDelim.kt @@ -231,8 +231,16 @@ private fun 
CsvReader.ResultColumn.toDataColumn( val givenSkipTypes = parserOptions?.skipTypes ?: DataFrame.parser.skipTypes // no need to check for types that Deephaven already parses, skip those too val adjustedSkipTypes = givenSkipTypes + typesDeephavenAlreadyParses + + val parseDfReadSource = parserOptions?.parseToDataFrameReadSource + ?: DataFrame.parser.parseToDataFrameReadSource.takeIf { it } // only take if adjusted to 'true' + ?: true + val adjustedParserOptions = (parserOptions ?: ParserOptions()) - .copy(skipTypes = adjustedSkipTypes) + .copy( + skipTypes = adjustedSkipTypes, + parseToDataFrameReadSource = parseDfReadSource, + ) column.tryParse(adjustedParserOptions) } diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt index 801fd3a717..57cdb342da 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/csv.kt @@ -3,12 +3,17 @@ package org.jetbrains.kotlinx.dataframe.io import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams import java.io.File import java.io.InputStream +import java.net.URL +import java.nio.charset.Charset import java.nio.file.Path +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf import kotlin.reflect.typeOf public class CsvDeephaven(private val delimiter: Char = DelimParams.CSV_DELIMITER) : SupportedDataFrameFormat { @@ -33,6 +38,134 @@ public class CsvDeephaven(private val delimiter: Char = DelimParams.CSV_DELIMITE } } +public class Csv : DataFrameReadSource { + + public data class Options( + val delimiter: Char = DelimParams.CSV_DELIMITER, + val header: List = 
DelimParams.HEADER, + val charset: Charset? = DelimParams.CHARSET, + val colTypes: Map = DelimParams.COL_TYPES, + val skipLines: Long = DelimParams.SKIP_LINES, + val readLines: Long? = DelimParams.READ_LINES, + val parserOptions: ParserOptions? = DelimParams.PARSER_OPTIONS, + val ignoreEmptyLines: Boolean = DelimParams.IGNORE_EMPTY_LINES, + val allowMissingColumns: Boolean = DelimParams.ALLOW_MISSING_COLUMNS, + val ignoreExcessColumns: Boolean = DelimParams.IGNORE_EXCESS_COLUMNS, + val quote: Char = DelimParams.QUOTE, + val ignoreSurroundingSpaces: Boolean = DelimParams.IGNORE_SURROUNDING_SPACES, + val trimInsideQuoted: Boolean = DelimParams.TRIM_INSIDE_QUOTED, + val parseParallel: Boolean = DelimParams.PARSE_PARALLEL, + ) : DataFrameReadOptions + + override val supportedTypes: Set = + setOf(typeOf(), typeOf(), typeOf(), typeOf(), typeOf()) + + public companion object { + internal val EXTENSIONS = setOf("csv", "zip", "gz") + internal val MIME_TYPES = setOf( + "text/csv", + "application/zip", + "application/gzip", + ) + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is Options) return false + if (sourceInfo.extension != null && sourceInfo.extension !in EXTENSIONS) return false + if (sourceInfo.mimeType != null && sourceInfo.mimeType !in MIME_TYPES) return false + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrame<*>? { + val opts = (options ?: Options()) as Options + val kType = sourceInfo.kType + + val url: URL? = when { + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? 
File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return DataFrame.readCsv( + url = url, + delimiter = opts.delimiter, + header = opts.header, + charset = opts.charset, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + return when { + kType.isSubTypeOf() -> + (source as? InputStream)?.let { stream -> + DataFrame.readCsv( + inputStream = stream, + delimiter = opts.delimiter, + header = opts.header, + charset = opts.charset, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + kType.isSubTypeOf() -> + (source as? 
String)?.let { text -> + // early fail + if (opts.delimiter !in text) return null + + DataFrame.readCsvStr( + text = text, + delimiter = opts.delimiter, + header = opts.header, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + else -> null + } + } + + override val testOrder: Int = 20_000 + + override fun toString(): String = "Csv" +} + +private inline fun KType.isSubTypeOf(): Boolean = this.isSubtypeOf(typeOf()) + private const val READ_CSV = "readCsv" internal class DefaultReadCsvMethod(path: String?, arguments: MethodArguments) : diff --git a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt index ecb123b93c..49f3723a5e 100644 --- a/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt +++ b/dataframe-csv/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/tsv.kt @@ -3,12 +3,17 @@ package org.jetbrains.kotlinx.dataframe.io import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod import org.jetbrains.kotlinx.dataframe.documentationCsv.DelimParams import java.io.File import java.io.InputStream +import java.net.URL +import java.nio.charset.Charset import java.nio.file.Path +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf import kotlin.reflect.typeOf public class TsvDeephaven(private val delimiter: Char = DelimParams.TSV_DELIMITER) : SupportedDataFrameFormat { @@ 
-33,6 +38,134 @@ public class TsvDeephaven(private val delimiter: Char = DelimParams.TSV_DELIMITE } } +public class Tsv : DataFrameReadSource { + + public data class Options( + val delimiter: Char = DelimParams.TSV_DELIMITER, + val header: List = DelimParams.HEADER, + val charset: Charset? = DelimParams.CHARSET, + val colTypes: Map = DelimParams.COL_TYPES, + val skipLines: Long = DelimParams.SKIP_LINES, + val readLines: Long? = DelimParams.READ_LINES, + val parserOptions: ParserOptions? = DelimParams.PARSER_OPTIONS, + val ignoreEmptyLines: Boolean = DelimParams.IGNORE_EMPTY_LINES, + val allowMissingColumns: Boolean = DelimParams.ALLOW_MISSING_COLUMNS, + val ignoreExcessColumns: Boolean = DelimParams.IGNORE_EXCESS_COLUMNS, + val quote: Char = DelimParams.QUOTE, + val ignoreSurroundingSpaces: Boolean = DelimParams.IGNORE_SURROUNDING_SPACES, + val trimInsideQuoted: Boolean = DelimParams.TRIM_INSIDE_QUOTED, + val parseParallel: Boolean = DelimParams.PARSE_PARALLEL, + ) : DataFrameReadOptions + + override val supportedTypes: Set = + setOf(typeOf(), typeOf(), typeOf(), typeOf(), typeOf()) + + public companion object { + internal val EXTENSIONS = setOf("tsv", "zip", "gz") + internal val MIME_TYPE = setOf( + "text/tab-separated-values", + "application/zip", + "application/gzip", + ) + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is Options) return false + if (sourceInfo.extension != null && sourceInfo.extension !in EXTENSIONS) return false + if (sourceInfo.mimeType != null && sourceInfo.mimeType !in MIME_TYPE) return false + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrame<*>? { + val opts = (options ?: Options()) as Options + val kType = sourceInfo.kType + + val url: URL? = when { + kType.isSubTypeOf() -> source as? 
URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return DataFrame.readTsv( + url = url, + delimiter = opts.delimiter, + header = opts.header, + charset = opts.charset, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + return when { + kType.isSubTypeOf() -> + (source as? InputStream)?.let { stream -> + DataFrame.readTsv( + inputStream = stream, + delimiter = opts.delimiter, + header = opts.header, + charset = opts.charset, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + kType.isSubTypeOf() -> + (source as? 
String)?.let { text -> + // early fail + if (opts.delimiter !in text) return null + + DataFrame.readTsvStr( + text = text, + delimiter = opts.delimiter, + header = opts.header, + colTypes = opts.colTypes, + skipLines = opts.skipLines, + readLines = opts.readLines, + parserOptions = opts.parserOptions, + ignoreEmptyLines = opts.ignoreEmptyLines, + allowMissingColumns = opts.allowMissingColumns, + ignoreExcessColumns = opts.ignoreExcessColumns, + quote = opts.quote, + ignoreSurroundingSpaces = opts.ignoreSurroundingSpaces, + trimInsideQuoted = opts.trimInsideQuoted, + parseParallel = opts.parseParallel, + ) + } + + else -> null + } + } + + override val testOrder: Int = 30_000 + + override fun toString(): String = "Tsv" +} + +private inline fun KType.isSubTypeOf(): Boolean = this.isSubtypeOf(typeOf()) + private const val READ_TSV = "readTsv" internal class DefaultReadTsvMethod(path: String?, arguments: MethodArguments) : diff --git a/dataframe-csv/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource b/dataframe-csv/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource new file mode 100644 index 0000000000..ebcae7710f --- /dev/null +++ b/dataframe-csv/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource @@ -0,0 +1,2 @@ +org.jetbrains.kotlinx.dataframe.io.Csv +org.jetbrains.kotlinx.dataframe.io.Tsv diff --git a/dataframe-excel/api/dataframe-excel.api b/dataframe-excel/api/dataframe-excel.api index 06541cf3c9..0320ffc31d 100644 --- a/dataframe-excel/api/dataframe-excel.api +++ b/dataframe-excel/api/dataframe-excel.api @@ -9,6 +9,47 @@ public final class org/jetbrains/kotlinx/dataframe/io/Excel : org/jetbrains/kotl public fun readDataFrame (Ljava/nio/file/Path;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; } +public final class org/jetbrains/kotlinx/dataframe/io/ExcelNEW : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static 
final field Companion Lorg/jetbrains/kotlinx/dataframe/io/ExcelNEW$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/ExcelNEW$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/ExcelNEW$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public synthetic fun (Ljava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy;ZZILkotlin/jvm/internal/DefaultConstructorMarker;)V + public synthetic fun (Ljava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy;ZZLkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Ljava/lang/String; + public final fun component2 ()I + public final fun component3 ()Ljava/lang/String; + public final fun component4-358K8uM ()Ljava/lang/String; + public final fun component5 ()Ljava/lang/Integer; + public final fun component6 ()Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy; + public final fun component7 ()Z + 
public final fun component8 ()Z + public final fun copy-vOPuZIo (Ljava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy;ZZ)Lorg/jetbrains/kotlinx/dataframe/io/ExcelNEW$Options; + public static synthetic fun copy-vOPuZIo$default (Lorg/jetbrains/kotlinx/dataframe/io/ExcelNEW$Options;Ljava/lang/String;ILjava/lang/String;Ljava/lang/String;Ljava/lang/Integer;Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy;ZZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/ExcelNEW$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getColumns ()Ljava/lang/String; + public final fun getFirstRowIsHeader ()Z + public final fun getNameRepairStrategy ()Lorg/jetbrains/kotlinx/dataframe/io/NameRepairStrategy; + public final fun getParseEmptyAsNull ()Z + public final fun getRowsCount ()Ljava/lang/Integer; + public final fun getSheetName ()Ljava/lang/String; + public final fun getSkipRows ()I + public final fun getStringColumns-358K8uM ()Ljava/lang/String; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/FormattingOptions { public fun (Ljava/lang/String;Lorg/apache/poi/ss/usermodel/DataFormatter;)V public synthetic fun (Ljava/lang/String;Lorg/apache/poi/ss/usermodel/DataFormatter;ILkotlin/jvm/internal/DefaultConstructorMarker;)V diff --git a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt index e76332bfc7..543b30b333 100644 --- a/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt +++ b/dataframe-excel/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/xlsx.kt @@ -45,6 +45,9 @@ import kotlin.io.path.exists import kotlin.io.path.fileSize import kotlin.io.path.inputStream import kotlin.io.path.outputStream +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf +import 
kotlin.reflect.typeOf import java.time.LocalDate as JavaLocalDate import java.time.LocalDateTime as JavaLocalDateTime import java.util.Date as JavaDate @@ -64,6 +67,135 @@ public class Excel : SupportedDataFrameFormat { DefaultReadExcelMethod(pathRepresentation) } +public class ExcelNEW : DataFrameReadSource { + + public data class Options( + val sheetName: String? = null, + val skipRows: Int = 0, + val columns: String? = null, + val stringColumns: StringColumns? = null, + val rowsCount: Int? = null, + val nameRepairStrategy: NameRepairStrategy = NameRepairStrategy.CHECK_UNIQUE, + val firstRowIsHeader: Boolean = true, + val parseEmptyAsNull: Boolean = true, + ) : DataFrameReadOptions + + // String reference paths are normalized to URL by readSourceImpl, so no String entry here; + // Excel is binary, so raw String content isn't a meaningful input either. + override val supportedTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) + + public companion object { + internal val EXTENSIONS: Set = setOf("xls", "xlsx") + internal val MIME_TYPES: Set = setOf( + "application/vnd.ms-excel", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + "application/x-tika-ooxml", + "application/x-tika-msoffice", + ) + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is Options) return false + val ext = sourceInfo.extension?.lowercase() + if (ext != null && ext !in EXTENSIONS) return false + val mime = sourceInfo.mimeType?.lowercase() + if (mime != null && mime !in MIME_TYPES) return false + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrame<*>? { + val opts = (options ?: Options()) as Options + val kType = sourceInfo.kType + + val url: URL? = when { + kType.isSubTypeOf() -> source as? 
URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return DataFrame.readExcel( + url = url, + sheetName = opts.sheetName, + skipRows = opts.skipRows, + columns = opts.columns, + stringColumns = opts.stringColumns, + rowsCount = opts.rowsCount, + nameRepairStrategy = opts.nameRepairStrategy, + firstRowIsHeader = opts.firstRowIsHeader, + parseEmptyAsNull = opts.parseEmptyAsNull, + ) + } + + return when { + kType.isSubTypeOf() -> + (source as? InputStream)?.let { stream -> + DataFrame.readExcel( + inputStream = stream, + sheetName = opts.sheetName, + skipRows = opts.skipRows, + columns = opts.columns, + stringColumns = opts.stringColumns, + rowsCount = opts.rowsCount, + nameRepairStrategy = opts.nameRepairStrategy, + firstRowIsHeader = opts.firstRowIsHeader, + parseEmptyAsNull = opts.parseEmptyAsNull, + ) + } + + kType.isSubTypeOf() -> + (source as? Workbook)?.let { wb -> + DataFrame.readExcel( + wb = wb, + sheetName = opts.sheetName, + skipRows = opts.skipRows, + columns = opts.columns, + formattingOptions = opts.stringColumns?.toFormattingOptions(), + rowsCount = opts.rowsCount, + nameRepairStrategy = opts.nameRepairStrategy, + firstRowIsHeader = opts.firstRowIsHeader, + parseEmptyAsNull = opts.parseEmptyAsNull, + ) + } + + kType.isSubTypeOf() -> + (source as? Sheet)?.let { sheet -> + // readExcel(Sheet) has no sheetName parameter — the sheet is already selected. 
+ DataFrame.readExcel( + sheet = sheet, + columns = opts.columns, + formattingOptions = opts.stringColumns?.toFormattingOptions(), + skipRows = opts.skipRows, + rowsCount = opts.rowsCount, + nameRepairStrategy = opts.nameRepairStrategy, + firstRowIsHeader = opts.firstRowIsHeader, + parseEmptyAsNull = opts.parseEmptyAsNull, + ) + } + + else -> null + } + } + + override val testOrder: Int = 40_000 + + override fun toString(): String = "Xlsx" +} + +private inline fun KType.isSubTypeOf(): Boolean = this.isSubtypeOf(typeOf()) + private const val MESSAGE_REMOVE_1_1 = "Will be removed in 1.1." internal const val READ_EXCEL_OLD = "This function is only here for binary compatibility. $MESSAGE_REMOVE_1_1" diff --git a/dataframe-excel/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource b/dataframe-excel/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource new file mode 100644 index 0000000000..9ceeadf039 --- /dev/null +++ b/dataframe-excel/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource @@ -0,0 +1 @@ +org.jetbrains.kotlinx.dataframe.io.ExcelNEW diff --git a/dataframe-jdbc/api/dataframe-jdbc.api b/dataframe-jdbc/api/dataframe-jdbc.api index 43b8f1cf82..4b3c3fc4e3 100644 --- a/dataframe-jdbc/api/dataframe-jdbc.api +++ b/dataframe-jdbc/api/dataframe-jdbc.api @@ -25,6 +25,42 @@ public final class org/jetbrains/kotlinx/dataframe/io/Jdbc : org/jetbrains/kotli public fun readDataFrame (Ljava/nio/file/Path;Ljava/util/List;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; } +public final class org/jetbrains/kotlinx/dataframe/io/Jdbc2 : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull 
(Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/Jdbc2$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (Ljava/lang/String;Ljava/lang/Integer;ZLorg/jetbrains/kotlinx/dataframe/io/db/DbType;ZLkotlin/jvm/functions/Function1;Ljava/sql/Connection;)V + public synthetic fun (Ljava/lang/String;Ljava/lang/Integer;ZLorg/jetbrains/kotlinx/dataframe/io/db/DbType;ZLkotlin/jvm/functions/Function1;Ljava/sql/Connection;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Ljava/lang/String; + public final fun component2 ()Ljava/lang/Integer; + public final fun component3 ()Z + public final fun component4 ()Lorg/jetbrains/kotlinx/dataframe/io/db/DbType; + public final fun component5 ()Z + public final fun component6 ()Lkotlin/jvm/functions/Function1; + public final fun component7 ()Ljava/sql/Connection; + public final fun copy (Ljava/lang/String;Ljava/lang/Integer;ZLorg/jetbrains/kotlinx/dataframe/io/db/DbType;ZLkotlin/jvm/functions/Function1;Ljava/sql/Connection;)Lorg/jetbrains/kotlinx/dataframe/io/Jdbc2$Options; + public static synthetic fun copy$default 
(Lorg/jetbrains/kotlinx/dataframe/io/Jdbc2$Options;Ljava/lang/String;Ljava/lang/Integer;ZLorg/jetbrains/kotlinx/dataframe/io/db/DbType;ZLkotlin/jvm/functions/Function1;Ljava/sql/Connection;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/Jdbc2$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getConfigureStatement ()Lkotlin/jvm/functions/Function1; + public final fun getDbType ()Lorg/jetbrains/kotlinx/dataframe/io/db/DbType; + public final fun getInferNullability ()Z + public final fun getLimit ()Ljava/lang/Integer; + public final fun getResultSetConnection ()Ljava/sql/Connection; + public final fun getSqlQueryOrTableName ()Ljava/lang/String; + public final fun getStrictValidation ()Z + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/JdbcSchemaKt { public static final fun buildCodeForDB (Ljava/net/URL;Ljava/lang/String;)Ljava/lang/String; public static final fun getDatabaseCodeGenReader (Lorg/jetbrains/kotlinx/dataframe/codeGen/CodeGenerator$Companion;)Lkotlin/jvm/functions/Function2; diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt index a5307b96d9..b78be745f1 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/Jdbc.kt @@ -5,9 +5,19 @@ import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.codeGen.AbstractDefaultReadMethod import org.jetbrains.kotlinx.dataframe.codeGen.Code import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod +import org.jetbrains.kotlinx.dataframe.io.db.DbType +import org.jetbrains.kotlinx.dataframe.io.db.extractDBTypeFromConnection +import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import java.io.File import java.io.InputStream import java.nio.file.Path +import 
java.sql.Connection +import java.sql.PreparedStatement +import java.sql.ResultSet +import javax.sql.DataSource +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf +import kotlin.reflect.typeOf // TODO: https://github.com/Kotlin/dataframe/issues/450 public class Jdbc : @@ -36,6 +46,152 @@ public class Jdbc : DefaultReadJdbcMethod(pathRepresentation) } +/** + * [DataFrameReadSource] for JDBC. + * + * Reading from JDBC always needs a "what" (a SQL query or table name) — unlike a file, a [Connection] doesn't + * carry that instruction. Provide it via [Options.sqlQueryOrTableName]. The only exception is [ResultSet], + * which is already an executed query. + * + * Supported source types: [Connection], [DataSource], [DbConnectionConfig], [ResultSet]. + * + * `readAllSqlTables` returns a `Map` and doesn't fit the single-DataFrame contract; it + * remains as a direct API call. + */ +public class Jdbc2 : DataFrameReadSource { + + public data class Options( + /** + * SQL query (e.g. `"SELECT * FROM users"`) or table name (e.g. `"users"`). + * Required for [Connection], [DataSource], and [DbConnectionConfig] sources. + * Ignored for [ResultSet] (it's already an executed query). + */ + val sqlQueryOrTableName: String? = null, + val limit: Int? = null, + val inferNullability: Boolean = true, + /** Optional, auto-detected from the source when `null`. */ + val dbType: DbType? = null, + val strictValidation: Boolean = true, + val configureStatement: (PreparedStatement) -> Unit = {}, + /** + * Only used when the source is a [ResultSet] and [dbType] is `null`; provides a [Connection] + * to auto-detect the database type. Ignored otherwise. + */ + val resultSetConnection: Connection? 
= null, + ) : DataFrameReadOptions + + override val supportedTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is Options) return false + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + override fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrame<*>? { + val opts = (options ?: Options()) as Options + return when (source) { + is ResultSet -> when { + opts.dbType != null -> + DataFrame.readResultSet(source, opts.dbType, opts.limit, opts.inferNullability) + + opts.resultSetConnection != null -> + DataFrame.readResultSet( + source, + opts.resultSetConnection, + opts.limit, + opts.inferNullability, + ) + + // Without dbType or a connection we can't read a ResultSet — fall through. + else -> null + } + + is Connection -> opts.sqlQueryOrTableName?.let { + source.readDataFrame( + sqlQueryOrTableName = it, + limit = opts.limit, + inferNullability = opts.inferNullability, + dbType = opts.dbType, + strictValidation = opts.strictValidation, + configureStatement = opts.configureStatement, + ) + } + + is DataSource -> opts.sqlQueryOrTableName?.let { + source.readDataFrame( + sqlQueryOrTableName = it, + limit = opts.limit, + inferNullability = opts.inferNullability, + dbType = opts.dbType, + strictValidation = opts.strictValidation, + configureStatement = opts.configureStatement, + ) + } + + is DbConnectionConfig -> opts.sqlQueryOrTableName?.let { + source.readDataFrame( + sqlQueryOrTableName = it, + limit = opts.limit, + inferNullability = opts.inferNullability, + dbType = opts.dbType, + strictValidation = opts.strictValidation, + configureStatement = opts.configureStatement, + ) + } + + else -> null + } + } + + override fun readDataFrameSchemaOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): 
DataFrameSchema? { + val opts = (options ?: Options()) as Options + return when (source) { + // ResultSet has a true zero-row metadata-only path. + is ResultSet -> when { + opts.dbType != null -> + DataFrameSchema.readResultSet(source, opts.dbType) + + opts.resultSetConnection != null -> + DataFrameSchema.readResultSet(source, extractDBTypeFromConnection(opts.resultSetConnection)) + + else -> null + } + + is Connection -> opts.sqlQueryOrTableName?.let { + source.readDataFrameSchema(sqlQueryOrTableName = it, dbType = opts.dbType) + } + + is DataSource -> opts.sqlQueryOrTableName?.let { + source.readDataFrameSchema(sqlQueryOrTableName = it, dbType = opts.dbType) + } + + is DbConnectionConfig -> opts.sqlQueryOrTableName?.let { + source.readDataFrameSchema(sqlQueryOrTableName = it, dbType = opts.dbType) + } + + else -> null + } + } + + override val testOrder: Int = 50_000 + + override fun toString(): String = "Jdbc" +} + private fun DataFrame.Companion.readJDBC(stream: File): DataFrame<*> { TODO("Not yet implemented") } diff --git a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt index e454dec406..80b54229b3 100644 --- a/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt +++ b/dataframe-jdbc/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/db/DuckDb.kt @@ -43,6 +43,7 @@ import org.duckdb.JsonNode import org.jetbrains.kotlinx.dataframe.AnyRow import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.api.Infer +import org.jetbrains.kotlinx.dataframe.api.ParserOptions import org.jetbrains.kotlinx.dataframe.api.asColumnGroup import org.jetbrains.kotlinx.dataframe.api.asDataColumn import org.jetbrains.kotlinx.dataframe.api.inferType @@ -158,7 +159,7 @@ public object DuckDb : AdvancedDbType("duckdb") { .withColumnBuilder(targetSchema = null) { name, values, inferNullability -> values .toColumn(name, if 
(inferNullability) Infer.Nulls else Infer.None) - .tryParse() + .tryParse(ParserOptions(parseToDataFrameReadSource = true)) .inferType() } diff --git a/dataframe-json/api/dataframe-json.api b/dataframe-json/api/dataframe-json.api index 368faf4a12..46885ef8e5 100644 --- a/dataframe-json/api/dataframe-json.api +++ b/dataframe-json/api/dataframe-json.api @@ -40,6 +40,40 @@ public final class org/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic : jav public static fun values ()[Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic; } +public final class org/jetbrains/kotlinx/dataframe/io/Json : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/Json$Companion; + public fun ()V + public fun acceptsSource (Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/Json$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/Json$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun 
(Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Ljava/util/List;Z)V + public synthetic fun (Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Ljava/util/List;ZILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Ljava/util/List; + public final fun component2 ()Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic; + public final fun component3 ()Ljava/util/List; + public final fun component4 ()Z + public final fun copy (Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Ljava/util/List;Z)Lorg/jetbrains/kotlinx/dataframe/io/Json$Options; + public static synthetic fun copy$default (Lorg/jetbrains/kotlinx/dataframe/io/Json$Options;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Ljava/util/List;ZILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/Json$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getHeader ()Ljava/util/List; + public final fun getKeyValuePaths ()Ljava/util/List; + public final fun getTypeClashTactic ()Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic; + public final fun getUnifyNumbers ()Z + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/JsonKt { public static final fun readJson (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/File;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame; public static final fun readJson (Lorg/jetbrains/kotlinx/dataframe/DataFrame$Companion;Ljava/io/InputStream;Ljava/util/List;Ljava/util/List;Lorg/jetbrains/kotlinx/dataframe/io/JSON$TypeClashTactic;Z)Lorg/jetbrains/kotlinx/dataframe/DataFrame; diff --git a/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt b/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt index 4949d1f104..7a1e9f8532 100644 --- 
a/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt +++ b/dataframe-json/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/json.kt @@ -23,6 +23,8 @@ import org.jetbrains.kotlinx.dataframe.impl.io.encodeDataFrameWithMetadata import org.jetbrains.kotlinx.dataframe.impl.io.encodeFrame import org.jetbrains.kotlinx.dataframe.impl.io.encodeRow import org.jetbrains.kotlinx.dataframe.impl.io.readJsonImpl +import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions.Companion.GZIP_ON +import org.jetbrains.kotlinx.dataframe.io.Base64ImageEncodingOptions.Companion.LIMIT_SIZE_ON import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic import org.jetbrains.kotlinx.dataframe.io.JSON.TypeClashTactic.ARRAY_AND_VALUE_COLUMNS import java.io.File @@ -30,8 +32,113 @@ import java.io.InputStream import java.net.URL import java.nio.file.Path import kotlin.io.path.writeText +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf import kotlin.reflect.typeOf +public class Json : DataFrameReadSource { + + public data class Options( + val header: List = emptyList(), + val typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS, + val keyValuePaths: List = emptyList(), + val unifyNumbers: Boolean = true, + ) : DataFrameReadOptions + + override val supportedTypes: Set = + setOf( + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + typeOf(), + ) + + public companion object { + internal const val EXTENSION = "json" + internal val MIME_TYPES = setOf( + "application/json", + "application/x-json", + "text/json", + "text/x-json", + ) + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is Options) return false + if (sourceInfo.extension?.lowercase()?.equals(EXTENSION) == false) return false + if (sourceInfo.mimeType != null && sourceInfo.mimeType !in MIME_TYPES) return false + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + 
@OptIn(ExperimentalSerializationApi::class) + override fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrame<*>? { + val opts = (options ?: Options()) as Options + val kType = sourceInfo.kType + + val url: URL? = when { + kType.isSubTypeOf() -> source as? URL + kType.isSubTypeOf() -> (source as? Path)?.toUri()?.toURL() + kType.isSubTypeOf() -> (source as? File)?.toPath()?.toUri()?.toURL() + else -> null + } + if (url != null) { + return DataFrame.readJson( + url = url, + header = opts.header, + typeClashTactic = opts.typeClashTactic, + keyValuePaths = opts.keyValuePaths, + unifyNumbers = opts.unifyNumbers, + ) + } + + val element: JsonElement = when { + kType.isSubTypeOf() -> + (source as? InputStream)?.let { Json.decodeFromStream(it) } + + kType.isSubTypeOf() -> + (source as? String)?.let { + if (it.isNotJson()) return null + Json.decodeFromString(it) + } + + kType.isSubTypeOf() -> + source as? JsonElement + + else -> null + } ?: return null + + return readJsonImpl( + parsed = element, + header = opts.header, + typeClashTactic = opts.typeClashTactic, + keyValuePaths = opts.keyValuePaths, + unifyNumbers = opts.unifyNumbers, + ) + } + + override val testOrder: Int = 10_000 + + override fun toString(): String = "Json" + + // early-exit check for String to see if it's definitely not json + private fun String.isNotJson(): Boolean = + trim().let { + it.isEmpty() || + !( + (it.startsWith('{') && it.endsWith('}')) || + (it.startsWith('[') && it.endsWith(']')) + ) + } +} + +private inline fun KType.isSubTypeOf(): Boolean = this.isSubtypeOf(typeOf()) + public class JSON( private val typeClashTactic: TypeClashTactic = ARRAY_AND_VALUE_COLUMNS, private val keyValuePaths: List = emptyList(), diff --git a/dataframe-json/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource 
b/dataframe-json/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource new file mode 100644 index 0000000000..bb9b992aea --- /dev/null +++ b/dataframe-json/src/main/resources/META-INF/services/org.jetbrains.kotlinx.dataframe.io.DataFrameReadSource @@ -0,0 +1 @@ +org.jetbrains.kotlinx.dataframe.io.Json diff --git a/dataframe-json/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParseJsonColumnTests.kt b/dataframe-json/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParseJsonColumnTests.kt new file mode 100644 index 0000000000..1a6f3430bc --- /dev/null +++ b/dataframe-json/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParseJsonColumnTests.kt @@ -0,0 +1,215 @@ +package org.jetbrains.kotlinx.dataframe.io + +import io.kotest.matchers.shouldBe +import org.intellij.lang.annotations.Language +import org.jetbrains.kotlinx.dataframe.DataFrame +import org.jetbrains.kotlinx.dataframe.DataRow +import org.jetbrains.kotlinx.dataframe.api.asColumnGroup +import org.jetbrains.kotlinx.dataframe.api.asFrameColumn +import org.jetbrains.kotlinx.dataframe.api.columnOf +import org.jetbrains.kotlinx.dataframe.api.dataFrameOf +import org.jetbrains.kotlinx.dataframe.api.getColumnGroup +import org.jetbrains.kotlinx.dataframe.api.getFrameColumn +import org.jetbrains.kotlinx.dataframe.api.isColumnGroup +import org.jetbrains.kotlinx.dataframe.api.isFrameColumn +import org.jetbrains.kotlinx.dataframe.api.parse +import org.jetbrains.kotlinx.dataframe.api.parser +import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup +import org.jetbrains.kotlinx.dataframe.columns.FrameColumn +import org.junit.jupiter.api.AfterAll +import org.junit.jupiter.api.BeforeAll +import kotlin.reflect.typeOf +import kotlin.test.Test + +/** + * Tests that strings containing JSON in a [String] column can be parsed via [parse], + * where JSON arrays become [DataFrame]s (forming a [FrameColumn]) + * and JSON objects become [DataRow]s (forming a [ColumnGroup]). 
+ */ +class ParseJsonColumnTests { + + companion object { + @[BeforeAll JvmStatic] + fun `setup ParserOptions`() { + DataFrame.parser.parseToDataFrameReadSource = true + } + + @[AfterAll JvmStatic] + fun `reset ParserOptions`() { + DataFrame.parser.resetToDefault() + } + } + + @Test + fun `parse column of json arrays into FrameColumn`() { + @Language("json") + val a = """[1, 2, 3]""" + + @Language("json") + val b = """[4, 5, 6]""" + + val col = columnOf(a, b) + val parsed = col.parse() + + parsed.isFrameColumn() shouldBe true + val frameCol = parsed.asFrameColumn() + frameCol.size() shouldBe 2 + frameCol[0]["value"].values().toList() shouldBe listOf(1, 2, 3) + frameCol[1]["value"].values().toList() shouldBe listOf(4, 5, 6) + } + + @Test + fun `parse column of json objects into ColumnGroup`() { + @Language("json") + val a = """{"x": 1, "y": "a"}""" + + @Language("json") + val b = """{"x": 2, "y": "b"}""" + + val col = columnOf(a, b) + val parsed = col.parse() + + parsed.isColumnGroup() shouldBe true + val group = parsed.asColumnGroup() + group.columnsCount() shouldBe 2 + group["x"].type() shouldBe typeOf() + group["y"].type() shouldBe typeOf() + group["x"].values().toList() shouldBe listOf(1, 2) + group["y"].values().toList() shouldBe listOf("a", "b") + } + + @Test + fun `parse dataframe column of json arrays into FrameColumn`() { + @Language("json") + val a = """[10, 20]""" + + @Language("json") + val b = """[30, 40, 50]""" + + val df = dataFrameOf("data")(a, b) + val parsed = df.parse("data") + + parsed.rowsCount() shouldBe 2 + parsed["data"].isFrameColumn() shouldBe true + val frameCol = parsed.getFrameColumn("data") + frameCol[0]["value"].values().toList() shouldBe listOf(10, 20) + frameCol[1]["value"].values().toList() shouldBe listOf(30, 40, 50) + } + + @Test + fun `parse dataframe column of json objects into ColumnGroup`() { + @Language("json") + val a = """{"name": "Alice", "age": 30}""" + + @Language("json") + val b = """{"name": "Bob", "age": 25}""" + + 
val df = dataFrameOf("person")(a, b) + val parsed = df.parse("person") + + parsed.rowsCount() shouldBe 2 + parsed["person"].isColumnGroup() shouldBe true + val group = parsed.getColumnGroup("person") + group.columnsCount() shouldBe 2 + group["name"].values().toList() shouldBe listOf("Alice", "Bob") + group["age"].values().toList() shouldBe listOf(30, 25) + group["name"].type() shouldBe typeOf() + group["age"].type() shouldBe typeOf() + } + + @Test + fun `parse column of json arrays of objects`() { + @Language("json") + val a = """[{"k": 1}, {"k": 2}]""" + + @Language("json") + val b = """[{"k": 3}, {"k": 4}, {"k": 5}]""" + + val parsed = columnOf(a, b).parse() + + parsed.isFrameColumn() shouldBe true + val frameCol = parsed.asFrameColumn() + frameCol.size() shouldBe 2 + frameCol[0]["k"].values().toList() shouldBe listOf(1, 2) + frameCol[1]["k"].values().toList() shouldBe listOf(3, 4, 5) + } + + @Test + fun `parse column of nested json objects`() { + @Language("json") + val a = """{"outer": {"inner": 1}}""" + + @Language("json") + val b = """{"outer": {"inner": 2}}""" + + val parsed = columnOf(a, b).parse() + + parsed.isColumnGroup() shouldBe true + val outer = parsed.asColumnGroup().getColumnGroup("outer") + outer["inner"].type() shouldBe typeOf() + outer["inner"].values().toList() shouldBe listOf(1, 2) + } + + @Test + fun `parse column of json objects containing arrays`() { + @Language("json") + val a = """{"name": "list1", "values": [1, 2, 3]}""" + + @Language("json") + val b = """{"name": "list2", "values": [4, 5]}""" + + val parsed = columnOf(a, b).parse() + + parsed.isColumnGroup() shouldBe true + val group = parsed.asColumnGroup() + group["name"].values().toList() shouldBe listOf("list1", "list2") + group["values"].type() shouldBe typeOf>() + group["values"].values().toList() shouldBe listOf(listOf(1, 2, 3), listOf(4, 5)) + } + + @Test + fun `parse column of json arrays with whitespace`() { + val col = columnOf(" [1, 2, 3] ", "\n[4, 5]\t") + val parsed = 
col.parse() + + parsed.isFrameColumn() shouldBe true + val frameCol = parsed.asFrameColumn() + frameCol.size() shouldBe 2 + frameCol[0]["value"].values().toList() shouldBe listOf(1, 2, 3) + frameCol[1]["value"].values().toList() shouldBe listOf(4, 5) + } + + @Test + fun `parse dataframe with multiple json columns`() { + @Language("json") + val obj1 = """{"a": 1}""" + + @Language("json") + val obj2 = """{"a": 2}""" + + @Language("json") + val arr1 = """[1, 2]""" + + @Language("json") + val arr2 = """[3, 4]""" + + val df = dataFrameOf("obj", "arr")( + obj1, + arr1, + obj2, + arr2, + ) + val parsed = df.parse() + + parsed.rowsCount() shouldBe 2 + parsed["obj"].isColumnGroup() shouldBe true + parsed["arr"].isFrameColumn() shouldBe true + + val objGroup = parsed.getColumnGroup("obj") + objGroup["a"].values().toList() shouldBe listOf(1, 2) + + val arrFrame = parsed.getFrameColumn("arr") + arrFrame[0]["value"].values().toList() shouldBe listOf(1, 2) + arrFrame[1]["value"].values().toList() shouldBe listOf(3, 4) + } +} diff --git a/dataframe-openapi-generator/api/dataframe-openapi-generator.api b/dataframe-openapi-generator/api/dataframe-openapi-generator.api index cc65f36f8e..76583b5641 100644 --- a/dataframe-openapi-generator/api/dataframe-openapi-generator.api +++ b/dataframe-openapi-generator/api/dataframe-openapi-generator.api @@ -19,6 +19,42 @@ public final class org/jetbrains/kotlinx/dataframe/io/OpenApi : org/jetbrains/ko public static synthetic fun readCodeForGeneration$default (Lorg/jetbrains/kotlinx/dataframe/io/OpenApi;Ljava/lang/String;Ljava/lang/String;ZZILjava/lang/Object;)Ljava/lang/String; } +public final class org/jetbrains/kotlinx/dataframe/io/OpenApi2 : org/jetbrains/kotlinx/dataframe/io/DataFrameReadSource { + public static final field Companion Lorg/jetbrains/kotlinx/dataframe/io/OpenApi2$Companion; + public fun ()V + public fun acceptsSource 
(Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Z + public fun getSupportedTypes ()Ljava/util/Set; + public fun getTestOrder ()I + public fun readDataFrameOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/DataFrame; + public fun readDataFrameSchemaOrNull (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Lorg/jetbrains/kotlinx/dataframe/schema/DataFrameSchema; + public fun readDataSchemaCodeOrNull-myXLQ2E (Ljava/lang/Object;Lorg/jetbrains/kotlinx/dataframe/io/DataSourceInfo;Ljava/lang/String;Lorg/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions;)Ljava/lang/String; + public fun toString ()Ljava/lang/String; +} + +public final class org/jetbrains/kotlinx/dataframe/io/OpenApi2$Companion { +} + +public final class org/jetbrains/kotlinx/dataframe/io/OpenApi2$Options : org/jetbrains/kotlinx/dataframe/io/DataFrameReadOptions { + public fun ()V + public fun (Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;)V + public synthetic fun (Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;ILkotlin/jvm/internal/DefaultConstructorMarker;)V + public final fun component1 ()Ljava/util/List; + public final fun component2 ()Lio/swagger/v3/parser/core/models/ParseOptions; + public final fun component3 ()Z + public final fun component4 ()Z + public final fun component5 ()Lorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility; + public final fun copy (Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;)Lorg/jetbrains/kotlinx/dataframe/io/OpenApi2$Options; + public static synthetic fun copy$default 
(Lorg/jetbrains/kotlinx/dataframe/io/OpenApi2$Options;Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;ILjava/lang/Object;)Lorg/jetbrains/kotlinx/dataframe/io/OpenApi2$Options; + public fun equals (Ljava/lang/Object;)Z + public final fun getAuth ()Ljava/util/List; + public final fun getExtensionProperties ()Z + public final fun getGenerateHelperCompanionObject ()Z + public final fun getParseOptions ()Lio/swagger/v3/parser/core/models/ParseOptions; + public final fun getVisibility ()Lorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility; + public fun hashCode ()I + public fun toString ()Ljava/lang/String; +} + public final class org/jetbrains/kotlinx/dataframe/io/ReadOpenapiKt { public static final fun readOpenApi (Ljava/lang/String;Ljava/lang/String;Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;)Ljava/lang/String; public static synthetic fun readOpenApi$default (Ljava/lang/String;Ljava/lang/String;Ljava/util/List;Lio/swagger/v3/parser/core/models/ParseOptions;ZZLorg/jetbrains/kotlinx/dataframe/codeGen/MarkerVisibility;ILjava/lang/Object;)Ljava/lang/String; diff --git a/dataframe-openapi-generator/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/OpenApi.kt b/dataframe-openapi-generator/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/OpenApi.kt index d3650fa791..ef7e087916 100644 --- a/dataframe-openapi-generator/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/OpenApi.kt +++ b/dataframe-openapi-generator/src/main/kotlin/org/jetbrains/kotlinx/dataframe/io/OpenApi.kt @@ -1,10 +1,152 @@ package org.jetbrains.kotlinx.dataframe.io +import io.swagger.v3.parser.core.models.AuthorizationValue +import io.swagger.v3.parser.core.models.ParseOptions +import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.annotations.DataSchema +import org.jetbrains.kotlinx.dataframe.api.CodeString import 
org.jetbrains.kotlinx.dataframe.codeGen.Code import org.jetbrains.kotlinx.dataframe.codeGen.DefaultReadDfMethod +import org.jetbrains.kotlinx.dataframe.codeGen.MarkerVisibility +import org.jetbrains.kotlinx.dataframe.schema.DataFrameSchema import java.io.File import java.io.InputStream +import java.net.URL +import java.nio.file.Path +import kotlin.io.path.readText +import kotlin.reflect.KType +import kotlin.reflect.full.isSubtypeOf +import kotlin.reflect.typeOf + +/** + * [DataFrameReadSource] for OpenAPI specifications. + * + * OpenAPI doesn't produce a `DataFrame` or a single `DataFrameSchema` — its output is a multi-marker code + * blob (interfaces + enums + typealiases). Only [readDataSchemaCodeOrNull] produces a result; + * [readDataFrameOrNull] and [readDataFrameSchemaOrNull] are overridden to always return `null`, so calling + * `DataFrame.readSource(openapiFile)` falls through to JSON, while `CodeString.readSource(openapiFile, name)` + * dispatches here. + * + * `.yaml`/`.yml` files are unambiguously OpenAPI; `.json` files are disambiguated at read time: + * [readDataSchemaCodeOrNull] returns `null` early when [isOpenApi]/[isOpenApiStr] reports that the content isn't + * actually an OpenAPI spec, letting the framework fall through to the JSON format for plain data. + */ +public class OpenApi2 : DataFrameReadSource { + + public data class Options( + val auth: List? = null, + val parseOptions: ParseOptions?
= null, + val extensionProperties: Boolean = false, + val generateHelperCompanionObject: Boolean = false, + val visibility: MarkerVisibility = MarkerVisibility.IMPLICIT_PUBLIC, + ) : DataFrameReadOptions + + override val supportedTypes: Set = + setOf(typeOf(), typeOf(), typeOf(), typeOf(), typeOf()) + + public companion object { + internal val EXTENSIONS: Set = setOf("yaml", "yml", "json") + internal val MIME_TYPES = setOf( + "application/vnd.oai.openapi", + "application/vnd.oai.openapi+json", + "application/vnd.oai.openapi.yaml", + "application/vnd.oai.openapi+yaml", + "text/x-yaml", + "text/yaml", + "application/x-yaml", + "application/yaml", + "application/x-json", + "application/json", + "text/x-json", + "text/json", + ) + } + + override fun acceptsSource(sourceInfo: DataSourceInfo, options: DataFrameReadOptions?): Boolean { + if (options != null && options !is Options) return false + val ext = sourceInfo.extension?.lowercase() + if (ext != null && ext !in EXTENSIONS) return false + if (sourceInfo.mimeType != null && sourceInfo.mimeType !in MIME_TYPES) return false + return supportedTypes.any { sourceInfo.kType.isSubtypeOf(it) } + } + + // OpenAPI doesn't produce a DataFrame. + override fun readDataFrameOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrame<*>? = null + + // ...nor a single DataFrameSchema, it can produce enums, typealiases, etc. + // so it only supports readDataSchemaCodeOrNull() + override fun readDataFrameSchemaOrNull( + source: Any, + sourceInfo: DataSourceInfo, + options: DataFrameReadOptions?, + ): DataFrameSchema? = null + + override fun readDataSchemaCodeOrNull( + source: Any, + sourceInfo: DataSourceInfo, + name: String, + options: DataFrameReadOptions?, + ): CodeString? { + val opts = (options ?: Options()) as Options + val kType = sourceInfo.kType + + // Resolve to OpenAPI-spec text, returning null if the content isn't OpenAPI. 
+ val text: String = when { + kType.isSubtypeOf(typeOf()) -> { + val url = (source as? URL) ?: return null + if (!isOpenApi(url)) return null + url.readText() + } + + kType.isSubtypeOf(typeOf()) -> { + val path = (source as? Path) ?: return null + if (!isOpenApi(path)) return null + path.readText() + } + + kType.isSubtypeOf(typeOf()) -> { + val file = (source as? File) ?: return null + if (!isOpenApi(file.toPath())) return null + file.readText() + } + + kType.isSubtypeOf(typeOf()) -> { + val text = (source as? String) ?: return null + if (!isOpenApiStr(text)) return null + text + } + + kType.isSubtypeOf(typeOf()) -> { + val text = (source as? InputStream)?.bufferedReader()?.readText() ?: return null + if (!isOpenApiStr(text)) return null + text + } + + else -> return null + } + + return CodeString( + readOpenApiAsString( + openApiAsString = text, + name = name, + auth = opts.auth, + options = opts.parseOptions, + extensionProperties = opts.extensionProperties, + generateHelperCompanionObject = opts.generateHelperCompanionObject, + visibility = opts.visibility, + ), + ) + } + + // Run before Json (10_000) so .json files get the OpenAPI content check first. + override val testOrder: Int = 9_000 + + override fun toString(): String = "OpenApi" +} /** * Allows for OpenApi type schemas to be converted to [DataSchema] interfaces. 
diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml index ed04b6e669..5f5cd5aa86 100644 --- a/gradle/libs.versions.toml +++ b/gradle/libs.versions.toml @@ -34,6 +34,7 @@ commonsCsv = "1.14.1" commonsCompress = "1.28.0" commonsIo = "2.21.0" commonsStatistics = "1.2" +tika = "3.3.0" serialization = "1.11.0" poi = "5.5.1" mariadb = "3.5.8" @@ -100,6 +101,7 @@ commonsCsv = { group = "org.apache.commons", name = "commons-csv", version.ref = commonsCompress = { group = "org.apache.commons", name = "commons-compress", version.ref = "commonsCompress" } commonsIo = { group = "commons-io", name = "commons-io", version.ref = "commonsIo" } commonsStatisticsDescriptive = { group = "org.apache.commons", name = "commons-statistics-descriptive", version.ref = "commonsStatistics" } +tika = { group = "org.apache.tika", name = "tika-core", version.ref = "tika" } # Serialization serialization-core = { group = "org.jetbrains.kotlinx", name = "kotlinx-serialization-core", version.ref = "serialization" }