Merge pull request #173 from Kotlin/new-open-api

OpenAPI/Swagger JSON type schema support + many small fixes I came across
Kotlin · Nov 25, 2022 · 0089ed3 · 0089ed3
2 parents 48a3594 + 4673b9c
commit 0089ed3
Show file tree

Hide file tree

Showing 119 changed files with 14,197 additions and 899 deletions.
diff --git a/.editorconfig b/.editorconfig
@@ -7,4 +7,7 @@ indent_size=4
 max_line_length=120
 
 [*.json]
-indent_size=2
+indent_size=2
+
+[*.yaml]
+indent_size=2
diff --git a/build.gradle.kts b/build.gradle.kts
@@ -9,6 +9,7 @@ plugins {
     kotlin("libs.publisher") version libs.versions.libsPublisher
     kotlin("plugin.serialization") version libs.versions.kotlin
     id("org.jetbrains.kotlinx.dataframe") version libs.versions.dataframe apply false
+    kotlin("jupyter.api") version libs.versions.kotlinJupyter apply false
 
     id("org.jetbrains.dokka") version libs.versions.dokka
     id("org.jetbrains.kotlinx.kover") version libs.versions.kover
@@ -73,14 +74,13 @@ group = "org.jetbrains.kotlinx"
 fun detectVersion(): String {
     val buildNumber = rootProject.findProperty("build.number") as String?
     val versionProp = property("version") as String
-    return if(buildNumber != null) {
+    return if (buildNumber != null) {
         if (rootProject.findProperty("build.number.detection") == "true") {
             "$versionProp-dev-$buildNumber"
         } else {
             buildNumber
         }
-    }
-    else if(hasProperty("release")) {
+    } else if (hasProperty("release")) {
         versionProp
     } else {
         "$versionProp-dev"
@@ -104,15 +104,15 @@ kotlinPublications {
     fairDokkaJars.set(false)
 
     sonatypeSettings(
-            project.findProperty("kds.sonatype.user") as String?,
-            project.findProperty("kds.sonatype.password") as String?,
-            "dataframe project, v. ${project.version}"
+        project.findProperty("kds.sonatype.user") as String?,
+        project.findProperty("kds.sonatype.password") as String?,
+        "dataframe project, v. ${project.version}"
     )
 
     signingCredentials(
-            project.findProperty("kds.sign.key.id") as String?,
-            project.findProperty("kds.sign.key.private") as String?,
-            project.findProperty("kds.sign.key.passphrase") as String?
+        project.findProperty("kds.sign.key.id") as String?,
+        project.findProperty("kds.sign.key.private") as String?,
+        project.findProperty("kds.sign.key.passphrase") as String?
     )
 
     pom {

diff --git a/core/build.gradle.kts b/core/build.gradle.kts
@@ -1,10 +1,9 @@
-
 @Suppress("DSL_SCOPE_VIOLATION", "UnstableApiUsage")
 plugins {
     kotlin("jvm")
     kotlin("libs.publisher")
     kotlin("plugin.serialization")
-    kotlin("jupyter.api") version libs.versions.kotlinJupyter
+    kotlin("jupyter.api")
 
     id("io.github.devcrocod.korro") version libs.versions.korro
     id("org.jetbrains.dataframe.generator")
@@ -25,16 +24,16 @@ repositories {
 }
 
 dependencies {
+    api(libs.kotlin.reflect)
     implementation(libs.kotlin.stdlib)
     implementation(libs.kotlin.stdlib.jdk8)
-    implementation(libs.kotlin.reflect)
 
     api(libs.commonsCsv)
     implementation(libs.klaxon)
     implementation(libs.fuel)
 
-    implementation(libs.kotlin.datetimeJvm)
-    implementation("com.squareup:kotlinpoet:1.11.0")
+    api(libs.kotlin.datetimeJvm)
+    implementation(libs.kotlinpoet)
 
     testImplementation(libs.junit)
     testImplementation(libs.kotestAssertions) {
@@ -114,7 +113,8 @@ kotlinter {
         "experimental:annotation",
         "max-line-length",
         "filename",
-        "comment-spacing"
+        "comment-spacing",
+        "curly-spacing",
     )
 }
 
@@ -137,10 +137,12 @@ tasks.withType<org.jetbrains.kotlin.gradle.tasks.KotlinCompile> {
 tasks.test {
     maxHeapSize = "2048m"
     extensions.configure(kotlinx.kover.api.KoverTaskExtension::class) {
-        excludes.set(listOf(
-            "org.jetbrains.kotlinx.dataframe.jupyter.*",
-            "org.jetbrains.kotlinx.dataframe.jupyter.SampleNotebooksTests"
-        ))
+        excludes.set(
+            listOf(
+                "org.jetbrains.kotlinx.dataframe.jupyter.*",
+                "org.jetbrains.kotlinx.dataframe.jupyter.SampleNotebooksTests"
+            )
+        )
     }
 }
 
@@ -168,6 +170,7 @@ artifacts {
     }
 }
 
+// Temporarily disable this block, then re-enable it, if updating the plugin breaks the build
 dataframes {
     schema {
         sourceSet = "test"

diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/DataFrame.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/DataFrame.kt
@@ -13,6 +13,7 @@ import org.jetbrains.kotlinx.dataframe.columns.UnresolvedColumnsPolicy
 import org.jetbrains.kotlinx.dataframe.impl.DataFrameImpl
 import org.jetbrains.kotlinx.dataframe.impl.DataFrameSize
 import org.jetbrains.kotlinx.dataframe.impl.getColumnsImpl
+import org.jetbrains.kotlinx.dataframe.impl.headPlusArray
 import org.jetbrains.kotlinx.dataframe.impl.headPlusIterable
 import org.jetbrains.kotlinx.dataframe.impl.schema.createEmptyDataFrameOf
 import kotlin.reflect.KType
@@ -53,10 +54,13 @@ public interface DataFrame<out T> : Aggregatable<T>, ColumnsContainer<T> {
 
     // region get columns
 
-    override operator fun <C> get(columns: ColumnsSelector<T, C>): List<DataColumn<C>> = getColumnsImpl(UnresolvedColumnsPolicy.Fail, columns)
+    override operator fun <C> get(columns: ColumnsSelector<T, C>): List<DataColumn<C>> =
+        getColumnsImpl(UnresolvedColumnsPolicy.Fail, columns)
+
     public operator fun get(first: Column, vararg other: Column): DataFrame<T> = select(listOf(first) + other)
     public operator fun get(first: String, vararg other: String): DataFrame<T> = select(listOf(first) + other)
-    public operator fun get(columnRange: ClosedRange<String>): DataFrame<T> = select { columnRange.start..columnRange.endInclusive }
+    public operator fun get(columnRange: ClosedRange<String>): DataFrame<T> =
+        select { columnRange.start..columnRange.endInclusive }
 
     // endregion
 
@@ -65,8 +69,11 @@ public interface DataFrame<out T> : Aggregatable<T>, ColumnsContainer<T> {
     public operator fun get(index: Int): DataRow<T>
     public operator fun get(indices: Iterable<Int>): DataFrame<T> = getRows(indices)
     public operator fun get(range: IntRange): DataFrame<T> = getRows(range)
-    public operator fun get(vararg ranges: IntRange): DataFrame<T> = getRows(ranges.asSequence().flatMap { it.asSequence() }.asIterable())
-    public operator fun get(firstIndex: Int, vararg otherIndices: Int): DataFrame<T> = get(headPlusIterable(firstIndex, otherIndices.asIterable()))
+    public operator fun get(first: IntRange, vararg ranges: IntRange): DataFrame<T> =
+        getRows(headPlusArray(first, ranges).asSequence().flatMap { it.asSequence() }.asIterable())
+
+    public operator fun get(firstIndex: Int, vararg otherIndices: Int): DataFrame<T> =
+        get(headPlusIterable(firstIndex, otherIndices.asIterable()))
 
     // endregion
 

diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/annotations/ImportDataSchema.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/annotations/ImportDataSchema.kt
@@ -1,8 +1,14 @@
 package org.jetbrains.kotlinx.dataframe.annotations
 
+import org.jetbrains.kotlinx.dataframe.api.JsonPath
+import org.jetbrains.kotlinx.dataframe.api.KeyValueProperty
+import org.jetbrains.kotlinx.dataframe.columns.ColumnGroup
+import org.jetbrains.kotlinx.dataframe.columns.FrameColumn
+import org.jetbrains.kotlinx.dataframe.io.JSON
+
 /**
  * Annotation preprocessing will generate a DataSchema interface from the data at `path`.
- * Data must be of supported format: CSV, JSON, Apache Arrow, Excel.
+ * Data must be in a supported format: CSV, JSON, Apache Arrow, Excel, OpenAPI (Swagger) in YAML/JSON.
  * Generated data schema has properties inferred from data and a companion object with `read method`.
  * `read method` is either `readCSV` or `readJson` that returns `DataFrame<name>`
  *
@@ -15,7 +21,8 @@ package org.jetbrains.kotlinx.dataframe.annotations
  * @param normalizationDelimiters if not empty, split property names by delimiters,
  * lowercase parts and join to camel case. Set empty list to disable normalization
  * @param withDefaultPath if `true`, generate `defaultPath` property to the data schema's companion object and make it default argument for a `read method`
- * @param csvOptions options to parse CSV data. Not used when data is JSON
+ * @param csvOptions options to parse CSV data. Not used when data is not CSV
+ * @param jsonOptions options to parse JSON data. Not used when data is not JSON
  */
 @Retention(AnnotationRetention.SOURCE)
 @Target(AnnotationTarget.FILE)
@@ -26,13 +33,29 @@ public annotation class ImportDataSchema(
     val visibility: DataSchemaVisibility = DataSchemaVisibility.IMPLICIT_PUBLIC,
     val normalizationDelimiters: CharArray = ['\t', ' ', '_'],
     val withDefaultPath: Boolean = true,
-    val csvOptions: CsvOptions = CsvOptions(',')
+    val csvOptions: CsvOptions = CsvOptions(','),
+    val jsonOptions: JsonOptions = JsonOptions(),
 )
 
 public enum class DataSchemaVisibility {
     INTERNAL, IMPLICIT_PUBLIC, EXPLICIT_PUBLIC
 }
 
 public annotation class CsvOptions(
-    val delimiter: Char
+    public val delimiter: Char,
+)
+
+public annotation class JsonOptions(
+
+    /** Allows the choice of how to handle type clashes when reading a JSON file. */
+    public val typeClashTactic: JSON.TypeClashTactic = JSON.TypeClashTactic.ARRAY_AND_VALUE_COLUMNS,
+
+    /**
+     * List of [JsonPath]s where instead of a [ColumnGroup], a [FrameColumn]<[KeyValueProperty]>
+     *     will be created.
+     *
+     * Example:
+     * `["""$["store"]["book"][*]["author"]"""]`
+     */
+    public val keyValuePaths: Array<String> = [],
 )
diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/DataRowApi.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/DataRowApi.kt
@@ -67,13 +67,17 @@ public operator fun AnyRow.contains(column: KProperty<*>): Boolean = containsKey
 
 @OptIn(ExperimentalTypeInference::class)
 @OverloadResolutionByLambdaReturnType
-public fun <T> DataRow<T>.diff(expression: RowExpression<T, Double>): Double? = prev()?.let { p -> expression(this, this) - expression(p, p) }
+public fun <T> DataRow<T>.diff(expression: RowExpression<T, Double>): Double? =
+    prev()?.let { p -> expression(this, this) - expression(p, p) }
 
-public fun <T> DataRow<T>.diff(expression: RowExpression<T, Int>): Int? = prev()?.let { p -> expression(this, this) - expression(p, p) }
+public fun <T> DataRow<T>.diff(expression: RowExpression<T, Int>): Int? =
+    prev()?.let { p -> expression(this, this) - expression(p, p) }
 
-public fun <T> DataRow<T>.diff(expression: RowExpression<T, Long>): Long? = prev()?.let { p -> expression(this, this) - expression(p, p) }
+public fun <T> DataRow<T>.diff(expression: RowExpression<T, Long>): Long? =
+    prev()?.let { p -> expression(this, this) - expression(p, p) }
 
-public fun <T> DataRow<T>.diff(expression: RowExpression<T, Float>): Float? = prev()?.let { p -> expression(this, this) - expression(p, p) }
+public fun <T> DataRow<T>.diff(expression: RowExpression<T, Float>): Float? =
+    prev()?.let { p -> expression(this, this) - expression(p, p) }
 
 public fun AnyRow.columnsCount(): Int = df().ncol
 public fun AnyRow.columnNames(): List<String> = df().columnNames()

diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/JsonPath.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/JsonPath.kt
@@ -0,0 +1,69 @@
+package org.jetbrains.kotlinx.dataframe.api
+
+import org.intellij.lang.annotations.Language
+import java.io.Serializable
+
+/**
+ * Simplistic JSON path implementation.
+ * Supports just keys (in bracket notation), double quotes, arrays and wildcards.
+ *
+ * Examples:
+ * `$["store"]["book"][*]["author"]`
+ *
+ * `$[1]` will match `$[*]`
+ */
+@JvmInline
+public value class JsonPath(@Language("jsonpath") public val path: String = "$") : Serializable {
+
+    public fun append(name: String): JsonPath = JsonPath("$path[\"$name\"]")
+
+    public fun appendWildcard(): JsonPath = JsonPath("$path[*]")
+
+    public fun appendArrayWithIndex(index: Int): JsonPath = JsonPath("$path[$index]")
+
+    public fun appendArrayWithWildcard(): JsonPath = JsonPath("$path[*]")
+
+    public fun replaceLastWildcardWithIndex(index: Int): JsonPath = JsonPath(
+        path.toCharArray().let { chars ->
+            val lastStarIndex = chars.lastIndexOf('*')
+            chars.flatMapIndexed { i, c ->
+                if (i == lastStarIndex) index.toString().toCharArray().toList()
+                else listOf(c)
+            }.joinToString("")
+        }
+    )
+
+    public fun prepend(name: String): JsonPath = JsonPath(
+        "\$[\"$name\"]" + path.removePrefix("$")
+    )
+
+    public fun prependWildcard(): JsonPath = JsonPath(
+        "\$[*]" + path.removePrefix("$")
+    )
+
+    public fun prependArrayWithIndex(index: Int): JsonPath = JsonPath(
+        "\$[$index]" + path.removePrefix("$")
+    )
+
+    public fun prependArrayWithWildcard(): JsonPath = JsonPath(
+        "\$[*]" + path.removePrefix("$")
+    )
+
+    public fun erasedIndices(): JsonPath = JsonPath(
+        path.replace("""\[[0-9]+]""".toRegex(), "[*]")
+    )
+
+    private fun splitPath() = path.split("[", "]").filter { it.isNotBlank() }
+
+    public fun matches(other: JsonPath): Boolean =
+        path == other.path ||
+            run {
+                val path = splitPath()
+                val otherPath = other.splitPath()
+
+                if (path.size != otherPath.size) false
+                else path.zip(otherPath).all { (p, o) ->
+                    p == o || p == "*" || o == "*"
+                }
+            }
+}
diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/KeyValueProperty.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/KeyValueProperty.kt
@@ -0,0 +1,15 @@
+package org.jetbrains.kotlinx.dataframe.api
+
+import org.jetbrains.kotlinx.dataframe.annotations.ColumnName
+import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
+
+/** A [DataSchema] interface / class can implement this if it represents a map-like data schema (so key: value). */
+@DataSchema
+public interface KeyValueProperty<T> {
+    // needs to be explicitly overridden in @DataSchema interface, otherwise extension functions won't generate (TODO)
+    public val key: String
+
+    // needs to be explicitly overridden in @DataSchema interface, otherwise type will be read as `T` and extensions won't generate (TODO)
+    @ColumnName("value")
+    public val `value`: T
+}
diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/TypeConversions.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/TypeConversions.kt
@@ -298,7 +298,7 @@ public fun <T, G> DataFrame<T>.asGroupBy(selector: ColumnSelector<T, DataFrame<G
 
 public fun <T> DataRow<T>.toDataFrame(): DataFrame<T> = owner[index..index]
 
-public fun AnyRow.toMap(): Map<String, Any?> = df().columns().map { it.name() to it[index] }.toMap()
+public fun AnyRow.toMap(): Map<String, Any?> = df().columns().associate { it.name() to it[index] }
 
 // endregion
 

diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/all.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/all.kt
@@ -12,8 +12,10 @@ import org.jetbrains.kotlinx.dataframe.index
 
 // region DataColumn
 
+/** Returns `true` if all [values] match the given [predicate] or [values] is empty. */
 public fun <T> DataColumn<T>.all(predicate: Predicate<T>): Boolean = values.all(predicate)
 
+/** Returns `true` if all [values] are `null` or [values] is empty. */
 public fun <C> DataColumn<C>.allNulls(): Boolean = size == 0 || all { it == null }
 
 // endregion
@@ -26,6 +28,7 @@ public fun AnyRow.allNA(): Boolean = owner.columns().all { it[index].isNA }
 
 // region DataFrame
 
+/** Returns `true` if all [rows] match the given [predicate] or [rows] is empty. */
 public fun <T> DataFrame<T>.all(predicate: RowFilter<T>): Boolean = rows().all { predicate(it, it) }
 
 // endregion