From df54aa17fe805ddd2adafc645fe74ed42b2afdce Mon Sep 17 00:00:00 2001 From: Jolan Rensen Date: Mon, 28 Nov 2022 17:17:07 +0100 Subject: [PATCH] defaultPath in AbstractDefaultReadMethod implementing IO methods will now be an absolute path, to resolve the difference between calls from Gradle and from the IDE. @ImportDataSchema and dataframes { schemas { data = "" } } now support absolute paths too. The relative path will still be attempted first. --- .../examples/movies/moviesWithInterface.kt | 16 ++++++- .../dataframe/examples/titanic/ml/titanic.kt | 48 +++++++++++++------ .../gradle/GenerateDataSchemaTask.kt | 37 +++++++++----- .../dataframe/ksp/DataSchemaGenerator.kt | 20 ++++---- 4 files changed, 86 insertions(+), 35 deletions(-) diff --git a/examples/idea-examples/movies/src/main/kotlin/org/jetbrains/kotlinx/dataframe/examples/movies/moviesWithInterface.kt b/examples/idea-examples/movies/src/main/kotlin/org/jetbrains/kotlinx/dataframe/examples/movies/moviesWithInterface.kt index 691bf235b..214961fa0 100644 --- a/examples/idea-examples/movies/src/main/kotlin/org/jetbrains/kotlinx/dataframe/examples/movies/moviesWithInterface.kt +++ b/examples/idea-examples/movies/src/main/kotlin/org/jetbrains/kotlinx/dataframe/examples/movies/moviesWithInterface.kt @@ -2,11 +2,23 @@ package org.jetbrains.kotlinx.dataframe.examples.movies import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.annotations.DataSchema -import org.jetbrains.kotlinx.dataframe.api.* +import org.jetbrains.kotlinx.dataframe.api.by +import org.jetbrains.kotlinx.dataframe.api.convertTo +import org.jetbrains.kotlinx.dataframe.api.count +import org.jetbrains.kotlinx.dataframe.api.explode +import org.jetbrains.kotlinx.dataframe.api.filter +import org.jetbrains.kotlinx.dataframe.api.groupBy +import org.jetbrains.kotlinx.dataframe.api.inplace +import org.jetbrains.kotlinx.dataframe.api.into +import org.jetbrains.kotlinx.dataframe.api.mean +import org.jetbrains.kotlinx.dataframe.api.pivot +import
org.jetbrains.kotlinx.dataframe.api.print +import org.jetbrains.kotlinx.dataframe.api.sortBy +import org.jetbrains.kotlinx.dataframe.api.split import org.jetbrains.kotlinx.dataframe.io.read @DataSchema -interface Movie{ +interface Movie { val movieId: String val title: String val genres: String diff --git a/examples/idea-examples/titanic/src/main/kotlin/org/jetbrains/kotlinx/dataframe/examples/titanic/ml/titanic.kt b/examples/idea-examples/titanic/src/main/kotlin/org/jetbrains/kotlinx/dataframe/examples/titanic/ml/titanic.kt index 92b630f72..810c6b955 100644 --- a/examples/idea-examples/titanic/src/main/kotlin/org/jetbrains/kotlinx/dataframe/examples/titanic/ml/titanic.kt +++ b/examples/idea-examples/titanic/src/main/kotlin/org/jetbrains/kotlinx/dataframe/examples/titanic/ml/titanic.kt @@ -2,8 +2,24 @@ package org.jetbrains.kotlinx.dataframe.examples.titanic.ml import org.jetbrains.kotlinx.dataframe.ColumnSelector import org.jetbrains.kotlinx.dataframe.DataFrame -import org.jetbrains.kotlinx.dataframe.api.* +import org.jetbrains.kotlinx.dataframe.api.by import org.jetbrains.kotlinx.dataframe.api.column +import org.jetbrains.kotlinx.dataframe.api.convert +import org.jetbrains.kotlinx.dataframe.api.dfsOf +import org.jetbrains.kotlinx.dataframe.api.fillNulls +import org.jetbrains.kotlinx.dataframe.api.getColumn +import org.jetbrains.kotlinx.dataframe.api.into +import org.jetbrains.kotlinx.dataframe.api.mean +import org.jetbrains.kotlinx.dataframe.api.merge +import org.jetbrains.kotlinx.dataframe.api.perCol +import org.jetbrains.kotlinx.dataframe.api.pivotMatches +import org.jetbrains.kotlinx.dataframe.api.remove +import org.jetbrains.kotlinx.dataframe.api.select +import org.jetbrains.kotlinx.dataframe.api.shuffle +import org.jetbrains.kotlinx.dataframe.api.toFloat +import org.jetbrains.kotlinx.dataframe.api.toFloatArray +import org.jetbrains.kotlinx.dataframe.api.toTypedArray +import org.jetbrains.kotlinx.dataframe.api.withValue import 
org.jetbrains.kotlinx.dl.api.core.Sequential import org.jetbrains.kotlinx.dl.api.core.activation.Activations import org.jetbrains.kotlinx.dl.api.core.initializer.HeNormal @@ -14,7 +30,7 @@ import org.jetbrains.kotlinx.dl.api.core.loss.Losses import org.jetbrains.kotlinx.dl.api.core.metric.Metrics import org.jetbrains.kotlinx.dl.api.core.optimizer.Adam import org.jetbrains.kotlinx.dl.dataset.OnHeapDataset -import java.util.* +import java.util.Locale private const val SEED = 12L private const val TEST_BATCH_SIZE = 100 @@ -33,8 +49,7 @@ fun main() { // Set Locale for correct number parsing Locale.setDefault(Locale.FRANCE) - // Set path for correct resolution (https://github.com/Kotlin/dataframe/issues/139) - val df = Passenger.readCSV("examples/idea-examples/titanic/src/main/resources/titanic.csv") + val df = Passenger.readCSV() // Calculating imputing values val (train, test) = df @@ -44,7 +59,7 @@ fun main() { // one hot encoding .pivotMatches { pclass and sex } // feature extraction - .select { survived and pclass and sibsp and parch and age and fare and sex} + .select { survived and pclass and sibsp and parch and age and fare and sex } .shuffle() .toTrainTest(0.7) { survived } @@ -64,18 +79,23 @@ fun main() { } } -fun DataFrame.toTrainTest(trainRatio: Double, yColumn: ColumnSelector): Pair = - toOnHeapDataset(yColumn).split(trainRatio) +fun DataFrame.toTrainTest( + trainRatio: Double, + yColumn: ColumnSelector, +): Pair = + toOnHeapDataset(yColumn) + .split(trainRatio) -private fun DataFrame.toOnHeapDataset(yColumn: ColumnSelector): OnHeapDataset { - return OnHeapDataset.create( +private fun DataFrame.toOnHeapDataset(yColumn: ColumnSelector): OnHeapDataset = + OnHeapDataset.create( dataframe = this, - yColumn = yColumn + yColumn = yColumn, ) -} - -private fun OnHeapDataset.Companion.create(dataframe: DataFrame, yColumn: ColumnSelector): OnHeapDataset { +private fun OnHeapDataset.Companion.create( + dataframe: DataFrame, + yColumn: ColumnSelector, +): OnHeapDataset 
{ val x by column("X") fun extractX(): Array = @@ -88,6 +108,6 @@ private fun OnHeapDataset.Companion.create(dataframe: DataFrame, yColumn: return create( ::extractX, - ::extractY + ::extractY, ) } diff --git a/plugins/dataframe-gradle-plugin/src/main/kotlin/org/jetbrains/dataframe/gradle/GenerateDataSchemaTask.kt b/plugins/dataframe-gradle-plugin/src/main/kotlin/org/jetbrains/dataframe/gradle/GenerateDataSchemaTask.kt index 68f4fb764..fb1a1a9c0 100644 --- a/plugins/dataframe-gradle-plugin/src/main/kotlin/org/jetbrains/dataframe/gradle/GenerateDataSchemaTask.kt +++ b/plugins/dataframe-gradle-plugin/src/main/kotlin/org/jetbrains/dataframe/gradle/GenerateDataSchemaTask.kt @@ -22,6 +22,7 @@ import org.jetbrains.kotlinx.dataframe.io.Excel import org.jetbrains.kotlinx.dataframe.io.JSON import org.jetbrains.kotlinx.dataframe.io.OpenApi import org.jetbrains.kotlinx.dataframe.io.TSV +import org.jetbrains.kotlinx.dataframe.io.isURL import java.io.File import java.net.URL import java.nio.file.Paths @@ -128,14 +129,22 @@ abstract class GenerateDataSchemaTask : DefaultTask() { schemaFile.writeText(codeGenResult.toStandaloneSnippet(escapedPackageName, readDfMethod.additionalImports)) } - private fun stringOf(data: Any): String { - return when (data) { - is File -> data.toRelativeString(base = project.projectDir) + private fun stringOf(data: Any): String = + when (data) { + is File -> data.absolutePath is URL -> data.toExternalForm() - is String -> data + is String -> + when { + isURL(data) -> stringOf(URL(data)) + else -> { + val relativeFile = project.file(data) + val absoluteFile = File(data) + stringOf(if (relativeFile.exists()) relativeFile else absoluteFile) + } + } + else -> unsupportedType() } - } private fun escapePackageName(packageName: String): String { // See RegexExpectationsTest @@ -147,20 +156,26 @@ abstract class GenerateDataSchemaTask : DefaultTask() { } } - private fun urlOf(data: Any): URL { - fun isURL(fileOrUrl: String): Boolean = listOf("http:", "https:", 
"ftp:").any { fileOrUrl.startsWith(it) } - - return when (data) { + private fun urlOf(data: Any): URL = + when (data) { is File -> data.toURI() is URL -> data.toURI() is String -> when { isURL(data) -> URL(data).toURI() - else -> project.file(data).toURI() + else -> { + val relativeFile = project.file(data) + val absoluteFile = File(data) + + if (relativeFile.exists()) { + relativeFile + } else { + absoluteFile + }.toURI() + } } else -> unsupportedType() }.toURL() - } private fun unsupportedType(): Nothing = throw IllegalArgumentException("data for schema \"${interfaceName.get()}\" must be File, URL or String") diff --git a/plugins/symbol-processor/src/main/kotlin/org/jetbrains/dataframe/ksp/DataSchemaGenerator.kt b/plugins/symbol-processor/src/main/kotlin/org/jetbrains/dataframe/ksp/DataSchemaGenerator.kt index 04feab40f..e9bd8a418 100644 --- a/plugins/symbol-processor/src/main/kotlin/org/jetbrains/dataframe/ksp/DataSchemaGenerator.kt +++ b/plugins/symbol-processor/src/main/kotlin/org/jetbrains/dataframe/ksp/DataSchemaGenerator.kt @@ -26,6 +26,7 @@ import org.jetbrains.kotlinx.dataframe.io.Excel import org.jetbrains.kotlinx.dataframe.io.JSON import org.jetbrains.kotlinx.dataframe.io.OpenApi import org.jetbrains.kotlinx.dataframe.io.TSV +import org.jetbrains.kotlinx.dataframe.io.isURL import java.io.File import java.net.MalformedURLException import java.net.URL @@ -63,30 +64,33 @@ class DataSchemaGenerator( } private fun ImportDataSchema.toStatement(file: KSFile, logger: KSPLogger): ImportDataSchemaStatement? 
{ - val protocols = listOf("http", "https", "ftp") - val url = if (protocols.any { path.startsWith(it, ignoreCase = true) }) { + val url = if (isURL(path)) { try { URL(this.path) } catch (exception: MalformedURLException) { logger.error("'${this.path}' is not valid URL: ${exception.message}", file) - null + return null } } else { val resolutionDir: String = resolutionDir ?: run { reportMissingKspArgument(file) return null } - val data = File(resolutionDir, path) + + val relativeFile = File(resolutionDir, path) + val absoluteFile = File(path) + val data = if (relativeFile.exists()) relativeFile else absoluteFile try { - data.toURI().toURL() + data.toURI().toURL() ?: return null } catch (exception: MalformedURLException) { logger.error( - "Failed to convert resolved path '${data.absolutePath}' to URL: ${exception.message}", + "Failed to convert resolved path '${relativeFile.absolutePath}' or '${absoluteFile.absolutePath}' to URL: ${exception.message}", file ) - null + return null } - } ?: return null + } + return ImportDataSchemaStatement( origin = file, name = name,