Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Absolute path fixes in plugins #191

Merged
merged 1 commit into from
Nov 30, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,23 @@ package org.jetbrains.kotlinx.dataframe.examples.movies

import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.annotations.DataSchema
import org.jetbrains.kotlinx.dataframe.api.*
import org.jetbrains.kotlinx.dataframe.api.by
import org.jetbrains.kotlinx.dataframe.api.convertTo
import org.jetbrains.kotlinx.dataframe.api.count
import org.jetbrains.kotlinx.dataframe.api.explode
import org.jetbrains.kotlinx.dataframe.api.filter
import org.jetbrains.kotlinx.dataframe.api.groupBy
import org.jetbrains.kotlinx.dataframe.api.inplace
import org.jetbrains.kotlinx.dataframe.api.into
import org.jetbrains.kotlinx.dataframe.api.mean
import org.jetbrains.kotlinx.dataframe.api.pivot
import org.jetbrains.kotlinx.dataframe.api.print
import org.jetbrains.kotlinx.dataframe.api.sortBy
import org.jetbrains.kotlinx.dataframe.api.split
import org.jetbrains.kotlinx.dataframe.io.read

@DataSchema
interface Movie{
interface Movie {
val movieId: String
val title: String
val genres: String
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,24 @@ package org.jetbrains.kotlinx.dataframe.examples.titanic.ml

import org.jetbrains.kotlinx.dataframe.ColumnSelector
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.*
import org.jetbrains.kotlinx.dataframe.api.by
import org.jetbrains.kotlinx.dataframe.api.column
import org.jetbrains.kotlinx.dataframe.api.convert
import org.jetbrains.kotlinx.dataframe.api.dfsOf
import org.jetbrains.kotlinx.dataframe.api.fillNulls
import org.jetbrains.kotlinx.dataframe.api.getColumn
import org.jetbrains.kotlinx.dataframe.api.into
import org.jetbrains.kotlinx.dataframe.api.mean
import org.jetbrains.kotlinx.dataframe.api.merge
import org.jetbrains.kotlinx.dataframe.api.perCol
import org.jetbrains.kotlinx.dataframe.api.pivotMatches
import org.jetbrains.kotlinx.dataframe.api.remove
import org.jetbrains.kotlinx.dataframe.api.select
import org.jetbrains.kotlinx.dataframe.api.shuffle
import org.jetbrains.kotlinx.dataframe.api.toFloat
import org.jetbrains.kotlinx.dataframe.api.toFloatArray
import org.jetbrains.kotlinx.dataframe.api.toTypedArray
import org.jetbrains.kotlinx.dataframe.api.withValue
import org.jetbrains.kotlinx.dl.api.core.Sequential
import org.jetbrains.kotlinx.dl.api.core.activation.Activations
import org.jetbrains.kotlinx.dl.api.core.initializer.HeNormal
Expand All @@ -14,7 +30,7 @@ import org.jetbrains.kotlinx.dl.api.core.loss.Losses
import org.jetbrains.kotlinx.dl.api.core.metric.Metrics
import org.jetbrains.kotlinx.dl.api.core.optimizer.Adam
import org.jetbrains.kotlinx.dl.dataset.OnHeapDataset
import java.util.*
import java.util.Locale

private const val SEED = 12L
private const val TEST_BATCH_SIZE = 100
Expand All @@ -33,8 +49,7 @@ fun main() {
// Set Locale for correct number parsing
Locale.setDefault(Locale.FRANCE)

// Set path for correct resolution (https://github.com/Kotlin/dataframe/issues/139)
val df = Passenger.readCSV("examples/idea-examples/titanic/src/main/resources/titanic.csv")
val df = Passenger.readCSV()

// Calculating imputing values
val (train, test) = df
Expand All @@ -44,7 +59,7 @@ fun main() {
// one hot encoding
.pivotMatches { pclass and sex }
// feature extraction
.select { survived and pclass and sibsp and parch and age and fare and sex}
.select { survived and pclass and sibsp and parch and age and fare and sex }
.shuffle()
.toTrainTest(0.7) { survived }

Expand All @@ -64,18 +79,23 @@ fun main() {
}
}

fun <T> DataFrame<T>.toTrainTest(trainRatio: Double, yColumn: ColumnSelector<T, Number>): Pair<OnHeapDataset, OnHeapDataset> =
toOnHeapDataset(yColumn).split(trainRatio)
fun <T> DataFrame<T>.toTrainTest(
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's agree to create separate commits for this kind of change? It will be easier to review without taking time to make sure the semantics haven't changed, because right now it's a bit painful.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I want to, but even when I just have a file open, IntelliJ lints it as soon as I press run XD Then it feels wrong to undo it again. It's good practice for code I write myself.
I'll go look for that setting next. I understand it's difficult to review it like this, so I'll try.
However, the entire project could do with a good linting, because in almost every file there are long lines, mixed named and positional arguments, missing spaces before brackets, etc. Will get to that one day :)

trainRatio: Double,
yColumn: ColumnSelector<T, Number>,
): Pair<OnHeapDataset, OnHeapDataset> =
toOnHeapDataset(yColumn)
.split(trainRatio)

private fun <T> DataFrame<T>.toOnHeapDataset(yColumn: ColumnSelector<T, Number>): OnHeapDataset {
return OnHeapDataset.create(
private fun <T> DataFrame<T>.toOnHeapDataset(yColumn: ColumnSelector<T, Number>): OnHeapDataset =
OnHeapDataset.create(
dataframe = this,
yColumn = yColumn
yColumn = yColumn,
)
}

private fun <T> OnHeapDataset.Companion.create(dataframe: DataFrame<T>, yColumn: ColumnSelector<T, Number>): OnHeapDataset {

private fun <T> OnHeapDataset.Companion.create(
dataframe: DataFrame<T>,
yColumn: ColumnSelector<T, Number>,
): OnHeapDataset {
val x by column<FloatArray>("X")

fun extractX(): Array<FloatArray> =
Expand All @@ -88,6 +108,6 @@ private fun <T> OnHeapDataset.Companion.create(dataframe: DataFrame<T>, yColumn:

return create(
::extractX,
::extractY
::extractY,
)
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import org.jetbrains.kotlinx.dataframe.io.Excel
import org.jetbrains.kotlinx.dataframe.io.JSON
import org.jetbrains.kotlinx.dataframe.io.OpenApi
import org.jetbrains.kotlinx.dataframe.io.TSV
import org.jetbrains.kotlinx.dataframe.io.isURL
import java.io.File
import java.net.URL
import java.nio.file.Paths
Expand Down Expand Up @@ -128,14 +129,22 @@ abstract class GenerateDataSchemaTask : DefaultTask() {
schemaFile.writeText(codeGenResult.toStandaloneSnippet(escapedPackageName, readDfMethod.additionalImports))
}

private fun stringOf(data: Any): String {
return when (data) {
is File -> data.toRelativeString(base = project.projectDir)
private fun stringOf(data: Any): String =
when (data) {
is File -> data.absolutePath
is URL -> data.toExternalForm()
is String -> data
is String ->
when {
isURL(data) -> stringOf(URL(data))
else -> {
val relativeFile = project.file(data)
val absoluteFile = File(data)
stringOf(if (relativeFile.exists()) relativeFile else absoluteFile)
}
}

else -> unsupportedType()
}
}

private fun escapePackageName(packageName: String): String {
// See RegexExpectationsTest
Expand All @@ -147,20 +156,26 @@ abstract class GenerateDataSchemaTask : DefaultTask() {
}
}

private fun urlOf(data: Any): URL {
fun isURL(fileOrUrl: String): Boolean = listOf("http:", "https:", "ftp:").any { fileOrUrl.startsWith(it) }

return when (data) {
private fun urlOf(data: Any): URL =
when (data) {
is File -> data.toURI()
is URL -> data.toURI()
is String -> when {
isURL(data) -> URL(data).toURI()
else -> project.file(data).toURI()
else -> {
val relativeFile = project.file(data)
val absoluteFile = File(data)

if (relativeFile.exists()) {
relativeFile
} else {
absoluteFile
}.toURI()
}
}

else -> unsupportedType()
}.toURL()
}

private fun unsupportedType(): Nothing =
throw IllegalArgumentException("data for schema \"${interfaceName.get()}\" must be File, URL or String")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ import org.jetbrains.kotlinx.dataframe.io.Excel
import org.jetbrains.kotlinx.dataframe.io.JSON
import org.jetbrains.kotlinx.dataframe.io.OpenApi
import org.jetbrains.kotlinx.dataframe.io.TSV
import org.jetbrains.kotlinx.dataframe.io.isURL
import java.io.File
import java.net.MalformedURLException
import java.net.URL
Expand Down Expand Up @@ -63,30 +64,33 @@ class DataSchemaGenerator(
}

private fun ImportDataSchema.toStatement(file: KSFile, logger: KSPLogger): ImportDataSchemaStatement? {
val protocols = listOf("http", "https", "ftp")
val url = if (protocols.any { path.startsWith(it, ignoreCase = true) }) {
val url = if (isURL(path)) {
try {
URL(this.path)
} catch (exception: MalformedURLException) {
logger.error("'${this.path}' is not valid URL: ${exception.message}", file)
null
return null
}
} else {
val resolutionDir: String = resolutionDir ?: run {
reportMissingKspArgument(file)
return null
}
val data = File(resolutionDir, path)

val relativeFile = File(resolutionDir, path)
val absoluteFile = File(path)
val data = if (relativeFile.exists()) relativeFile else absoluteFile
try {
data.toURI().toURL()
data.toURI().toURL() ?: return null
} catch (exception: MalformedURLException) {
logger.error(
"Failed to convert resolved path '${data.absolutePath}' to URL: ${exception.message}",
"Failed to convert resolved path '${relativeFile.absolutePath}' or '${absoluteFile.absolutePath}' to URL: ${exception.message}",
file
)
null
return null
}
} ?: return null
}

return ImportDataSchemaStatement(
origin = file,
name = name,
Expand Down