Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More Converting operations #133

Merged
merged 7 commits into from
Aug 9, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import org.jetbrains.kotlinx.dataframe.RowValueExpression
import org.jetbrains.kotlinx.dataframe.columns.ColumnReference
import org.jetbrains.kotlinx.dataframe.dataTypes.IFRAME
import org.jetbrains.kotlinx.dataframe.dataTypes.IMG
import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException
import org.jetbrains.kotlinx.dataframe.impl.api.Parsers
import org.jetbrains.kotlinx.dataframe.impl.api.convertRowColumnImpl
import org.jetbrains.kotlinx.dataframe.impl.api.convertToTypeImpl
Expand Down Expand Up @@ -125,6 +126,48 @@ public fun <T : Any> DataColumn<T?>.convertToString(): DataColumn<String?> = con
public fun <T : Any> DataColumn<T>.convertToDouble(): DataColumn<Double> = convertTo()
public fun <T : Any> DataColumn<T?>.convertToDouble(): DataColumn<Double?> = convertTo()

/**
 * Parses a [String] column to [Double], taking a locale-specific number format into account.
 *
 * - If [locale] is not `null`, only the converter for that locale is used; no fallback is attempted.
 * - If [locale] is `null`, the converter obtained without an explicit locale (current default locale)
 *   is tried first. If the column cannot be converted with it, the whole column is converted again
 *   with the converter for `Locale.forLanguageTag("C.UTF-8")` (the POSIX fallback).
 *
 * Every value is trimmed before it is handed to the converter.
 *
 * @param locale locale whose number format is used for parsing, or `null` to use the default
 * locale with the POSIX fallback.
 * @return a column holding the parsed values.
 * @throws IllegalStateException if a converter returns `null` for a value and no fallback applies.
 */
@JvmName("convertToDoubleFromString")
public fun DataColumn<String>.convertToDouble(locale: Locale? = null): DataColumn<Double> {
    // Single place that applies the converter for [converterLocale] to every trimmed value.
    fun convertWith(converterLocale: Locale?): DataColumn<Double> {
        val converter = Parsers.getDoubleConverter(converterLocale) as (String) -> Double?
        return map { converter(it.trim()) ?: error("Can't convert `$it` to Double") }
    }

    fun convertWithPosix(): DataColumn<Double> = convertWith(Locale.forLanguageTag("C.UTF-8"))

    // An explicit locale is authoritative: any failure propagates to the caller unchanged.
    if (locale != null) return convertWith(locale)

    return try {
        convertWith(null)
    } catch (e: TypeConversionException) {
        // NOTE(review): presumably raised by the converter itself on unparsable input — confirm.
        convertWithPosix()
    } catch (e: IllegalStateException) {
        // Raised by `error(...)` above when the converter returns null instead of throwing;
        // without this branch the documented POSIX fallback would be skipped in that case.
        convertWithPosix()
    }
}

/**
* Parse String column to Double considering locale (number format).
* If [locale] parameter is defined, its number format is used for parsing.
* If [locale] parameter is null, the current system locale is used. If the column cannot be parsed, then POSIX format is used.
*/
@JvmName("convertToDoubleFromStringNullable")
public fun DataColumn<String?>.convertToDouble(locale: Locale? = null): DataColumn<Double?> {
if (locale is Locale) {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you think?

val converter = if (locale != null) {
    Parsers.getDoubleConverter(locale)
} else {
    Parsers.getDoubleConverter()
}

Copy link
Contributor Author

@Kopilov Kopilov Aug 6, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And then simply call map { converter(it) }? This doesn't seem to work because we wanted the scalar converter not to use fallback logic. And it is implemented here now…

val explicitConverter = Parsers.getDoubleConverter(locale) as (String) -> Double?
return map { it?.let { explicitConverter(it.trim()) ?: error("Can't convert `$it` to Double") } }
} else {
return try {
val defaultConverter = Parsers.getDoubleConverter() as (String) -> Double?
map { it?.let { defaultConverter(it.trim()) ?: error("Can't convert `$it` to Double") } }
} catch (e: IllegalStateException) {
val posixConverter = Parsers.getDoubleConverter(Locale.forLanguageTag("C.UTF-8")) as (String) -> Double?
map { it?.let { posixConverter(it.trim()) ?: error("Can't convert `$it` to Double") } }
}
}
}

@JvmName("convertToFloatFromT")
public fun <T : Any> DataColumn<T>.convertToFloat(): DataColumn<Float> = convertTo()
public fun <T : Any> DataColumn<T?>.convertToFloat(): DataColumn<Float?> = convertTo()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,23 +171,18 @@ internal object Parsers : GlobalParserOptions {
return null
}

private val posixNumberFormat = NumberFormat.getInstance(Locale.forLanguageTag("C.UTF-8"))

private fun String.parseDouble(userNumberFormat: NumberFormat) =
private fun String.parseDouble(format: NumberFormat) =
when (uppercase(Locale.getDefault())) {
"NAN" -> Double.NaN
"INF" -> Double.POSITIVE_INFINITY
"-INF" -> Double.NEGATIVE_INFINITY
"INFINITY" -> Double.POSITIVE_INFINITY
"-INFINITY" -> Double.NEGATIVE_INFINITY
else -> {
fun parseWithFormat(format: NumberFormat): Double? {
val parsePosition = ParsePosition(0)
val result: Double? = format.parse(this, parsePosition)?.toDouble()
return if (parsePosition.index != this.length) null
else result
}
parseWithFormat(userNumberFormat) ?: parseWithFormat(posixNumberFormat)
val parsePosition = ParsePosition(0)
val result: Double? = format.parse(this, parsePosition)?.toDouble()
if (parsePosition.index != this.length) null
else result
}
}

Expand All @@ -199,6 +194,12 @@ internal object Parsers : GlobalParserOptions {
inline fun <reified T : Any> stringParserWithOptions(noinline body: (ParserOptions?) -> ((String) -> T?)) =
StringParserWithFormat(typeOf<T>(), body)

// String-to-Double parser factory: the number format comes from the locale given in the
// parser options, or from the current default locale when the options carry none.
private val parserToDoubleWithOptions = stringParserWithOptions { parserOptions ->
    val effectiveLocale = parserOptions?.locale ?: Locale.getDefault()
    val format = NumberFormat.getInstance(effectiveLocale)
    // Bound to a name first so the lambda is returned as a value from the outer lambda.
    val parseText = { text: String -> text.parseDouble(format) }
    parseText
}

private val parsersOrder = listOf(
stringParser { it.toIntOrNull() },
stringParser { it.toLongOrNull() },
Expand Down Expand Up @@ -231,12 +232,12 @@ internal object Parsers : GlobalParserOptions {

stringParser { it.toUrlOrNull() },

stringParserWithOptions { options ->
// Double, with explicit number format or taken from current locale
parserToDoubleWithOptions,

// Double, with POSIX format
stringParser { it.parseDouble(NumberFormat.getInstance(Locale.forLanguageTag("C.UTF-8"))) },

val numberFormat = NumberFormat.getInstance(options?.locale ?: Locale.getDefault())
val parser = { it: String -> it.parseDouble(numberFormat) }
parser
},
stringParser { it.toBooleanOrNull() },
stringParser { it.toBigDecimalOrNull() },

Expand Down Expand Up @@ -271,6 +272,13 @@ internal object Parsers : GlobalParserOptions {
) else null
return parser.applyOptions(options)
}

internal fun getDoubleConverter(locale: Locale? = null): TypeConverter {
val options = if (locale != null) ParserOptions(
locale = locale
) else null
return parserToDoubleWithOptions.toConverter(options)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can use parserToDoubleWithOptions.applyOptions here, i think

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. Originally I used toConverter for getting TypeConversionException to be thrown. Now it is thrown in .convertToDouble function and this is renamed to getDoubleParser.

}
}

internal fun DataColumn<String?>.tryParseImpl(options: ParserOptions?): DataColumn<*> {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,27 @@ class CsvTests {
assertColumnType("quality", Int::class)
}

@Test
fun `read standard CSV with floats when user has alternative locale`() {
    // The default locale is JVM-wide state: remember it and restore it however the test ends.
    val savedLocale = Locale.getDefault()
    try {
        Locale.setDefault(Locale.forLanguageTag("ru-RU"))
        val columns = DataFrame.readCSV(wineCsv, delimiter = ';').schema().columns

        // Column name -> expected classifier of the inferred column type.
        val expectedTypes: List<Pair<String, KClass<*>>> = listOf(
            "citric acid" to Double::class,
            "alcohol" to Double::class,
            "quality" to Int::class,
        )
        for ((columnName, expectedClass) in expectedTypes) {
            val column = columns[columnName]
            column.shouldNotBeNull()
            column.type.classifier shouldBe expectedClass
        }
    } finally {
        Locale.setDefault(savedLocale)
    }
}

@Test
fun `read with custom header`() {
val header = ('A'..'K').map { it.toString() }
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
package org.jetbrains.kotlinx.dataframe.io

import io.kotest.assertions.throwables.shouldThrow
import io.kotest.matchers.shouldBe
import kotlinx.datetime.LocalDateTime
import org.jetbrains.kotlinx.dataframe.DataColumn
import org.jetbrains.kotlinx.dataframe.DataFrame
import org.jetbrains.kotlinx.dataframe.api.cast
import org.jetbrains.kotlinx.dataframe.api.columnOf
import org.jetbrains.kotlinx.dataframe.api.convertTo
import org.jetbrains.kotlinx.dataframe.api.convertToDouble
import org.jetbrains.kotlinx.dataframe.api.parse
import org.jetbrains.kotlinx.dataframe.api.parser
import org.jetbrains.kotlinx.dataframe.api.tryParse
Expand Down Expand Up @@ -77,18 +79,73 @@ class ParserTests {
fun `converting String to Double in different locales`() {
val currentLocale = Locale.getDefault()
try {
val stringValues = listOf("1", "2.3", "4,5")
val stringColumn = DataColumn.createValueColumn("nums", stringValues, typeOf<String>())
Locale.setDefault(Locale.forLanguageTag("ru-RU"))
// Use comma as local decimal separator and dot as fallback default (as it is used in POSIX/C.UTF-8)
stringColumn.convertTo<Double>().shouldBe(
DataColumn.createValueColumn("nums", listOf(1.0, 2.3, 4.5), typeOf<Double>())
)
// Test 36 behaviour combinations:

// 3 source columns
val columnDot = columnOf("12.345", "67.890")
val columnComma = columnOf("12,345", "67,890")
val columnMixed = columnOf("12.345", "67,890")
// *
// (3 locales as converting parameter + original converting)
val parsingLocaleNotDefined: Locale? = null
val parsingLocaleUsesDot: Locale = Locale.forLanguageTag("en-US")
val parsingLocaleUsesComma: Locale = Locale.forLanguageTag("ru-RU")
// *
// 3 system locales

Locale.setDefault(Locale.forLanguageTag("C.UTF-8"))

columnDot.convertTo<Double>().shouldBe(columnOf(12.345, 67.89))
columnComma.convertTo<Double>().shouldBe(columnOf(12345.0, 67890.0))
columnMixed.convertTo<Double>().shouldBe(columnOf(12.345, 67890.0))

columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89))
columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12345.0, 67890.0))
columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0))

columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89))
columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0))
columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0))

shouldThrow<TypeConversionException> { columnDot.convertToDouble(parsingLocaleUsesComma) }
columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89))
shouldThrow<TypeConversionException> { columnMixed.convertToDouble(parsingLocaleUsesComma) }

Locale.setDefault(Locale.forLanguageTag("en-US"))
// Use dot as local decimal separator. Comma is ignored (as it is group separator in this locale).
stringColumn.convertTo<Double>().shouldBe(
DataColumn.createValueColumn("nums", listOf(1.0, 2.3, 45.0), typeOf<Double>())
)

columnDot.convertTo<Double>().shouldBe(columnOf(12.345, 67.89))
columnComma.convertTo<Double>().shouldBe(columnOf(12345.0, 67890.0))
columnMixed.convertTo<Double>().shouldBe(columnOf(12.345, 67890.0))

columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89))
columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12345.0, 67890.0))
columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0))

columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89))
columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0))
columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0))

shouldThrow<TypeConversionException> { columnDot.convertToDouble(parsingLocaleUsesComma) }
columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89))
shouldThrow<TypeConversionException> { columnMixed.convertToDouble(parsingLocaleUsesComma) }

Locale.setDefault(Locale.forLanguageTag("ru-RU"))

columnDot.convertTo<Double>().shouldBe(columnOf(12.345, 67.89))
columnComma.convertTo<Double>().shouldBe(columnOf(12345.0, 67890.0))
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Shouldn't it be 12.345, 67.890? Looks suspicious that default convertTo works identically for en_US and ru_RU.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed, thanks! But I have not found a better way than calling DataColumn<String>.convertToDouble directly from universal convertTo<>.

columnMixed.convertTo<Double>().shouldBe(columnOf(12.345, 67890.0))

columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89))
columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89))
columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0))

columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89))
columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0))
columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0))

shouldThrow<TypeConversionException> { columnDot.convertToDouble(parsingLocaleUsesComma) }
columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89))
shouldThrow<TypeConversionException> { columnMixed.convertToDouble(parsingLocaleUsesComma) }
} finally {
Locale.setDefault(currentLocale)
}
Expand Down