From 7a5fdfb0c5a50eb9c50254b4a7a15723ea838a83 Mon Sep 17 00:00:00 2001 From: Kopilov Aleksandr Date: Sun, 31 Jul 2022 11:44:11 +0300 Subject: [PATCH 1/7] POSIX Double parsing --- .../jetbrains/kotlinx/dataframe/impl/api/parse.kt | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt index aaf5d34e3..675db46f4 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt @@ -171,7 +171,9 @@ internal object Parsers : GlobalParserOptions { return null } - private fun String.parseDouble(format: NumberFormat) = + private val posixNumberFormat = NumberFormat.getInstance(Locale.forLanguageTag("C.UTF-8")) + + private fun String.parseDouble(userNumberFormat: NumberFormat) = when (uppercase(Locale.getDefault())) { "NAN" -> Double.NaN "INF" -> Double.POSITIVE_INFINITY @@ -179,10 +181,13 @@ internal object Parsers : GlobalParserOptions { "INFINITY" -> Double.POSITIVE_INFINITY "-INFINITY" -> Double.NEGATIVE_INFINITY else -> { - val parsePosition = ParsePosition(0) - val result: Double? = format.parse(this, parsePosition)?.toDouble() - if (parsePosition.index != this.length) null - else result + fun parseWithFormat(format: NumberFormat): Double? { + val parsePosition = ParsePosition(0) + val result: Double? 
= format.parse(this, parsePosition)?.toDouble() + return if (parsePosition.index != this.length) null + else result + } + parseWithFormat(userNumberFormat) ?: parseWithFormat(posixNumberFormat) } } From 64cd9e898f8697b83cf69bc169129fecc9108d2c Mon Sep 17 00:00:00 2001 From: Kopilov Aleksandr Date: Sun, 31 Jul 2022 21:58:29 +0300 Subject: [PATCH 2/7] Unit test "converting string to double in different locales" --- .../kotlinx/dataframe/impl/api/convert.kt | 9 ++++---- .../kotlinx/dataframe/io/ParserTests.kt | 21 +++++++++++++++++++ 2 files changed, 26 insertions(+), 4 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt index e7b139019..43676bcc9 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt @@ -36,6 +36,7 @@ import org.jetbrains.kotlinx.dataframe.type import java.math.BigDecimal import java.net.URL import java.time.LocalTime +import java.util.* import kotlin.math.roundToInt import kotlin.math.roundToLong import kotlin.reflect.KType @@ -81,7 +82,7 @@ internal fun AnyCol.convertToTypeImpl(to: KType): AnyCol { return when { from == to -> this from.isSubtypeOf(to) -> (this as DataColumnInternal<*>).changeType(to.withNullability(hasNulls())) - else -> when (val converter = getConverter(from, to)) { + else -> when (val converter = getConverter(from, to, ParserOptions(locale = Locale.getDefault()))) { null -> when (from.classifier) { Any::class, Number::class, java.io.Serializable::class -> { // find converter for every value @@ -89,7 +90,7 @@ internal fun AnyCol.convertToTypeImpl(to: KType): AnyCol { it?.let { val clazz = it.javaClass.kotlin val type = clazz.createStarProjectedType(false) - val converter = getConverter(type, to) ?: throw TypeConverterNotFoundException(from, to) + val converter = getConverter(type, to, 
ParserOptions(locale = Locale.getDefault())) ?: throw TypeConverterNotFoundException(from, to) converter(it) }.checkNulls() } @@ -107,9 +108,9 @@ internal fun AnyCol.convertToTypeImpl(to: KType): AnyCol { } } -internal val convertersCache = mutableMapOf, TypeConverter?>() +internal val convertersCache = mutableMapOf, TypeConverter?>() -internal fun getConverter(from: KType, to: KType): TypeConverter? = convertersCache.getOrPut(from to to) { createConverter(from, to) } +internal fun getConverter(from: KType, to: KType, options: ParserOptions? = null): TypeConverter? = convertersCache.getOrPut(Triple(from, to, options)) { createConverter(from, to, options) } internal typealias TypeConverter = (Any) -> Any? diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt index 1e8063993..5e79e6d69 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt @@ -2,12 +2,14 @@ package org.jetbrains.kotlinx.dataframe.io import io.kotest.matchers.shouldBe import kotlinx.datetime.LocalDateTime +import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.api.* import org.jetbrains.kotlinx.dataframe.api.columnOf import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException import org.junit.Test import java.math.BigDecimal +import java.util.* import kotlin.reflect.typeOf class ParserTests { @@ -58,4 +60,23 @@ class ParserTests { converted[0] shouldBe 1.0f converted[1] shouldBe 0.321f } + + @Test + fun `converting string to double in different locales`() { + val currentLocale = Locale.getDefault() + try { + val stringValues = listOf("1", "2.3", "4,5") + val stringColumn = DataColumn.createValueColumn("nums", stringValues, typeOf()) + Locale.setDefault(Locale.forLanguageTag("ru-RU")) + 
stringColumn.convertToDouble().shouldBe( + DataColumn.createValueColumn("nums", listOf(1.0, 2.3, 4.5), typeOf()) + ) + Locale.setDefault(Locale.forLanguageTag("en-US")) + stringColumn.convertToDouble().shouldBe( + DataColumn.createValueColumn("nums", listOf(1.0, 2.3, 45.0), typeOf()) + ) + } finally { + Locale.setDefault(currentLocale) + } + } } From 3e2329ecc69b773c2f7803b1cd572dd335012877 Mon Sep 17 00:00:00 2001 From: Kopilov Aleksandr Date: Fri, 29 Jul 2022 15:28:58 +0300 Subject: [PATCH 3/7] Converting to Boolean and LocalTime --- .../org/jetbrains/kotlinx/dataframe/impl/api/convert.kt | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt index 43676bcc9..20becd6a9 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt @@ -206,6 +206,7 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? = n Byte::class -> convert { it.toByte() } Short::class -> convert { it.toShort() } Long::class -> convert { it.toLong() } + Boolean::class -> convert { it != 0 } else -> null } Int::class -> when (toClass) { @@ -215,6 +216,7 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? = n Short::class -> convert { it.toShort() } Long::class -> convert { it.toLong() } BigDecimal::class -> convert { it.toBigDecimal() } + Boolean::class -> convert { it != 0 } LocalDateTime::class -> convert { it.toLong().toLocalDateTime(defaultTimeZone) } LocalDate::class -> convert { it.toLong().toLocalDate(defaultTimeZone) } java.time.LocalDateTime::class -> convert { it.toLocalDateTime(defaultTimeZone).toJavaLocalDateTime() } @@ -228,6 +230,7 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? 
= n Long::class -> convert { it.roundToLong() } Short::class -> convert { it.roundToInt().toShort() } BigDecimal::class -> convert { it.toBigDecimal() } + Boolean::class -> convert { it != 0.0 } else -> null } Long::class -> when (toClass) { @@ -237,6 +240,7 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? = n Short::class -> convert { it.toShort() } Int::class -> convert { it.toInt() } BigDecimal::class -> convert { it.toBigDecimal() } + Boolean::class -> convert { it != 0L } LocalDateTime::class -> convert { it.toLocalDateTime(defaultTimeZone) } LocalDate::class -> convert { it.toLocalDate(defaultTimeZone) } Instant::class -> convert { Instant.fromEpochMilliseconds(it) } @@ -271,6 +275,7 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? = n Int::class -> convert { it.roundToInt() } Short::class -> convert { it.roundToInt().toShort() } BigDecimal::class -> convert { it.toBigDecimal() } + Boolean::class -> convert { it != 0.0F } else -> null } BigDecimal::class -> when (toClass) { @@ -278,6 +283,7 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? = n Int::class -> convert { it.toInt() } Float::class -> convert { it.toFloat() } Long::class -> convert { it.toLong() } + Boolean::class -> convert { it != BigDecimal.ZERO } else -> null } LocalDateTime::class -> when (toClass) { @@ -286,6 +292,7 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? = n Long::class -> convert { it.toInstant(defaultTimeZone).toEpochMilliseconds() } java.time.LocalDateTime::class -> convert { it.toJavaLocalDateTime() } java.time.LocalDate::class -> convert { it.date.toJavaLocalDate() } + java.time.LocalTime::class -> convert { it.toJavaLocalDateTime().toLocalTime() } else -> null } java.time.LocalDateTime::class -> when (toClass) { @@ -294,6 +301,7 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? 
= n Instant::class -> convert { it.toKotlinLocalDateTime().toInstant(defaultTimeZone) } Long::class -> convert { it.toKotlinLocalDateTime().toInstant(defaultTimeZone).toEpochMilliseconds() } java.time.LocalDate::class -> convert { it.toLocalDate() } + java.time.LocalTime::class -> convert { it.toLocalTime() } else -> null } LocalDate::class -> when (toClass) { From 7572dbffe4652c8da948b04bf5aa1e6202bcb63a Mon Sep 17 00:00:00 2001 From: Kopilov Aleksandr Date: Mon, 1 Aug 2022 11:57:27 +0300 Subject: [PATCH 4/7] Unit test "convert to Boolean" --- .../kotlinx/dataframe/api/convert.kt | 2 +- .../kotlinx/dataframe/impl/api/convert.kt | 4 ++-- .../kotlinx/dataframe/io/ParserTests.kt | 24 +++++++++++++++---- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt index d2aee5f57..536c84625 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt @@ -30,7 +30,7 @@ import org.jetbrains.kotlinx.dataframe.io.toDataFrame import java.math.BigDecimal import java.net.URL import java.time.LocalTime -import java.util.* +import java.util.Locale import kotlin.reflect.KProperty import kotlin.reflect.KType import kotlin.reflect.typeOf diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt index 20becd6a9..3182fbdd2 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/convert.kt @@ -36,7 +36,7 @@ import org.jetbrains.kotlinx.dataframe.type import java.math.BigDecimal import java.net.URL import java.time.LocalTime -import java.util.* +import java.util.Locale import kotlin.math.roundToInt import kotlin.math.roundToLong import kotlin.reflect.KType 
@@ -206,7 +206,7 @@ internal fun createConverter(from: KType, to: KType, options: ParserOptions? = n Byte::class -> convert { it.toByte() } Short::class -> convert { it.toShort() } Long::class -> convert { it.toLong() } - Boolean::class -> convert { it != 0 } + Boolean::class -> convert { it.toDouble() != 0.0 } else -> null } Int::class -> when (toClass) { diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt index 5e79e6d69..9488a84a8 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt @@ -4,12 +4,16 @@ import io.kotest.matchers.shouldBe import kotlinx.datetime.LocalDateTime import org.jetbrains.kotlinx.dataframe.DataColumn import org.jetbrains.kotlinx.dataframe.DataFrame -import org.jetbrains.kotlinx.dataframe.api.* +import org.jetbrains.kotlinx.dataframe.api.cast import org.jetbrains.kotlinx.dataframe.api.columnOf +import org.jetbrains.kotlinx.dataframe.api.convertTo +import org.jetbrains.kotlinx.dataframe.api.parse +import org.jetbrains.kotlinx.dataframe.api.parser +import org.jetbrains.kotlinx.dataframe.api.tryParse import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException import org.junit.Test import java.math.BigDecimal -import java.util.* +import java.util.Locale import kotlin.reflect.typeOf class ParserTests { @@ -62,17 +66,27 @@ class ParserTests { } @Test - fun `converting string to double in different locales`() { + fun `convert to Boolean`() { + val col by columnOf(BigDecimal(1.0), BigDecimal(0.0), 0, 1, 10L, 0.0, 0.1) + col.convertTo().shouldBe( + DataColumn.createValueColumn("col", listOf(true, false, false, true, true, false, true), typeOf()) + ) + } + + @Test + fun `converting String to Double in different locales`() { val currentLocale = Locale.getDefault() try { val stringValues = listOf("1", "2.3", "4,5") val stringColumn 
= DataColumn.createValueColumn("nums", stringValues, typeOf()) Locale.setDefault(Locale.forLanguageTag("ru-RU")) - stringColumn.convertToDouble().shouldBe( + // Use comma as local decimal separator and dot as fallback default (as it is used in POSIX/C.UTF-8) + stringColumn.convertTo().shouldBe( DataColumn.createValueColumn("nums", listOf(1.0, 2.3, 4.5), typeOf()) ) Locale.setDefault(Locale.forLanguageTag("en-US")) - stringColumn.convertToDouble().shouldBe( + // Use dot as local decimal separator. Comma is ignored (as it is group separator in this locale). + stringColumn.convertTo().shouldBe( DataColumn.createValueColumn("nums", listOf(1.0, 2.3, 45.0), typeOf()) ) } finally { From 4301051654f87e25c5ffb409dcc3c0e41589f95c Mon Sep 17 00:00:00 2001 From: Kopilov Aleksandr Date: Thu, 4 Aug 2022 16:39:58 +0300 Subject: [PATCH 5/7] POSIX Double parsing per column --- .../kotlinx/dataframe/api/convert.kt | 43 ++++++++++ .../kotlinx/dataframe/impl/api/parse.kt | 38 +++++---- .../kotlinx/dataframe/io/CsvTests.kt | 21 +++++ .../kotlinx/dataframe/io/ParserTests.kt | 79 ++++++++++++++++--- 4 files changed, 155 insertions(+), 26 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt index 536c84625..b61a468eb 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt @@ -16,6 +16,7 @@ import org.jetbrains.kotlinx.dataframe.RowValueExpression import org.jetbrains.kotlinx.dataframe.columns.ColumnReference import org.jetbrains.kotlinx.dataframe.dataTypes.IFRAME import org.jetbrains.kotlinx.dataframe.dataTypes.IMG +import org.jetbrains.kotlinx.dataframe.exceptions.TypeConversionException import org.jetbrains.kotlinx.dataframe.impl.api.Parsers import org.jetbrains.kotlinx.dataframe.impl.api.convertRowColumnImpl import org.jetbrains.kotlinx.dataframe.impl.api.convertToTypeImpl @@ 
-125,6 +126,48 @@ public fun DataColumn.convertToString(): DataColumn = con public fun DataColumn.convertToDouble(): DataColumn = convertTo() public fun DataColumn.convertToDouble(): DataColumn = convertTo() +/** + * Parse String column to Double considering locale (number format). + * If [locale] parameter is defined, its number format is used for parsing. + * If [locale] parameter is null, the current system locale is used. If the column cannot be parsed, then the POSIX format is used. + */ +@JvmName("convertToDoubleFromString") +public fun DataColumn.convertToDouble(locale: Locale? = null): DataColumn { + if (locale is Locale) { + val explicitConverter = Parsers.getDoubleConverter(locale) as (String) -> Double? + return map { explicitConverter(it.trim()) ?: error("Can't convert `$it` to Double") } + } else { + return try { + val defaultConverter = Parsers.getDoubleConverter() as (String) -> Double? + map { defaultConverter(it.trim()) ?: error("Can't convert `$it` to Double") } + } catch (e: TypeConversionException) { + val posixConverter = Parsers.getDoubleConverter(Locale.forLanguageTag("C.UTF-8")) as (String) -> Double? + map { posixConverter(it.trim()) ?: error("Can't convert `$it` to Double") } + } + } +} + +/** + * Parse String column to Double considering locale (number format). + * If [locale] parameter is defined, its number format is used for parsing. + * If [locale] parameter is null, the current system locale is used. If the column cannot be parsed, then the POSIX format is used. + */ +@JvmName("convertToDoubleFromStringNullable") +public fun DataColumn.convertToDouble(locale: Locale? = null): DataColumn { + if (locale is Locale) { + val explicitConverter = Parsers.getDoubleConverter(locale) as (String) -> Double? + return map { it?.let { explicitConverter(it.trim()) ?: error("Can't convert `$it` to Double") } } + } else { + return try { + val defaultConverter = Parsers.getDoubleConverter() as (String) -> Double? 
+ map { it?.let { defaultConverter(it.trim()) ?: error("Can't convert `$it` to Double") } } + } catch (e: IllegalStateException) { + val posixConverter = Parsers.getDoubleConverter(Locale.forLanguageTag("C.UTF-8")) as (String) -> Double? + map { it?.let { posixConverter(it.trim()) ?: error("Can't convert `$it` to Double") } } + } + } +} + @JvmName("convertToFloatFromT") public fun DataColumn.convertToFloat(): DataColumn = convertTo() public fun DataColumn.convertToFloat(): DataColumn = convertTo() diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt index 675db46f4..0af322eda 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt @@ -171,9 +171,7 @@ internal object Parsers : GlobalParserOptions { return null } - private val posixNumberFormat = NumberFormat.getInstance(Locale.forLanguageTag("C.UTF-8")) - - private fun String.parseDouble(userNumberFormat: NumberFormat) = + private fun String.parseDouble(format: NumberFormat) = when (uppercase(Locale.getDefault())) { "NAN" -> Double.NaN "INF" -> Double.POSITIVE_INFINITY @@ -181,13 +179,10 @@ internal object Parsers : GlobalParserOptions { "INFINITY" -> Double.POSITIVE_INFINITY "-INFINITY" -> Double.NEGATIVE_INFINITY else -> { - fun parseWithFormat(format: NumberFormat): Double? { - val parsePosition = ParsePosition(0) - val result: Double? = format.parse(this, parsePosition)?.toDouble() - return if (parsePosition.index != this.length) null - else result - } - parseWithFormat(userNumberFormat) ?: parseWithFormat(posixNumberFormat) + val parsePosition = ParsePosition(0) + val result: Double? 
= format.parse(this, parsePosition)?.toDouble() + if (parsePosition.index != this.length) null + else result } } @@ -199,6 +194,12 @@ internal object Parsers : GlobalParserOptions { inline fun stringParserWithOptions(noinline body: (ParserOptions?) -> ((String) -> T?)) = StringParserWithFormat(typeOf(), body) + private val parserToDoubleWithOptions = stringParserWithOptions { options -> + val numberFormat = NumberFormat.getInstance(options?.locale ?: Locale.getDefault()) + val parser = { it: String -> it.parseDouble(numberFormat) } + parser + } + private val parsersOrder = listOf( stringParser { it.toIntOrNull() }, stringParser { it.toLongOrNull() }, @@ -231,12 +232,12 @@ internal object Parsers : GlobalParserOptions { stringParser { it.toUrlOrNull() }, - stringParserWithOptions { options -> + // Double, with explicit number format or taken from current locale + parserToDoubleWithOptions, + + // Double, with POSIX format + stringParser { it.parseDouble(NumberFormat.getInstance(Locale.forLanguageTag("C.UTF-8"))) }, - val numberFormat = NumberFormat.getInstance(options?.locale ?: Locale.getDefault()) - val parser = { it: String -> it.parseDouble(numberFormat) } - parser - }, stringParser { it.toBooleanOrNull() }, stringParser { it.toBigDecimalOrNull() }, @@ -271,6 +272,13 @@ internal object Parsers : GlobalParserOptions { ) else null return parser.applyOptions(options) } + + internal fun getDoubleConverter(locale: Locale? 
= null): TypeConverter { + val options = if (locale != null) ParserOptions( + locale = locale + ) else null + return parserToDoubleWithOptions.toConverter(options) + } } internal fun DataColumn.tryParseImpl(options: ParserOptions?): DataColumn<*> { diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvTests.kt index 9629f2ab3..958f117f7 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/CsvTests.kt @@ -104,6 +104,27 @@ class CsvTests { assertColumnType("quality", Int::class) } + @Test + fun `read standard CSV with floats when user has alternative locale`() { + val currentLocale = Locale.getDefault() + try { + Locale.setDefault(Locale.forLanguageTag("ru-RU")) + val df = DataFrame.readCSV(wineCsv, delimiter = ';') + val schema = df.schema() + fun assertColumnType(columnName: String, kClass: KClass<*>) { + val col = schema.columns[columnName] + col.shouldNotBeNull() + col.type.classifier shouldBe kClass + } + + assertColumnType("citric acid", Double::class) + assertColumnType("alcohol", Double::class) + assertColumnType("quality", Int::class) + } finally { + Locale.setDefault(currentLocale) + } + } + @Test fun `read with custom header`() { val header = ('A'..'K').map { it.toString() } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt index 9488a84a8..c87e19c88 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt @@ -1,5 +1,6 @@ package org.jetbrains.kotlinx.dataframe.io +import io.kotest.assertions.throwables.shouldThrow import io.kotest.matchers.shouldBe import kotlinx.datetime.LocalDateTime import org.jetbrains.kotlinx.dataframe.DataColumn @@ -7,6 +8,7 @@ import 
org.jetbrains.kotlinx.dataframe.DataFrame import org.jetbrains.kotlinx.dataframe.api.cast import org.jetbrains.kotlinx.dataframe.api.columnOf import org.jetbrains.kotlinx.dataframe.api.convertTo +import org.jetbrains.kotlinx.dataframe.api.convertToDouble import org.jetbrains.kotlinx.dataframe.api.parse import org.jetbrains.kotlinx.dataframe.api.parser import org.jetbrains.kotlinx.dataframe.api.tryParse @@ -77,18 +79,73 @@ class ParserTests { fun `converting String to Double in different locales`() { val currentLocale = Locale.getDefault() try { - val stringValues = listOf("1", "2.3", "4,5") - val stringColumn = DataColumn.createValueColumn("nums", stringValues, typeOf()) - Locale.setDefault(Locale.forLanguageTag("ru-RU")) - // Use comma as local decimal separator and dot as fallback default (as it is used in POSIX/C.UTF-8) - stringColumn.convertTo().shouldBe( - DataColumn.createValueColumn("nums", listOf(1.0, 2.3, 4.5), typeOf()) - ) + // Test 36 behaviour combinations: + + // 3 source columns + val columnDot = columnOf("12.345", "67.890") + val columnComma = columnOf("12,345", "67,890") + val columnMixed = columnOf("12.345", "67,890") + // * + // (3 locales as converting parameter + original converting) + val parsingLocaleNotDefined: Locale? 
= null + val parsingLocaleUsesDot: Locale = Locale.forLanguageTag("en-US") + val parsingLocaleUsesComma: Locale = Locale.forLanguageTag("ru-RU") + // * + // 3 system locales + + Locale.setDefault(Locale.forLanguageTag("C.UTF-8")) + + columnDot.convertTo().shouldBe(columnOf(12.345, 67.89)) + columnComma.convertTo().shouldBe(columnOf(12345.0, 67890.0)) + columnMixed.convertTo().shouldBe(columnOf(12.345, 67890.0)) + + columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89)) + columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12345.0, 67890.0)) + columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0)) + + columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89)) + columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0)) + columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0)) + + shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } + columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89)) + shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } + Locale.setDefault(Locale.forLanguageTag("en-US")) - // Use dot as local decimal separator. Comma is ignored (as it is group separator in this locale). 
- stringColumn.convertTo().shouldBe( - DataColumn.createValueColumn("nums", listOf(1.0, 2.3, 45.0), typeOf()) - ) + + columnDot.convertTo().shouldBe(columnOf(12.345, 67.89)) + columnComma.convertTo().shouldBe(columnOf(12345.0, 67890.0)) + columnMixed.convertTo().shouldBe(columnOf(12.345, 67890.0)) + + columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89)) + columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12345.0, 67890.0)) + columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0)) + + columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89)) + columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0)) + columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0)) + + shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } + columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89)) + shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } + + Locale.setDefault(Locale.forLanguageTag("ru-RU")) + + columnDot.convertTo().shouldBe(columnOf(12.345, 67.89)) + columnComma.convertTo().shouldBe(columnOf(12345.0, 67890.0)) + columnMixed.convertTo().shouldBe(columnOf(12.345, 67890.0)) + + columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89)) + columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89)) + columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0)) + + columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89)) + columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0)) + columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0)) + + shouldThrow { columnDot.convertToDouble(parsingLocaleUsesComma) } + columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89)) + 
shouldThrow { columnMixed.convertToDouble(parsingLocaleUsesComma) } } finally { Locale.setDefault(currentLocale) } From fff2b98c402e0777a64292700767f526370a7c41 Mon Sep 17 00:00:00 2001 From: Kopilov Aleksandr Date: Sat, 6 Aug 2022 20:14:01 +0300 Subject: [PATCH 6/7] Call convertToDouble from convertTo for DataColumn --- .../kotlinx/dataframe/api/convert.kt | 35 ++++++++----------- .../kotlinx/dataframe/impl/api/parse.kt | 4 +-- .../kotlinx/dataframe/io/ParserTests.kt | 2 +- 3 files changed, 17 insertions(+), 24 deletions(-) diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt index b61a468eb..3aa6c55cd 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/api/convert.kt @@ -100,7 +100,11 @@ public fun Convert.to(columnConverter: DataFrame.(DataColumn) df.replace(columns).with { columnConverter(df, it) } public inline fun AnyCol.convertTo(): DataColumn = convertTo(typeOf()) as DataColumn -public fun AnyCol.convertTo(newType: KType): AnyCol = convertToTypeImpl(newType) +public fun AnyCol.convertTo(newType: KType): AnyCol { + if (this.type() == typeOf() && newType == typeOf()) return (this as DataColumn).convertToDouble() + if (this.type() == typeOf() && newType == typeOf()) return (this as DataColumn).convertToDouble() + return convertToTypeImpl(newType) +} @JvmName("convertToLocalDateTimeFromT") public fun DataColumn.convertToLocalDateTime(): DataColumn = convertTo() @@ -133,18 +137,7 @@ public fun DataColumn.convertToDouble(): DataColumn = con */ @JvmName("convertToDoubleFromString") public fun DataColumn.convertToDouble(locale: Locale? = null): DataColumn { - if (locale is Locale) { - val explicitConverter = Parsers.getDoubleConverter(locale) as (String) -> Double? 
- return map { explicitConverter(it.trim()) ?: error("Can't convert `$it` to Double") } - } else { - return try { - val defaultConverter = Parsers.getDoubleConverter() as (String) -> Double? - map { defaultConverter(it.trim()) ?: error("Can't convert `$it` to Double") } - } catch (e: TypeConversionException) { - val posixConverter = Parsers.getDoubleConverter(Locale.forLanguageTag("C.UTF-8")) as (String) -> Double? - map { posixConverter(it.trim()) ?: error("Can't convert `$it` to Double") } - } - } + return this.castToNullable().convertToDouble(locale).castToNotNullable() } /** @@ -154,16 +147,16 @@ public fun DataColumn.convertToDouble(locale: Locale? = null): DataColum */ @JvmName("convertToDoubleFromStringNullable") public fun DataColumn.convertToDouble(locale: Locale? = null): DataColumn { - if (locale is Locale) { - val explicitConverter = Parsers.getDoubleConverter(locale) as (String) -> Double? - return map { it?.let { explicitConverter(it.trim()) ?: error("Can't convert `$it` to Double") } } + if (locale != null) { + val explicitParser = Parsers.getDoubleParser(locale) + return map { it?.let { explicitParser(it.trim()) ?: throw TypeConversionException(it, typeOf(), typeOf()) } } } else { return try { - val defaultConverter = Parsers.getDoubleConverter() as (String) -> Double? - map { it?.let { defaultConverter(it.trim()) ?: error("Can't convert `$it` to Double") } } - } catch (e: IllegalStateException) { - val posixConverter = Parsers.getDoubleConverter(Locale.forLanguageTag("C.UTF-8")) as (String) -> Double? 
- map { it?.let { posixConverter(it.trim()) ?: error("Can't convert `$it` to Double") } } + val defaultParser = Parsers.getDoubleParser() + map { it?.let { defaultParser(it.trim()) ?: throw TypeConversionException(it, typeOf(), typeOf()) } } + } catch (e: TypeConversionException) { + val posixParser = Parsers.getDoubleParser(Locale.forLanguageTag("C.UTF-8")) + map { it?.let { posixParser(it.trim()) ?: throw TypeConversionException(it, typeOf(), typeOf()) } } } } } diff --git a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt index 0af322eda..6c4a59bf6 100644 --- a/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt +++ b/core/src/main/kotlin/org/jetbrains/kotlinx/dataframe/impl/api/parse.kt @@ -273,11 +273,11 @@ internal object Parsers : GlobalParserOptions { return parser.applyOptions(options) } - internal fun getDoubleConverter(locale: Locale? = null): TypeConverter { + internal fun getDoubleParser(locale: Locale? = null): (String) -> Double? 
{ val options = if (locale != null) ParserOptions( locale = locale ) else null - return parserToDoubleWithOptions.toConverter(options) + return parserToDoubleWithOptions.applyOptions(options) } } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt index c87e19c88..be7fef78c 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/io/ParserTests.kt @@ -132,7 +132,7 @@ class ParserTests { Locale.setDefault(Locale.forLanguageTag("ru-RU")) columnDot.convertTo().shouldBe(columnOf(12.345, 67.89)) - columnComma.convertTo().shouldBe(columnOf(12345.0, 67890.0)) + columnComma.convertTo().shouldBe(columnOf(12.345, 67.89)) columnMixed.convertTo().shouldBe(columnOf(12.345, 67890.0)) columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89)) From d30824af3782d01c5987bf839a275b9f48615f25 Mon Sep 17 00:00:00 2001 From: Kopilov Aleksandr Date: Sun, 7 Aug 2022 18:48:43 +0300 Subject: [PATCH 7/7] Make tests running on alternative locale --- .../dataframe/jupyter/SampleNotebooksTests.kt | 25 ++-- .../kotlinx/dataframe/puzzles/BasicTests.kt | 4 +- .../dataframe/rendering/PrecisionTests.kt | 4 +- .../dataframe/rendering/RenderingTests.kt | 8 +- .../jetbrains/kotlinx/dataframe/api/parse.kt | 112 ++++++++++-------- 5 files changed, 92 insertions(+), 61 deletions(-) diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/SampleNotebooksTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/SampleNotebooksTests.kt index dcce21c62..fa750ab53 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/SampleNotebooksTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/jupyter/SampleNotebooksTests.kt @@ -6,6 +6,7 @@ import org.jetbrains.jupyter.parser.notebook.Output import org.junit.Ignore import org.junit.Test 
import java.io.File +import java.util.Locale class SampleNotebooksTests : DataFrameJupyterTest() { @Test @@ -39,13 +40,23 @@ class SampleNotebooksTests : DataFrameJupyterTest() { ) @Test - fun netflix() = exampleTest( - "netflix", - replacer = CodeReplacer.byMap( - testFile("netflix", "country_codes.csv"), - testFile("netflix", "netflix_titles.csv"), - ) - ) + fun netflix() { + val currentLocale = Locale.getDefault() + try { + // Set explicit locale as of test data contains locale-dependent values (date for parsing) + Locale.setDefault(Locale.forLanguageTag("en-US")) + + exampleTest( + "netflix", + replacer = CodeReplacer.byMap( + testFile("netflix", "country_codes.csv"), + testFile("netflix", "netflix_titles.csv"), + ) + ) + } finally { + Locale.setDefault(currentLocale) + } + } @Test @Ignore diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/puzzles/BasicTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/puzzles/BasicTests.kt index 1b75e380a..18f13ec75 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/puzzles/BasicTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/puzzles/BasicTests.kt @@ -5,6 +5,7 @@ import org.jetbrains.kotlinx.dataframe.api.* import org.jetbrains.kotlinx.dataframe.api.columnOf import org.jetbrains.kotlinx.dataframe.api.dataFrameOf import org.junit.Test +import java.text.DecimalFormatSymbols import kotlin.reflect.typeOf class BasicTests { @@ -135,7 +136,8 @@ class BasicTests { fun `append and drop new row`() { val modifiedDf = df.append("dog", 5.5, 2, "no") - modifiedDf[10].toString() shouldBe "{ animal:dog, age:5.500000, visits:2, priority:no }" + val d = DecimalFormatSymbols.getInstance().decimalSeparator + modifiedDf[10].toString() shouldBe "{ animal:dog, age:5${d}500000, visits:2, priority:no }" modifiedDf.dropLast() shouldBe df } diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/rendering/PrecisionTests.kt 
b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/rendering/PrecisionTests.kt index 5688b272f..1cc860111 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/rendering/PrecisionTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/rendering/PrecisionTests.kt @@ -8,6 +8,7 @@ import org.jetbrains.kotlinx.dataframe.io.RendererDecimalFormat import org.jetbrains.kotlinx.dataframe.io.defaultPrecision import org.jetbrains.kotlinx.dataframe.io.format import org.junit.Test +import java.text.DecimalFormatSymbols class PrecisionTests { @@ -26,8 +27,9 @@ class PrecisionTests { @Test fun format() { + val d = DecimalFormatSymbols.getInstance().decimalSeparator val value = 1.2341 - val expected = "1.23" + val expected = "1${d}23" val digits = 2 val formatter = RendererDecimalFormat.fromPrecision(digits) value.format(formatter) shouldBe expected diff --git a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/rendering/RenderingTests.kt b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/rendering/RenderingTests.kt index 71049f3dd..b2a0b2584 100644 --- a/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/rendering/RenderingTests.kt +++ b/core/src/test/kotlin/org/jetbrains/kotlinx/dataframe/rendering/RenderingTests.kt @@ -23,6 +23,7 @@ import org.jetbrains.kotlinx.dataframe.jupyter.RenderedContent import org.jsoup.Jsoup import org.junit.Test import java.net.URL +import java.text.DecimalFormatSymbols import kotlin.reflect.typeOf class RenderingTests { @@ -108,10 +109,11 @@ class RenderingTests { @Test fun `render double with exponent`() { + val d = DecimalFormatSymbols.getInstance().decimalSeparator listOf( - dataFrameOf("col")(1E27) to "1.000000e+27", - dataFrameOf("col")(1.123) to "1.123", - dataFrameOf("col")(1.0) to "1.0", + dataFrameOf("col")(1E27) to "1${d}000000e+27", + dataFrameOf("col")(1.123) to "1${d}123", + dataFrameOf("col")(1.0) to "1${d}0", ).forEach { (df, rendered) -> df.toHTML().script shouldContain rendered } diff --git 
a/tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt b/tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt index 8d04ca772..e1fbcd607 100644 --- a/tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt +++ b/tests/src/test/kotlin/org/jetbrains/kotlinx/dataframe/api/parse.kt @@ -9,6 +9,7 @@ import org.jetbrains.kotlinx.dataframe.type import org.junit.Test import java.time.LocalTime import java.time.Month +import java.util.Locale import kotlin.reflect.typeOf import kotlin.time.Duration.Companion.days import kotlin.time.Duration.Companion.hours @@ -18,61 +19,74 @@ import kotlin.time.Duration.Companion.seconds class ParseTests { @Test fun parseDate() { - val date by columnOf("January 1, 2020") - val pattern = "MMMM d, yyyy" - - val parsed = date.parse(ParserOptions(dateTimePattern = pattern)).cast() - - parsed.type() shouldBe typeOf() - with(parsed[0]) { - month shouldBe Month.JANUARY - dayOfMonth shouldBe 1 - year shouldBe 2020 + val currentLocale = Locale.getDefault() + try { + Locale.setDefault(Locale.forLanguageTag("en-US")) + val date by columnOf("January 1, 2020") + val pattern = "MMMM d, yyyy" + + val parsed = date.parse(ParserOptions(dateTimePattern = pattern)).cast() + + parsed.type() shouldBe typeOf() + with(parsed[0]) { + month shouldBe Month.JANUARY + dayOfMonth shouldBe 1 + year shouldBe 2020 + } + + date.convertToLocalDate(pattern) shouldBe parsed + with(date.toDataFrame()) { + convert { date }.toLocalDate(pattern)[date] shouldBe parsed + parse(ParserOptions(dateTimePattern = pattern))[date] shouldBe parsed + } + + DataFrame.parser.addDateTimePattern(pattern) + + date.parse() shouldBe parsed + date.convertToLocalDate() shouldBe parsed + + DataFrame.parser.resetToDefault() + } finally { + Locale.setDefault(currentLocale) } - - date.convertToLocalDate(pattern) shouldBe parsed - with(date.toDataFrame()) { - convert { date }.toLocalDate(pattern)[date] shouldBe parsed - parse(ParserOptions(dateTimePattern = 
pattern))[date] shouldBe parsed - } - - DataFrame.parser.addDateTimePattern(pattern) - - date.parse() shouldBe parsed - date.convertToLocalDate() shouldBe parsed - - DataFrame.parser.resetToDefault() } @Test fun parseDateTime() { - val dateTime by columnOf("3 Jun 2008 13:05:30") - val pattern = "d MMM yyyy HH:mm:ss" - - val parsed = dateTime.parse(ParserOptions(dateTimePattern = pattern)).cast() - - parsed.type() shouldBe typeOf() - with(parsed[0]) { - month shouldBe Month.JUNE - dayOfMonth shouldBe 3 - year shouldBe 2008 - hour shouldBe 13 - minute shouldBe 5 - second shouldBe 30 - } - - dateTime.convertToLocalDateTime(pattern) shouldBe parsed - with(dateTime.toDataFrame()) { - convert { dateTime }.toLocalDateTime(pattern)[dateTime] shouldBe parsed - parse(ParserOptions(dateTimePattern = pattern))[dateTime] shouldBe parsed + val currentLocale = Locale.getDefault() + try { + Locale.setDefault(Locale.forLanguageTag("en-US")) + val dateTime by columnOf("3 Jun 2008 13:05:30") + val pattern = "d MMM yyyy HH:mm:ss" + val locale = Locale.forLanguageTag("en-US") + + val parsed = dateTime.parse(ParserOptions(dateTimePattern = pattern, locale = locale)).cast() + + parsed.type() shouldBe typeOf() + with(parsed[0]) { + month shouldBe Month.JUNE + dayOfMonth shouldBe 3 + year shouldBe 2008 + hour shouldBe 13 + minute shouldBe 5 + second shouldBe 30 + } + + dateTime.convertToLocalDateTime(pattern, locale) shouldBe parsed + with(dateTime.toDataFrame()) { + convert { dateTime }.toLocalDateTime(pattern)[dateTime] shouldBe parsed + parse(ParserOptions(dateTimePattern = pattern))[dateTime] shouldBe parsed + } + + DataFrame.parser.addDateTimePattern(pattern) + + dateTime.parse(ParserOptions(locale = locale)) shouldBe parsed + dateTime.convertToLocalDateTime(pattern, locale) shouldBe parsed + + DataFrame.parser.resetToDefault() + } finally { + Locale.setDefault(currentLocale) } - - DataFrame.parser.addDateTimePattern(pattern) - - dateTime.parse() shouldBe parsed - 
dateTime.convertToLocalDateTime() shouldBe parsed - - DataFrame.parser.resetToDefault() } @Test