-
Notifications
You must be signed in to change notification settings - Fork 63
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
More Converting operations #133
Changes from 1 commit
7a5fdfb
64cd9e8
3e2329e
7572dbf
4301051
fff2b98
d30824a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -171,23 +171,18 @@ internal object Parsers : GlobalParserOptions { | |
return null | ||
} | ||
|
||
private val posixNumberFormat = NumberFormat.getInstance(Locale.forLanguageTag("C.UTF-8")) | ||
|
||
private fun String.parseDouble(userNumberFormat: NumberFormat) = | ||
private fun String.parseDouble(format: NumberFormat) = | ||
when (uppercase(Locale.getDefault())) { | ||
"NAN" -> Double.NaN | ||
"INF" -> Double.POSITIVE_INFINITY | ||
"-INF" -> Double.NEGATIVE_INFINITY | ||
"INFINITY" -> Double.POSITIVE_INFINITY | ||
"-INFINITY" -> Double.NEGATIVE_INFINITY | ||
else -> { | ||
fun parseWithFormat(format: NumberFormat): Double? { | ||
val parsePosition = ParsePosition(0) | ||
val result: Double? = format.parse(this, parsePosition)?.toDouble() | ||
return if (parsePosition.index != this.length) null | ||
else result | ||
} | ||
parseWithFormat(userNumberFormat) ?: parseWithFormat(posixNumberFormat) | ||
val parsePosition = ParsePosition(0) | ||
val result: Double? = format.parse(this, parsePosition)?.toDouble() | ||
if (parsePosition.index != this.length) null | ||
else result | ||
} | ||
} | ||
|
||
|
@@ -199,6 +194,12 @@ internal object Parsers : GlobalParserOptions { | |
inline fun <reified T : Any> stringParserWithOptions(noinline body: (ParserOptions?) -> ((String) -> T?)) = | ||
StringParserWithFormat(typeOf<T>(), body) | ||
|
||
private val parserToDoubleWithOptions = stringParserWithOptions { options -> | ||
val numberFormat = NumberFormat.getInstance(options?.locale ?: Locale.getDefault()) | ||
val parser = { it: String -> it.parseDouble(numberFormat) } | ||
parser | ||
} | ||
|
||
private val parsersOrder = listOf( | ||
stringParser { it.toIntOrNull() }, | ||
stringParser { it.toLongOrNull() }, | ||
|
@@ -231,12 +232,12 @@ internal object Parsers : GlobalParserOptions { | |
|
||
stringParser { it.toUrlOrNull() }, | ||
|
||
stringParserWithOptions { options -> | ||
// Double, with explicit number format or taken from current locale | ||
parserToDoubleWithOptions, | ||
|
||
// Double, with POSIX format | ||
stringParser { it.parseDouble(NumberFormat.getInstance(Locale.forLanguageTag("C.UTF-8"))) }, | ||
|
||
val numberFormat = NumberFormat.getInstance(options?.locale ?: Locale.getDefault()) | ||
val parser = { it: String -> it.parseDouble(numberFormat) } | ||
parser | ||
}, | ||
stringParser { it.toBooleanOrNull() }, | ||
stringParser { it.toBigDecimalOrNull() }, | ||
|
||
|
@@ -271,6 +272,13 @@ internal object Parsers : GlobalParserOptions { | |
) else null | ||
return parser.applyOptions(options) | ||
} | ||
|
||
internal fun getDoubleConverter(locale: Locale? = null): TypeConverter { | ||
val options = if (locale != null) ParserOptions( | ||
locale = locale | ||
) else null | ||
return parserToDoubleWithOptions.toConverter(options) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. Originally I used |
||
} | ||
} | ||
|
||
internal fun DataColumn<String?>.tryParseImpl(options: ParserOptions?): DataColumn<*> { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,14 @@ | ||
package org.jetbrains.kotlinx.dataframe.io | ||
|
||
import io.kotest.assertions.throwables.shouldThrow | ||
import io.kotest.matchers.shouldBe | ||
import kotlinx.datetime.LocalDateTime | ||
import org.jetbrains.kotlinx.dataframe.DataColumn | ||
import org.jetbrains.kotlinx.dataframe.DataFrame | ||
import org.jetbrains.kotlinx.dataframe.api.cast | ||
import org.jetbrains.kotlinx.dataframe.api.columnOf | ||
import org.jetbrains.kotlinx.dataframe.api.convertTo | ||
import org.jetbrains.kotlinx.dataframe.api.convertToDouble | ||
import org.jetbrains.kotlinx.dataframe.api.parse | ||
import org.jetbrains.kotlinx.dataframe.api.parser | ||
import org.jetbrains.kotlinx.dataframe.api.tryParse | ||
|
@@ -77,18 +79,73 @@ class ParserTests { | |
fun `converting String to Double in different locales`() { | ||
val currentLocale = Locale.getDefault() | ||
try { | ||
val stringValues = listOf("1", "2.3", "4,5") | ||
val stringColumn = DataColumn.createValueColumn("nums", stringValues, typeOf<String>()) | ||
Locale.setDefault(Locale.forLanguageTag("ru-RU")) | ||
// Use comma as local decimal separator and dot as fallback default (as it is used in POSIX/C.UTF-8) | ||
stringColumn.convertTo<Double>().shouldBe( | ||
DataColumn.createValueColumn("nums", listOf(1.0, 2.3, 4.5), typeOf<Double>()) | ||
) | ||
// Test 36 behaviour combinations: | ||
|
||
// 3 source columns | ||
val columnDot = columnOf("12.345", "67.890") | ||
val columnComma = columnOf("12,345", "67,890") | ||
val columnMixed = columnOf("12.345", "67,890") | ||
// * | ||
// (3 locales as converting parameter + original converting) | ||
val parsingLocaleNotDefined: Locale? = null | ||
val parsingLocaleUsesDot: Locale = Locale.forLanguageTag("en-US") | ||
val parsingLocaleUsesComma: Locale = Locale.forLanguageTag("ru-RU") | ||
// * | ||
// 3 system locales | ||
|
||
Locale.setDefault(Locale.forLanguageTag("C.UTF-8")) | ||
|
||
columnDot.convertTo<Double>().shouldBe(columnOf(12.345, 67.89)) | ||
columnComma.convertTo<Double>().shouldBe(columnOf(12345.0, 67890.0)) | ||
columnMixed.convertTo<Double>().shouldBe(columnOf(12.345, 67890.0)) | ||
|
||
columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89)) | ||
columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12345.0, 67890.0)) | ||
columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0)) | ||
|
||
columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89)) | ||
columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0)) | ||
columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0)) | ||
|
||
shouldThrow<TypeConversionException> { columnDot.convertToDouble(parsingLocaleUsesComma) } | ||
columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89)) | ||
shouldThrow<TypeConversionException> { columnMixed.convertToDouble(parsingLocaleUsesComma) } | ||
|
||
Locale.setDefault(Locale.forLanguageTag("en-US")) | ||
// Use dot as local decimal separator. Comma is ignored (as it is group separator in this locale). | ||
stringColumn.convertTo<Double>().shouldBe( | ||
DataColumn.createValueColumn("nums", listOf(1.0, 2.3, 45.0), typeOf<Double>()) | ||
) | ||
|
||
columnDot.convertTo<Double>().shouldBe(columnOf(12.345, 67.89)) | ||
columnComma.convertTo<Double>().shouldBe(columnOf(12345.0, 67890.0)) | ||
columnMixed.convertTo<Double>().shouldBe(columnOf(12.345, 67890.0)) | ||
|
||
columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89)) | ||
columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12345.0, 67890.0)) | ||
columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0)) | ||
|
||
columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89)) | ||
columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0)) | ||
columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0)) | ||
|
||
shouldThrow<TypeConversionException> { columnDot.convertToDouble(parsingLocaleUsesComma) } | ||
columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89)) | ||
shouldThrow<TypeConversionException> { columnMixed.convertToDouble(parsingLocaleUsesComma) } | ||
|
||
Locale.setDefault(Locale.forLanguageTag("ru-RU")) | ||
|
||
columnDot.convertTo<Double>().shouldBe(columnOf(12.345, 67.89)) | ||
columnComma.convertTo<Double>().shouldBe(columnOf(12345.0, 67890.0)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't it be 12.345, 67.890? Looks suspicious that default convertTo works identically for en_US and ru_RU. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Fixed, thanks! But I have not found a better way than calling |
||
columnMixed.convertTo<Double>().shouldBe(columnOf(12.345, 67890.0)) | ||
|
||
columnDot.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89)) | ||
columnComma.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67.89)) | ||
columnMixed.convertToDouble(parsingLocaleNotDefined).shouldBe(columnOf(12.345, 67890.0)) | ||
|
||
columnDot.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67.89)) | ||
columnComma.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12345.0, 67890.0)) | ||
columnMixed.convertToDouble(parsingLocaleUsesDot).shouldBe(columnOf(12.345, 67890.0)) | ||
|
||
shouldThrow<TypeConversionException> { columnDot.convertToDouble(parsingLocaleUsesComma) } | ||
columnComma.convertToDouble(parsingLocaleUsesComma).shouldBe(columnOf(12.345, 67.89)) | ||
shouldThrow<TypeConversionException> { columnMixed.convertToDouble(parsingLocaleUsesComma) } | ||
} finally { | ||
Locale.setDefault(currentLocale) | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What do you think?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And then simply call
map { converter(it) }
? This doesn't seem to work because we wanted the scalar converter not to use fallback logic. And it is implemented here now…