Skip to content

Commit

Permalink
fix(rust, python): read_csv was parsing dates incorrectly when the dtype was overridden (#9420)
Browse files Browse the repository at this point in the history
  • Loading branch information
MarcoGorelli authored Jun 18, 2023
1 parent 80751fa commit ff2a97e
Show file tree
Hide file tree
Showing 4 changed files with 151 additions and 46 deletions.
52 changes: 37 additions & 15 deletions polars/polars-io/src/csv/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@ use polars_core::prelude::*;
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
use polars_time::chunkedarray::utf8::Pattern;
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
use polars_time::prelude::utf8::infer::{infer_pattern_single, DatetimeInfer, StrpTimeParser};
use polars_time::prelude::utf8::infer::{
infer_pattern_single, DatetimeInfer, StrpTimeParser, TryFromWithUnit,
};

use crate::csv::parser::{is_whitespace, skip_whitespace};
use crate::csv::read_impl::RunningSize;
Expand Down Expand Up @@ -61,6 +63,7 @@ trait ParsedBuffer {
ignore_errors: bool,
_needs_escaping: bool,
_missing_is_null: bool,
_time_unit: Option<TimeUnit>,
) -> PolarsResult<()>;
}

Expand All @@ -75,6 +78,7 @@ where
ignore_errors: bool,
needs_escaping: bool,
_missing_is_null: bool,
_time_unit: Option<TimeUnit>,
) -> PolarsResult<()> {
if bytes.is_empty() {
self.append_null()
Expand All @@ -100,6 +104,7 @@ where
ignore_errors,
false, // escaping was already done
_missing_is_null,
None,
);
}
polars_ensure!(
Expand Down Expand Up @@ -169,6 +174,7 @@ impl ParsedBuffer for Utf8Field {
ignore_errors: bool,
needs_escaping: bool,
missing_is_null: bool,
_time_unit: Option<TimeUnit>,
) -> PolarsResult<()> {
if bytes.is_empty() {
// append null
Expand Down Expand Up @@ -277,6 +283,7 @@ impl<'a> CategoricalField<'a> {
ignore_errors: bool,
needs_escaping: bool,
_missing_is_null: bool,
_time_unit: Option<TimeUnit>,
) -> PolarsResult<()> {
if bytes.is_empty() {
self.builder.append_null();
Expand Down Expand Up @@ -360,6 +367,7 @@ impl ParsedBuffer for BooleanChunkedBuilder {
ignore_errors: bool,
needs_escaping: bool,
_missing_is_null: bool,
_time_unit: Option<TimeUnit>,
) -> PolarsResult<()> {
let bytes = if needs_escaping {
&bytes[1..bytes.len() - 1]
Expand Down Expand Up @@ -403,11 +411,12 @@ impl<T: PolarsNumericType> DatetimeField<T> {
fn slow_datetime_parser<T>(
buf: &mut DatetimeField<T>,
bytes: &[u8],
time_unit: Option<TimeUnit>,
ignore_errors: bool,
) -> PolarsResult<()>
where
T: PolarsNumericType,
DatetimeInfer<T::Native>: TryFrom<Pattern>,
DatetimeInfer<T::Native>: TryFromWithUnit<Pattern>,
{
let val = if bytes.is_ascii() {
// Safety:
Expand All @@ -433,7 +442,7 @@ where
}
},
};
match DatetimeInfer::<T::Native>::try_from(pattern) {
match DatetimeInfer::<T::Native>::try_from_with_unit(pattern, time_unit) {
Ok(mut infer) => {
let parsed = infer.parse(val);
buf.compiled = Some(infer);
Expand All @@ -451,7 +460,7 @@ where
impl<T> ParsedBuffer for DatetimeField<T>
where
T: PolarsNumericType,
DatetimeInfer<T::Native>: TryFrom<Pattern> + StrpTimeParser<T::Native>,
DatetimeInfer<T::Native>: TryFromWithUnit<Pattern> + StrpTimeParser<T::Native>,
{
#[inline]
fn parse_bytes(
Expand All @@ -460,23 +469,24 @@ where
ignore_errors: bool,
needs_escaping: bool,
_missing_is_null: bool,
time_unit: Option<TimeUnit>,
) -> PolarsResult<()> {
if needs_escaping && bytes.len() > 2 {
bytes = &bytes[1..bytes.len() - 1]
}

match &mut self.compiled {
None => slow_datetime_parser(self, bytes, ignore_errors),
None => slow_datetime_parser(self, bytes, time_unit, ignore_errors),
Some(compiled) => {
match compiled.parse_bytes(bytes) {
match compiled.parse_bytes(bytes, time_unit) {
Some(parsed) => {
self.builder.append_value(parsed);
Ok(())
}
// fall back on chrono parser
// this is a lot slower, we need to do utf8 checking and use
// the slower parser
None => slow_datetime_parser(self, bytes, ignore_errors),
None => slow_datetime_parser(self, bytes, time_unit, ignore_errors),
}
}
}
Expand Down Expand Up @@ -727,77 +737,89 @@ impl<'a> Buffer<'a> {
ignore_errors,
needs_escaping,
missing_is_null,
None,
),
Int32(buf) => <PrimitiveChunkedBuilder<Int32Type> as ParsedBuffer>::parse_bytes(
buf,
bytes,
ignore_errors,
needs_escaping,
missing_is_null,
None,
),
Int64(buf) => <PrimitiveChunkedBuilder<Int64Type> as ParsedBuffer>::parse_bytes(
buf,
bytes,
ignore_errors,
needs_escaping,
missing_is_null,
None,
),
UInt64(buf) => <PrimitiveChunkedBuilder<UInt64Type> as ParsedBuffer>::parse_bytes(
buf,
bytes,
ignore_errors,
needs_escaping,
missing_is_null,
None,
),
UInt32(buf) => <PrimitiveChunkedBuilder<UInt32Type> as ParsedBuffer>::parse_bytes(
buf,
bytes,
ignore_errors,
needs_escaping,
missing_is_null,
None,
),
Float32(buf) => <PrimitiveChunkedBuilder<Float32Type> as ParsedBuffer>::parse_bytes(
buf,
bytes,
ignore_errors,
needs_escaping,
missing_is_null,
None,
),
Float64(buf) => <PrimitiveChunkedBuilder<Float64Type> as ParsedBuffer>::parse_bytes(
buf,
bytes,
ignore_errors,
needs_escaping,
missing_is_null,
None,
),
Utf8(buf) => <Utf8Field as ParsedBuffer>::parse_bytes(
buf,
bytes,
ignore_errors,
needs_escaping,
missing_is_null,
None,
),
#[cfg(feature = "dtype-datetime")]
Datetime { buf, .. } => <DatetimeField<Int64Type> as ParsedBuffer>::parse_bytes(
buf,
bytes,
ignore_errors,
needs_escaping,
missing_is_null,
),
Datetime { buf, time_unit, .. } => {
<DatetimeField<Int64Type> as ParsedBuffer>::parse_bytes(
buf,
bytes,
ignore_errors,
needs_escaping,
missing_is_null,
Some(*time_unit),
)
}
#[cfg(feature = "dtype-date")]
Date(buf) => <DatetimeField<Int32Type> as ParsedBuffer>::parse_bytes(
buf,
bytes,
ignore_errors,
needs_escaping,
missing_is_null,
None,
),
#[allow(unused_variables)]
Categorical(buf) => {
#[cfg(feature = "dtype-categorical")]
{
buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null)
buf.parse_bytes(bytes, ignore_errors, needs_escaping, missing_is_null, None)
}

#[cfg(not(feature = "dtype-categorical"))]
Expand Down
7 changes: 4 additions & 3 deletions polars/polars-io/src/ndjson/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ use num_traits::NumCast;
use polars_core::frame::row::AnyValueBuffer;
use polars_core::prelude::*;
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
use polars_time::prelude::utf8::infer::{infer_pattern_single, DatetimeInfer};
use polars_time::prelude::utf8::infer::{infer_pattern_single, DatetimeInfer, TryFromWithUnit};
#[cfg(any(feature = "dtype-datetime", feature = "dtype-date"))]
use polars_time::prelude::utf8::Pattern;
use simd_json::{BorrowedValue as Value, KnownKey, StaticNode};
Expand Down Expand Up @@ -148,14 +148,15 @@ fn deserialize_number<T: NativeType + NumCast>(value: &Value) -> Option<T> {
fn deserialize_datetime<T>(value: &Value) -> Option<T::Native>
where
T: PolarsNumericType,
DatetimeInfer<T::Native>: TryFrom<Pattern>,
DatetimeInfer<T::Native>: TryFromWithUnit<Pattern>,
{
let val = match value {
Value::String(s) => s,
_ => return None,
};
infer_pattern_single(val).and_then(|pattern| {
match DatetimeInfer::<T::Native>::try_from(pattern) {
match DatetimeInfer::<T::Native>::try_from_with_unit(pattern, Some(TimeUnit::Microseconds))
{
Ok(mut infer) => infer.parse(val),
Err(_) => None,
}
Expand Down
Loading

0 comments on commit ff2a97e

Please sign in to comment.