From 48a38a04cb154697763c06fb084991567e8e680b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Tue, 8 Nov 2022 08:26:41 +1300 Subject: [PATCH 1/2] Move reader_parser to arrow-cast (#3022) --- arrow-cast/src/parse.rs | 122 +++++++++++++++++++++++++++ arrow/src/csv/reader.rs | 2 +- arrow/src/json/reader.rs | 2 +- arrow/src/util/mod.rs | 1 - arrow/src/util/reader_parser.rs | 142 -------------------------------- 5 files changed, 124 insertions(+), 145 deletions(-) delete mode 100644 arrow/src/util/reader_parser.rs diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 8a9d34b4c637..3e8b56398f9b 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -15,8 +15,10 @@ // specific language governing permissions and limitations // under the License. +use arrow_array::types::*; use arrow_schema::ArrowError; use chrono::prelude::*; +use arrow_array::ArrowPrimitiveType; /// Accepts a string in RFC3339 / ISO8601 standard format and some /// variants and converts it to a nanosecond precision timestamp. @@ -130,6 +132,126 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result<i64, ArrowError> { ))) } +/// Specialized parsing implementations +/// used by csv and json reader +pub trait Parser: ArrowPrimitiveType { + fn parse(string: &str) -> Option<Self::Native>; + + fn parse_formatted(string: &str, _format: &str) -> Option<Self::Native> { + Self::parse(string) + } +} + +impl Parser for Float32Type { + fn parse(string: &str) -> Option<f32> { + lexical_core::parse(string.as_bytes()).ok() + } +} + +impl Parser for Float64Type { + fn parse(string: &str) -> Option<f64> { + lexical_core::parse(string.as_bytes()).ok() + } +} + +macro_rules! 
parser_primitive { + ($t:ty) => { + impl Parser for $t { + fn parse(string: &str) -> Option<Self::Native> { + string.parse::<Self::Native>().ok() + } + } + }; +} +parser_primitive!(UInt64Type); +parser_primitive!(UInt32Type); +parser_primitive!(UInt16Type); +parser_primitive!(UInt8Type); +parser_primitive!(Int64Type); +parser_primitive!(Int32Type); +parser_primitive!(Int16Type); +parser_primitive!(Int8Type); + +impl Parser for TimestampNanosecondType { + fn parse(string: &str) -> Option<i64> { + string_to_timestamp_nanos(string).ok() + } +} + +impl Parser for TimestampMicrosecondType { + fn parse(string: &str) -> Option<i64> { + let nanos = string_to_timestamp_nanos(string).ok(); + nanos.map(|x| x / 1000) + } +} + +impl Parser for TimestampMillisecondType { + fn parse(string: &str) -> Option<i64> { + let nanos = string_to_timestamp_nanos(string).ok(); + nanos.map(|x| x / 1_000_000) + } +} + +impl Parser for TimestampSecondType { + fn parse(string: &str) -> Option<i64> { + let nanos = string_to_timestamp_nanos(string).ok(); + nanos.map(|x| x / 1_000_000_000) + } +} + +parser_primitive!(Time64NanosecondType); +parser_primitive!(Time64MicrosecondType); +parser_primitive!(Time32MillisecondType); +parser_primitive!(Time32SecondType); + +/// Number of days between 0001-01-01 and 1970-01-01 +const EPOCH_DAYS_FROM_CE: i32 = 719_163; + +impl Parser for Date32Type { + fn parse(string: &str) -> Option<i32> { + let date = string.parse::<chrono::NaiveDate>().ok()?; + Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + } + + fn parse_formatted(string: &str, format: &str) -> Option<i32> { + let date = chrono::NaiveDate::parse_from_str(string, format).ok()?; + Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) + } +} + +impl Parser for Date64Type { + fn parse(string: &str) -> Option<i64> { + let date_time = string.parse::<NaiveDateTime>().ok()?; + Some(date_time.timestamp_millis()) + } + + fn parse_formatted(string: &str, format: &str) -> Option<i64> { + use chrono::format::Fixed; + use chrono::format::StrftimeItems; + let fmt = StrftimeItems::new(format); + let has_zone = 
fmt.into_iter().any(|item| match item { + chrono::format::Item::Fixed(fixed_item) => matches!( + fixed_item, + Fixed::RFC2822 + | Fixed::RFC3339 + | Fixed::TimezoneName + | Fixed::TimezoneOffsetColon + | Fixed::TimezoneOffsetColonZ + | Fixed::TimezoneOffset + | Fixed::TimezoneOffsetZ + ), + _ => false, + }); + if has_zone { + let date_time = chrono::DateTime::parse_from_str(string, format).ok()?; + Some(date_time.timestamp_millis()) + } else { + let date_time = NaiveDateTime::parse_from_str(string, format).ok()?; + Some(date_time.timestamp_millis()) + } + } +} + #[cfg(test)] mod tests { use super::*; diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index ff6df5514983..404f37e9309a 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -56,7 +56,7 @@ use crate::array::{ use crate::datatypes::*; use crate::error::{ArrowError, Result}; use crate::record_batch::{RecordBatch, RecordBatchOptions}; -use crate::util::reader_parser::Parser; +use arrow_cast::parse::Parser; use crate::csv::map_csv_error; use csv_crate::{ByteRecord, StringRecord}; diff --git a/arrow/src/json/reader.rs b/arrow/src/json/reader.rs index a7382128e1c8..78c51559a7dd 100644 --- a/arrow/src/json/reader.rs +++ b/arrow/src/json/reader.rs @@ -60,8 +60,8 @@ use crate::datatypes::*; use crate::error::{ArrowError, Result}; use crate::record_batch::{RecordBatch, RecordBatchOptions}; use crate::util::bit_util; -use crate::util::reader_parser::Parser; use crate::{array::*, buffer::Buffer}; +use arrow_cast::parse::Parser; #[derive(Debug, Clone)] enum InferredType { diff --git a/arrow/src/util/mod.rs b/arrow/src/util/mod.rs index 9a0ca852a114..4369ebe7dd45 100644 --- a/arrow/src/util/mod.rs +++ b/arrow/src/util/mod.rs @@ -31,4 +31,3 @@ pub mod string_writer; pub mod test_util; pub use arrow_cast::display; -pub(crate) mod reader_parser; diff --git a/arrow/src/util/reader_parser.rs b/arrow/src/util/reader_parser.rs deleted file mode 100644 index efee629056df..000000000000 --- 
a/arrow/src/util/reader_parser.rs +++ /dev/null @@ -1,142 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -use arrow_array::types::*; -use arrow_array::*; -use arrow_cast::parse::string_to_timestamp_nanos; - -/// Specialized parsing implementations -/// used by csv and json reader -pub(crate) trait Parser: ArrowPrimitiveType { - fn parse(string: &str) -> Option<Self::Native>; - - fn parse_formatted(string: &str, _format: &str) -> Option<Self::Native> { - Self::parse(string) - } -} - -impl Parser for Float32Type { - fn parse(string: &str) -> Option<f32> { - lexical_core::parse(string.as_bytes()).ok() - } -} - -impl Parser for Float64Type { - fn parse(string: &str) -> Option<f64> { - lexical_core::parse(string.as_bytes()).ok() - } -} - -macro_rules! 
parser_primitive { - ($t:ty) => { - impl Parser for $t { - fn parse(string: &str) -> Option<Self::Native> { - string.parse::<Self::Native>().ok() - } - } - }; -} -parser_primitive!(UInt64Type); -parser_primitive!(UInt32Type); -parser_primitive!(UInt16Type); -parser_primitive!(UInt8Type); -parser_primitive!(Int64Type); -parser_primitive!(Int32Type); -parser_primitive!(Int16Type); -parser_primitive!(Int8Type); - -impl Parser for TimestampNanosecondType { - fn parse(string: &str) -> Option<i64> { - string_to_timestamp_nanos(string).ok() - } -} - -impl Parser for TimestampMicrosecondType { - fn parse(string: &str) -> Option<i64> { - let nanos = string_to_timestamp_nanos(string).ok(); - nanos.map(|x| x / 1000) - } -} - -impl Parser for TimestampMillisecondType { - fn parse(string: &str) -> Option<i64> { - let nanos = string_to_timestamp_nanos(string).ok(); - nanos.map(|x| x / 1_000_000) - } -} - -impl Parser for TimestampSecondType { - fn parse(string: &str) -> Option<i64> { - let nanos = string_to_timestamp_nanos(string).ok(); - nanos.map(|x| x / 1_000_000_000) - } -} - -parser_primitive!(Time64NanosecondType); -parser_primitive!(Time64MicrosecondType); -parser_primitive!(Time32MillisecondType); -parser_primitive!(Time32SecondType); - -/// Number of days between 0001-01-01 and 1970-01-01 -const EPOCH_DAYS_FROM_CE: i32 = 719_163; - -impl Parser for Date32Type { - fn parse(string: &str) -> Option<i32> { - use chrono::Datelike; - let date = string.parse::<chrono::NaiveDate>().ok()?; - Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) - } - - fn parse_formatted(string: &str, format: &str) -> Option<i32> { - use chrono::Datelike; - let date = chrono::NaiveDate::parse_from_str(string, format).ok()?; - Some(date.num_days_from_ce() - EPOCH_DAYS_FROM_CE) - } -} - -impl Parser for Date64Type { - fn parse(string: &str) -> Option<i64> { - let date_time = string.parse::<chrono::NaiveDateTime>().ok()?; - Some(date_time.timestamp_millis()) - } - - fn parse_formatted(string: &str, format: &str) -> Option<i64> { - use chrono::format::Fixed; - use chrono::format::StrftimeItems; - let fmt = 
StrftimeItems::new(format); - let has_zone = fmt.into_iter().any(|item| match item { - chrono::format::Item::Fixed(fixed_item) => matches!( - fixed_item, - Fixed::RFC2822 - | Fixed::RFC3339 - | Fixed::TimezoneName - | Fixed::TimezoneOffsetColon - | Fixed::TimezoneOffsetColonZ - | Fixed::TimezoneOffset - | Fixed::TimezoneOffsetZ - ), - _ => false, - }); - if has_zone { - let date_time = chrono::DateTime::parse_from_str(string, format).ok()?; - Some(date_time.timestamp_millis()) - } else { - let date_time = chrono::NaiveDateTime::parse_from_str(string, format).ok()?; - Some(date_time.timestamp_millis()) - } - } -} From 4fc79203ca243ac93c781efb05c336c4723b870b Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Tue, 8 Nov 2022 08:35:22 +1300 Subject: [PATCH 2/2] Format --- arrow-cast/src/parse.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index 3e8b56398f9b..126beb902a55 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -16,9 +16,9 @@ // under the License. use arrow_array::types::*; +use arrow_array::ArrowPrimitiveType; use arrow_schema::ArrowError; use chrono::prelude::*; -use arrow_array::ArrowPrimitiveType; /// Accepts a string in RFC3339 / ISO8601 standard format and some /// variants and converts it to a nanosecond precision timestamp.