Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move reader_parser to arrow-cast (#3022) #3043

Merged
merged 2 commits into from
Nov 7, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 122 additions & 0 deletions arrow-cast/src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@
// specific language governing permissions and limitations
// under the License.

use arrow_array::types::*;
use arrow_array::ArrowPrimitiveType;
use arrow_schema::ArrowError;
use chrono::prelude::*;

Expand Down Expand Up @@ -130,6 +132,126 @@ pub fn string_to_timestamp_nanos(s: &str) -> Result<i64, ArrowError> {
)))
}

/// Specialized string-to-native parsing used by the csv and json readers.
///
/// Implementors supply [`Parser::parse`]; [`Parser::parse_formatted`] has a
/// default that simply ignores the format string.
pub trait Parser: ArrowPrimitiveType {
    /// Parses `string` into this type's native value, returning `None` when
    /// the input cannot be parsed.
    fn parse(string: &str) -> Option<Self::Native>;

    /// Parses `string` using the caller-supplied `_format`.
    ///
    /// The default implementation does not look at `_format` at all and
    /// forwards to [`Parser::parse`]; types that understand format strings
    /// override this method.
    fn parse_formatted(string: &str, _format: &str) -> Option<Self::Native> {
        <Self as Parser>::parse(string)
    }
}

impl Parser for Float32Type {
    /// Parses an `f32` from the UTF-8 bytes of `string` with `lexical_core`.
    ///
    /// Returns `None` when `lexical_core` reports an error for the input.
    fn parse(string: &str) -> Option<f32> {
        let bytes = string.as_bytes();
        lexical_core::parse::<f32>(bytes).ok()
    }
}

impl Parser for Float64Type {
    /// Parses an `f64` from the UTF-8 bytes of `string` with `lexical_core`.
    ///
    /// Returns `None` when `lexical_core` reports an error for the input.
    fn parse(string: &str) -> Option<f64> {
        let bytes = string.as_bytes();
        lexical_core::parse::<f64>(bytes).ok()
    }
}

/// Implements [`Parser`] for `$ty` by delegating to `str::parse` for the
/// type's `Native` value; any parse error is mapped to `None`.
macro_rules! parser_primitive {
    ($ty:ty) => {
        impl Parser for $ty {
            fn parse(string: &str) -> Option<Self::Native> {
                // The target type is inferred from the return type.
                string.parse().ok()
            }
        }
    };
}
// Signed integer types, parsed with `str::parse` via `parser_primitive!`.
parser_primitive!(Int8Type);
parser_primitive!(Int16Type);
parser_primitive!(Int32Type);
parser_primitive!(Int64Type);
// Unsigned integer types, parsed the same way.
parser_primitive!(UInt8Type);
parser_primitive!(UInt16Type);
parser_primitive!(UInt32Type);
parser_primitive!(UInt64Type);

impl Parser for TimestampNanosecondType {
    /// Parses `string` with `string_to_timestamp_nanos` and returns the
    /// nanosecond value unchanged; a parse error becomes `None`.
    fn parse(string: &str) -> Option<i64> {
        match string_to_timestamp_nanos(string) {
            Ok(nanos) => Some(nanos),
            Err(_) => None,
        }
    }
}

impl Parser for TimestampMicrosecondType {
    /// Parses `string` with `string_to_timestamp_nanos` and scales the result
    /// down to microseconds; a parse error becomes `None`.
    ///
    /// The scaling is plain `i64` division, so sub-microsecond digits are
    /// truncated toward zero.
    fn parse(string: &str) -> Option<i64> {
        const NANOS_PER_MICRO: i64 = 1000;
        string_to_timestamp_nanos(string)
            .ok()
            .map(|nanos| nanos / NANOS_PER_MICRO)
    }
}

impl Parser for TimestampMillisecondType {
    /// Parses `string` with `string_to_timestamp_nanos` and scales the result
    /// down to milliseconds; a parse error becomes `None`.
    ///
    /// The scaling is plain `i64` division, so sub-millisecond digits are
    /// truncated toward zero.
    fn parse(string: &str) -> Option<i64> {
        const NANOS_PER_MILLI: i64 = 1_000_000;
        string_to_timestamp_nanos(string)
            .ok()
            .map(|nanos| nanos / NANOS_PER_MILLI)
    }
}

impl Parser for TimestampSecondType {
    /// Parses `string` with `string_to_timestamp_nanos` and scales the result
    /// down to whole seconds; a parse error becomes `None`.
    ///
    /// The scaling is plain `i64` division, so sub-second digits are
    /// truncated toward zero.
    fn parse(string: &str) -> Option<i64> {
        const NANOS_PER_SECOND: i64 = 1_000_000_000;
        string_to_timestamp_nanos(string)
            .ok()
            .map(|nanos| nanos / NANOS_PER_SECOND)
    }
}

// Time-of-day types: `parser_primitive!` parses the string with `str::parse`
// straight into the type's native value (no time-format handling here).
parser_primitive!(Time32SecondType);
parser_primitive!(Time32MillisecondType);
parser_primitive!(Time64MicrosecondType);
parser_primitive!(Time64NanosecondType);

/// Day number of 1970-01-01 in chrono's `num_days_from_ce()` numbering, in
/// which 0001-01-01 is day 1.
///
/// Subtracting this from a date's `num_days_from_ce()` gives the number of
/// days relative to the Unix epoch (negative for earlier dates).
const EPOCH_DAYS_FROM_CE: i32 = 719_163;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Eventually it would be nice to move this into some sort of arrow-common crate so that all the date/time constants live in the same place (I feel like they are scattered around right now).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think arrow_array::temporal_conversions has most of them now. In general, I hope we can avoid an arrow-common crate; util libraries have a tendency to become generic dumping grounds without a clear focus 😅


impl Parser for Date32Type {
    /// Parses `string` through `NaiveDate`'s `FromStr` implementation and
    /// returns the date as a day count relative to 1970-01-01.
    ///
    /// Returns `None` when chrono cannot parse the input.
    fn parse(string: &str) -> Option<i32> {
        string
            .parse::<chrono::NaiveDate>()
            .ok()
            .map(|day| day.num_days_from_ce() - EPOCH_DAYS_FROM_CE)
    }

    /// Parses `string` according to the chrono format string `format` and
    /// returns the date as a day count relative to 1970-01-01.
    ///
    /// Returns `None` when the input does not match `format`.
    fn parse_formatted(string: &str, format: &str) -> Option<i32> {
        chrono::NaiveDate::parse_from_str(string, format)
            .ok()
            .map(|day| day.num_days_from_ce() - EPOCH_DAYS_FROM_CE)
    }
}

impl Parser for Date64Type {
    /// Parses `string` through `NaiveDateTime`'s `FromStr` implementation and
    /// returns its `timestamp_millis()` value.
    ///
    /// Returns `None` when chrono cannot parse the input.
    fn parse(string: &str) -> Option<i64> {
        string
            .parse::<NaiveDateTime>()
            .ok()
            .map(|stamp| stamp.timestamp_millis())
    }

    /// Parses `string` according to the chrono format string `format` and
    /// returns the resulting `timestamp_millis()` value.
    ///
    /// The format is scanned first: if it contains any timezone-bearing item
    /// (RFC 2822, RFC 3339, a timezone name, or one of the offset
    /// specifiers), the input is parsed as an offset-aware `DateTime`;
    /// otherwise it is parsed as a `NaiveDateTime`.
    ///
    /// Returns `None` when the input does not match `format`.
    fn parse_formatted(string: &str, format: &str) -> Option<i64> {
        use chrono::format::{Fixed, StrftimeItems};

        // Does the format string carry timezone information?
        let zoned = StrftimeItems::new(format).any(|item| {
            if let chrono::format::Item::Fixed(spec) = item {
                matches!(
                    spec,
                    Fixed::RFC2822
                        | Fixed::RFC3339
                        | Fixed::TimezoneName
                        | Fixed::TimezoneOffsetColon
                        | Fixed::TimezoneOffsetColonZ
                        | Fixed::TimezoneOffset
                        | Fixed::TimezoneOffsetZ
                )
            } else {
                false
            }
        });

        let millis = if zoned {
            chrono::DateTime::parse_from_str(string, format)
                .ok()?
                .timestamp_millis()
        } else {
            NaiveDateTime::parse_from_str(string, format)
                .ok()?
                .timestamp_millis()
        };
        Some(millis)
    }
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
2 changes: 1 addition & 1 deletion arrow/src/csv/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ use crate::array::{
use crate::datatypes::*;
use crate::error::{ArrowError, Result};
use crate::record_batch::{RecordBatch, RecordBatchOptions};
use crate::util::reader_parser::Parser;
use arrow_cast::parse::Parser;

use crate::csv::map_csv_error;
use csv_crate::{ByteRecord, StringRecord};
Expand Down
2 changes: 1 addition & 1 deletion arrow/src/json/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,8 @@ use crate::datatypes::*;
use crate::error::{ArrowError, Result};
use crate::record_batch::{RecordBatch, RecordBatchOptions};
use crate::util::bit_util;
use crate::util::reader_parser::Parser;
use crate::{array::*, buffer::Buffer};
use arrow_cast::parse::Parser;

#[derive(Debug, Clone)]
enum InferredType {
Expand Down
1 change: 0 additions & 1 deletion arrow/src/util/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,3 @@ pub mod string_writer;
pub mod test_util;

pub use arrow_cast::display;
pub(crate) mod reader_parser;
142 changes: 0 additions & 142 deletions arrow/src/util/reader_parser.rs

This file was deleted.