From 42e10aca3a42f7909d33193dd1767b3960565acc Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 9 Apr 2021 10:32:02 -0400 Subject: [PATCH] ARROW-12278: [Rust][DataFusion] Use Timestamp(Nanosecond, None) for SQL TIMESTAMP Type # Rationale Running the query `CREATE EXTERNAL TABLE .. (c TIMESTAMP)` today in DataFusion will result in a data type of "Date64" which means that anything more specific than the date will be ignored. This leads to strange behavior such as ```shell echo "Andrew,2018-11-13T17:11:10.011" > /tmp/foo.csv echo "Jorge,2018-12-13T12:12:10.011" >> /tmp/foo.csv cargo run -p datafusion --bin datafusion-cli Finished dev [unoptimized + debuginfo] target(s) in 0.23s Running `target/debug/datafusion-cli` > CREATE EXTERNAL TABLE t(a varchar, b TIMESTAMP) STORED AS CSV LOCATION '/tmp/foo.csv'; 0 rows in set. Query took 0 seconds. > select * from t; +--------+------------+ | a | b | +--------+------------+ | Andrew | 2018-11-13 | | Jorge | 2018-12-13 | +--------+------------+ ``` (note that the time part is chopped off) # Changes This PR changes the default mapping of the SQL type `TIMESTAMP` from `Date64` to `Timestamp(Nanosecond, None)`. Closes #9936 from alamb/ARROW-12278-timestamps-for-timestamps Authored-by: Andrew Lamb Signed-off-by: Andrew Lamb --- rust/datafusion/src/execution/context.rs | 51 ++++++++++++++++++++++-- rust/datafusion/src/sql/planner.rs | 2 +- 2 files changed, 48 insertions(+), 5 deletions(-) diff --git a/rust/datafusion/src/execution/context.rs b/rust/datafusion/src/execution/context.rs index aabe587a0d8ef..ce0ea6d0050f3 100644 --- a/rust/datafusion/src/execution/context.rs +++ b/rust/datafusion/src/execution/context.rs @@ -2837,6 +2837,52 @@ mod tests { Ok(()) } + #[tokio::test] + async fn create_external_table_with_timestamps() { + let mut ctx = ExecutionContext::new(); + + let data = "Jorge,2018-12-13T12:12:10.011\n\ + Andrew,2018-11-13T17:11:10.011"; + + let tmp_dir = TempDir::new().unwrap(); + let file_path = tmp_dir.path().join("timestamps.csv"); + + // scope to ensure the
file is closed and written + { + File::create(&file_path) + .expect("creating temp file") + .write_all(data.as_bytes()) + .expect("writing data"); + } + + let sql = format!( + "CREATE EXTERNAL TABLE csv_with_timestamps ( + name VARCHAR, + ts TIMESTAMP + ) + STORED AS CSV + LOCATION '{}' + ", + file_path.to_str().expect("path is utf8") + ); + + plan_and_collect(&mut ctx, &sql) + .await + .expect("Executing CREATE EXTERNAL TABLE"); + + let sql = "SELECT * from csv_with_timestamps"; + let result = plan_and_collect(&mut ctx, &sql).await.unwrap(); + let expected = vec![ + "+--------+-------------------------+", + "| name | ts |", + "+--------+-------------------------+", + "| Andrew | 2018-11-13 17:11:10.011 |", + "| Jorge | 2018-12-13 12:12:10.011 |", + "+--------+-------------------------+", + ]; + assert_batches_sorted_eq!(expected, &result); + } + struct MyPhysicalPlanner {} impl PhysicalPlanner for MyPhysicalPlanner { @@ -2869,10 +2915,7 @@ mod tests { ctx: &mut ExecutionContext, sql: &str, ) -> Result> { - let logical_plan = ctx.create_logical_plan(sql)?; - let logical_plan = ctx.optimize(&logical_plan)?; - let physical_plan = ctx.create_physical_plan(&logical_plan)?; - collect(physical_plan).await + ctx.sql(sql)?.collect().await } /// Execute SQL and return results diff --git a/rust/datafusion/src/sql/planner.rs b/rust/datafusion/src/sql/planner.rs index f3ea7c9e34d84..f3cba232a23ab 100644 --- a/rust/datafusion/src/sql/planner.rs +++ b/rust/datafusion/src/sql/planner.rs @@ -298,7 +298,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { SQLDataType::Boolean => Ok(DataType::Boolean), SQLDataType::Date => Ok(DataType::Date32), SQLDataType::Time => Ok(DataType::Time64(TimeUnit::Millisecond)), - SQLDataType::Timestamp => Ok(DataType::Date64), + SQLDataType::Timestamp => Ok(DataType::Timestamp(TimeUnit::Nanosecond, None)), _ => Err(DataFusionError::NotImplemented(format!( "The SQL data type {:?} is not implemented", sql_type