diff --git a/arrow-csv/src/reader.rs b/arrow-csv/src/reader.rs index 459c23ad2616..2fb6493e1be6 100644 --- a/arrow-csv/src/reader.rs +++ b/arrow-csv/src/reader.rs @@ -22,7 +22,7 @@ //! //! Example: //! -//! ```no_run +//! ``` //! # use arrow_schema::*; //! # use arrow_csv::Reader; //! # use std::fs::File; @@ -1131,11 +1131,432 @@ impl ReaderBuilder { mod tests { use super::*; - use std::io::Write; + use std::io::{Cursor, Write}; use tempfile::NamedTempFile; use chrono::prelude::*; + #[test] + fn test_csv() { + let _: Vec<()> = vec![None, Some("%Y-%m-%dT%H:%M:%S%.f%:z".to_string())] + .into_iter() + .map(|format| { + let schema = Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Float64, false), + Field::new("lng", DataType::Float64, false), + ]); + + let file = File::open("test/data/uk_cities.csv").unwrap(); + let mut csv = Reader::new( + file, + Arc::new(schema.clone()), + false, + None, + 1024, + None, + None, + format, + ); + assert_eq!(Arc::new(schema), csv.schema()); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(37, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + + // access data from a primitive array + let lat = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(57.653484, lat.value(0)); + + // access data from a string array (ListArray) + let city = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); + }) + .collect(); + } + + #[test] + fn test_csv_schema_metadata() { + let mut metadata = std::collections::HashMap::new(); + metadata.insert("foo".to_owned(), "bar".to_owned()); + let schema = Schema::new_with_metadata( + vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Float64, false), + Field::new("lng", DataType::Float64, false), + ], + metadata.clone(), + ); + + let file = File::open("test/data/uk_cities.csv").unwrap(); + + let mut csv = Reader::new( + file, + Arc::new(schema.clone()), + false, + None, + 1024, + None, + None, + None, + ); + assert_eq!(Arc::new(schema), csv.schema()); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(37, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + + assert_eq!(&metadata, batch.schema().metadata()); + } + + #[test] + fn test_csv_reader_with_decimal() { + let schema = Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Decimal128(38, 6), false), + Field::new("lng", DataType::Decimal128(38, 6), false), + ]); + + let file = File::open("test/data/decimal_test.csv").unwrap(); + + let mut csv = + Reader::new(file, Arc::new(schema), false, None, 1024, None, None, None); + let batch = csv.next().unwrap().unwrap(); + // access data from a primitive array + let lat = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!("57.653484", lat.value_as_string(0)); + assert_eq!("53.002666", lat.value_as_string(1)); + assert_eq!("52.412811", lat.value_as_string(2)); + assert_eq!("51.481583", lat.value_as_string(3)); + assert_eq!("12.123456", lat.value_as_string(4)); + assert_eq!("50.760000", lat.value_as_string(5)); + assert_eq!("0.123000", lat.value_as_string(6)); + assert_eq!("123.000000", lat.value_as_string(7)); + assert_eq!("123.000000", lat.value_as_string(8)); + assert_eq!("-50.760000", lat.value_as_string(9)); + } + + #[test] + fn test_csv_from_buf_reader() { + let schema = Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Float64, false), + Field::new("lng", DataType::Float64, false), + ]); + + let file_with_headers = + File::open("test/data/uk_cities_with_headers.csv").unwrap(); + let file_without_headers = File::open("test/data/uk_cities.csv").unwrap(); + let both_files = file_with_headers + .chain(Cursor::new("\n".to_string())) + .chain(file_without_headers); + let mut csv = Reader::from_reader( + both_files, + Arc::new(schema), + true, + None, + 1024, + None, + None, + None, + ); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(74, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + } + + #[test] + fn test_csv_with_schema_inference() { + let file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); + + let builder = ReaderBuilder::new().has_header(true).infer_schema(None); + + let mut csv = builder.build(file).unwrap(); + let expected_schema = Schema::new(vec![ + Field::new("city", DataType::Utf8, true), + Field::new("lat", DataType::Float64, true), + Field::new("lng", DataType::Float64, true), + ]); + assert_eq!(Arc::new(expected_schema), csv.schema()); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(37, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + + // access data from a primitive array + let lat = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(57.653484, lat.value(0)); + + // access data from a string array (ListArray) + let city = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); + } + + #[test] + fn test_csv_with_schema_inference_no_headers() { + let file = File::open("test/data/uk_cities.csv").unwrap(); + + let builder = ReaderBuilder::new().infer_schema(None); + + let mut csv = builder.build(file).unwrap(); + + // csv field names should be 'column_{number}' + let schema = csv.schema(); + assert_eq!("column_1", schema.field(0).name()); + assert_eq!("column_2", schema.field(1).name()); + assert_eq!("column_3", schema.field(2).name()); + let batch = csv.next().unwrap().unwrap(); + let batch_schema = batch.schema(); + + assert_eq!(schema, batch_schema); + assert_eq!(37, batch.num_rows()); + assert_eq!(3, batch.num_columns()); + + // access data from a primitive array + let lat = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(57.653484, lat.value(0)); + + // access data from a string array (ListArray) + let city = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); + } + + #[test] + fn test_csv_builder_with_bounds() { + let file = File::open("test/data/uk_cities.csv").unwrap(); + + // Set the bounds to the lines 0, 1 and 2. + let mut csv = ReaderBuilder::new().with_bounds(0, 2).build(file).unwrap(); + let batch = csv.next().unwrap().unwrap(); + + // access data from a string array (ListArray) + let city = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + + // The value on line 0 is within the bounds + assert_eq!("Elgin, Scotland, the UK", city.value(0)); + + // The value on line 13 is outside of the bounds. Therefore + // the call to .value() will panic. + let result = std::panic::catch_unwind(|| city.value(13)); + assert!(result.is_err()); + } + + #[test] + fn test_csv_with_projection() { + let schema = Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Float64, false), + Field::new("lng", DataType::Float64, false), + ]); + + let file = File::open("test/data/uk_cities.csv").unwrap(); + + let mut csv = Reader::new( + file, + Arc::new(schema), + false, + None, + 1024, + None, + Some(vec![0, 1]), + None, + ); + let projected_schema = Arc::new(Schema::new(vec![ + Field::new("city", DataType::Utf8, false), + Field::new("lat", DataType::Float64, false), + ])); + assert_eq!(projected_schema, csv.schema()); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(projected_schema, batch.schema()); + assert_eq!(37, batch.num_rows()); + assert_eq!(2, batch.num_columns()); + } + + #[test] + fn test_csv_with_dictionary() { + let schema = Schema::new(vec![ + Field::new( + "city", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + ), + Field::new("lat", DataType::Float64, false), + Field::new("lng", DataType::Float64, false), + ]); + + let file = File::open("test/data/uk_cities.csv").unwrap(); + + let mut csv = Reader::new( + file, + Arc::new(schema), + false, + None, + 1024, + None, + Some(vec![0, 1]), + None, + ); + let projected_schema = Arc::new(Schema::new(vec![ + Field::new( + "city", + DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), + false, + ), + Field::new("lat", DataType::Float64, false), + ])); + assert_eq!(projected_schema, csv.schema()); + let batch = csv.next().unwrap().unwrap(); + assert_eq!(projected_schema, batch.schema()); + assert_eq!(37, batch.num_rows()); + assert_eq!(2, batch.num_columns()); + + let strings = arrow_cast::cast(batch.column(0), &DataType::Utf8).unwrap(); + let strings = strings.as_any().downcast_ref::().unwrap(); + + assert_eq!(strings.value(0), "Elgin, Scotland, the UK"); + assert_eq!(strings.value(4), "Eastbourne, East Sussex, UK"); + assert_eq!(strings.value(29), "Uckfield, East Sussex, UK"); + } + + #[test] + fn test_nulls() { + let schema = Schema::new(vec![ + Field::new("c_int", DataType::UInt64, false), + Field::new("c_float", DataType::Float32, true), + Field::new("c_string", DataType::Utf8, false), + ]); + + let file = File::open("test/data/null_test.csv").unwrap(); + + let mut csv = + Reader::new(file, Arc::new(schema), true, None, 1024, None, None, None); + let batch = csv.next().unwrap().unwrap(); + + assert!(!batch.column(1).is_null(0)); + assert!(!batch.column(1).is_null(1)); + assert!(batch.column(1).is_null(2)); + assert!(!batch.column(1).is_null(3)); + assert!(!batch.column(1).is_null(4)); + } + + #[test] + fn test_nulls_with_inference() { + let file = File::open("test/data/various_types.csv").unwrap(); + + let builder = ReaderBuilder::new() + .infer_schema(None) + .has_header(true) + .with_delimiter(b'|') + .with_batch_size(512) + .with_projection(vec![0, 1, 2, 3, 4, 5]); + + let mut csv = builder.build(file).unwrap(); + let batch = csv.next().unwrap().unwrap(); + + assert_eq!(7, batch.num_rows()); + assert_eq!(6, batch.num_columns()); + + let schema = batch.schema(); + + assert_eq!(&DataType::Int64, schema.field(0).data_type()); + assert_eq!(&DataType::Float64, schema.field(1).data_type()); + assert_eq!(&DataType::Float64, schema.field(2).data_type()); + assert_eq!(&DataType::Boolean, schema.field(3).data_type()); + assert_eq!(&DataType::Date32, schema.field(4).data_type()); + assert_eq!(&DataType::Date64, schema.field(5).data_type()); + + let names: Vec<&str> = + schema.fields().iter().map(|x| x.name().as_str()).collect(); + assert_eq!( + names, + vec![ + "c_int", + "c_float", + "c_string", + "c_bool", + "c_date", + "c_datetime" + ] + ); + + assert!(schema.field(0).is_nullable()); + assert!(schema.field(1).is_nullable()); + assert!(schema.field(2).is_nullable()); + assert!(schema.field(3).is_nullable()); + assert!(schema.field(4).is_nullable()); + assert!(schema.field(5).is_nullable()); + + assert!(!batch.column(1).is_null(0)); + assert!(!batch.column(1).is_null(1)); + assert!(batch.column(1).is_null(2)); + assert!(!batch.column(1).is_null(3)); + assert!(!batch.column(1).is_null(4)); + } + + #[test] + fn test_parse_invalid_csv() { + let file = File::open("test/data/various_types_invalid.csv").unwrap(); + + let schema = Schema::new(vec![ + Field::new("c_int", DataType::UInt64, false), + Field::new("c_float", DataType::Float32, false), + Field::new("c_string", DataType::Utf8, false), + Field::new("c_bool", DataType::Boolean, false), + ]); + + let builder = ReaderBuilder::new() + .with_schema(Arc::new(schema)) + .has_header(true) + .with_delimiter(b'|') + .with_batch_size(512) + .with_projection(vec![0, 1, 2, 3]); + + let mut csv = builder.build(file).unwrap(); + match csv.next() { + Some(e) => match e { + Err(e) => assert_eq!( + "ParseError(\"Error while parsing value 4.x4 for column 1 at line 4\")", + format!("{:?}", e) + ), + Ok(_) => panic!("should have failed"), + }, + None => panic!("should have failed"), + } + } + #[test] fn test_infer_field_schema() { assert_eq!(infer_field_schema("A", None), DataType::Utf8); diff --git a/arrow/test/data/decimal_test.csv b/arrow-csv/test/data/decimal_test.csv similarity index 100% rename from arrow/test/data/decimal_test.csv rename to arrow-csv/test/data/decimal_test.csv diff --git a/arrow/test/data/null_test.csv b/arrow-csv/test/data/null_test.csv similarity index 100% rename from arrow/test/data/null_test.csv rename to arrow-csv/test/data/null_test.csv diff --git a/arrow/test/data/uk_cities.csv b/arrow-csv/test/data/uk_cities.csv similarity index 100% rename from arrow/test/data/uk_cities.csv rename to arrow-csv/test/data/uk_cities.csv diff --git a/arrow/test/data/uk_cities_with_headers.csv b/arrow-csv/test/data/uk_cities_with_headers.csv similarity index 100% rename from arrow/test/data/uk_cities_with_headers.csv rename to arrow-csv/test/data/uk_cities_with_headers.csv diff --git a/arrow/test/data/various_types.csv b/arrow-csv/test/data/various_types.csv similarity index 100% rename from arrow/test/data/various_types.csv rename to arrow-csv/test/data/various_types.csv diff --git a/arrow/test/data/various_types_invalid.csv b/arrow-csv/test/data/various_types_invalid.csv similarity index 100% rename from arrow/test/data/various_types_invalid.csv rename to arrow-csv/test/data/various_types_invalid.csv diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 2e33014dbdea..452cc4bbd2a6 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -267,4 +267,4 @@ required-features = ["test_utils", "ipc"] [[test]] name = "csv" -required-features = ["csv"] +required-features = ["csv", "chrono-tz"] diff --git a/arrow/examples/read_csv.rs b/arrow/examples/read_csv.rs index a1a592134eba..efb55c6d2876 100644 --- a/arrow/examples/read_csv.rs +++ b/arrow/examples/read_csv.rs @@ -31,7 +31,10 @@ fn main() { Field::new("lng", DataType::Float64, false), ]); - let path = format!("{}/test/data/uk_cities.csv", env!("CARGO_MANIFEST_DIR")); + let path = format!( + "{}/../arrow-csv/test/data/uk_cities.csv", + env!("CARGO_MANIFEST_DIR") + ); let file = File::open(path).unwrap(); let mut csv = diff --git a/arrow/examples/read_csv_infer_schema.rs b/arrow/examples/read_csv_infer_schema.rs index 120a7b81910b..2a713ba6109c 100644 --- a/arrow/examples/read_csv_infer_schema.rs +++ b/arrow/examples/read_csv_infer_schema.rs @@ -23,7 +23,7 @@ use std::fs::File; fn main() { let path = format!( - "{}/test/data/uk_cities_with_headers.csv", + "{}/../arrow-csv/test/data/uk_cities_with_headers.csv", env!("CARGO_MANIFEST_DIR") ); let file = File::open(path).unwrap(); diff --git a/arrow/tests/csv.rs b/arrow/tests/csv.rs index 11e1b30e1488..83a279ce4794 100644 --- a/arrow/tests/csv.rs +++ b/arrow/tests/csv.rs @@ -15,16 +15,12 @@ // specific language governing permissions and limitations // under the License. -use std::fs::File; -use std::io::{Cursor, Read}; use std::sync::Arc; use arrow_array::*; -use arrow_csv::{Reader, ReaderBuilder}; use arrow_schema::*; #[test] -#[cfg(feature = "chrono-tz")] fn test_export_csv_timestamps() { let schema = Schema::new(vec![ Field::new( @@ -66,421 +62,3 @@ fn test_export_csv_timestamps() { let right = String::from_utf8(sw).unwrap(); assert_eq!(left, right); } - -#[test] -fn test_csv() { - let _: Vec<()> = vec![None, Some("%Y-%m-%dT%H:%M:%S%.f%:z".to_string())] - .into_iter() - .map(|format| { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - let mut csv = Reader::new( - file, - Arc::new(schema.clone()), - false, - None, - 1024, - None, - None, - format, - ); - assert_eq!(Arc::new(schema), csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - // access data from a primitive array - let lat = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(57.653484, lat.value(0)); - - // access data from a string array (ListArray) - let city = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); - }) - .collect(); -} - -#[test] -fn test_csv_schema_metadata() { - let mut metadata = std::collections::HashMap::new(); - metadata.insert("foo".to_owned(), "bar".to_owned()); - let schema = Schema::new_with_metadata( - vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ], - metadata.clone(), - ); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - - let mut csv = Reader::new( - file, - Arc::new(schema.clone()), - false, - None, - 1024, - None, - None, - None, - ); - assert_eq!(Arc::new(schema), csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - assert_eq!(&metadata, batch.schema().metadata()); -} - -#[test] -fn test_csv_reader_with_decimal() { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Decimal128(38, 6), false), - Field::new("lng", DataType::Decimal128(38, 6), false), - ]); - - let file = File::open("test/data/decimal_test.csv").unwrap(); - - let mut csv = - Reader::new(file, Arc::new(schema), false, None, 1024, None, None, None); - let batch = csv.next().unwrap().unwrap(); - // access data from a primitive array - let lat = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!("57.653484", lat.value_as_string(0)); - assert_eq!("53.002666", lat.value_as_string(1)); - assert_eq!("52.412811", lat.value_as_string(2)); - assert_eq!("51.481583", lat.value_as_string(3)); - assert_eq!("12.123456", lat.value_as_string(4)); - assert_eq!("50.760000", lat.value_as_string(5)); - assert_eq!("0.123000", lat.value_as_string(6)); - assert_eq!("123.000000", lat.value_as_string(7)); - assert_eq!("123.000000", lat.value_as_string(8)); - assert_eq!("-50.760000", lat.value_as_string(9)); -} - -#[test] -fn test_csv_from_buf_reader() { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file_with_headers = File::open("test/data/uk_cities_with_headers.csv").unwrap(); - let file_without_headers = File::open("test/data/uk_cities.csv").unwrap(); - let both_files = file_with_headers - .chain(Cursor::new("\n".to_string())) - .chain(file_without_headers); - let mut csv = Reader::from_reader( - both_files, - Arc::new(schema), - true, - None, - 1024, - None, - None, - None, - ); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(74, batch.num_rows()); - assert_eq!(3, batch.num_columns()); -} - -#[test] -fn test_csv_with_schema_inference() { - let file = File::open("test/data/uk_cities_with_headers.csv").unwrap(); - - let builder = ReaderBuilder::new().has_header(true).infer_schema(None); - - let mut csv = builder.build(file).unwrap(); - let expected_schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, true), - Field::new("lat", DataType::Float64, true), - Field::new("lng", DataType::Float64, true), - ]); - assert_eq!(Arc::new(expected_schema), csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - // access data from a primitive array - let lat = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(57.653484, lat.value(0)); - - // access data from a string array (ListArray) - let city = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); -} - -#[test] -fn test_csv_with_schema_inference_no_headers() { - let file = File::open("test/data/uk_cities.csv").unwrap(); - - let builder = ReaderBuilder::new().infer_schema(None); - - let mut csv = builder.build(file).unwrap(); - - // csv field names should be 'column_{number}' - let schema = csv.schema(); - assert_eq!("column_1", schema.field(0).name()); - assert_eq!("column_2", schema.field(1).name()); - assert_eq!("column_3", schema.field(2).name()); - let batch = csv.next().unwrap().unwrap(); - let batch_schema = batch.schema(); - - assert_eq!(schema, batch_schema); - assert_eq!(37, batch.num_rows()); - assert_eq!(3, batch.num_columns()); - - // access data from a primitive array - let lat = batch - .column(1) - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(57.653484, lat.value(0)); - - // access data from a string array (ListArray) - let city = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - assert_eq!("Aberdeen, Aberdeen City, UK", city.value(13)); -} - -#[test] -fn test_csv_builder_with_bounds() { - let file = File::open("test/data/uk_cities.csv").unwrap(); - - // Set the bounds to the lines 0, 1 and 2. - let mut csv = ReaderBuilder::new().with_bounds(0, 2).build(file).unwrap(); - let batch = csv.next().unwrap().unwrap(); - - // access data from a string array (ListArray) - let city = batch - .column(0) - .as_any() - .downcast_ref::() - .unwrap(); - - // The value on line 0 is within the bounds - assert_eq!("Elgin, Scotland, the UK", city.value(0)); - - // The value on line 13 is outside of the bounds. Therefore - // the call to .value() will panic. - let result = std::panic::catch_unwind(|| city.value(13)); - assert!(result.is_err()); -} - -#[test] -fn test_csv_with_projection() { - let schema = Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - - let mut csv = Reader::new( - file, - Arc::new(schema), - false, - None, - 1024, - None, - Some(vec![0, 1]), - None, - ); - let projected_schema = Arc::new(Schema::new(vec![ - Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Float64, false), - ])); - assert_eq!(projected_schema, csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(projected_schema, batch.schema()); - assert_eq!(37, batch.num_rows()); - assert_eq!(2, batch.num_columns()); -} - -#[test] -fn test_csv_with_dictionary() { - let schema = Schema::new(vec![ - Field::new( - "city", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - false, - ), - Field::new("lat", DataType::Float64, false), - Field::new("lng", DataType::Float64, false), - ]); - - let file = File::open("test/data/uk_cities.csv").unwrap(); - - let mut csv = Reader::new( - file, - Arc::new(schema), - false, - None, - 1024, - None, - Some(vec![0, 1]), - None, - ); - let projected_schema = Arc::new(Schema::new(vec![ - Field::new( - "city", - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)), - false, - ), - Field::new("lat", DataType::Float64, false), - ])); - assert_eq!(projected_schema, csv.schema()); - let batch = csv.next().unwrap().unwrap(); - assert_eq!(projected_schema, batch.schema()); - assert_eq!(37, batch.num_rows()); - assert_eq!(2, batch.num_columns()); - - let strings = arrow_cast::cast(batch.column(0), &DataType::Utf8).unwrap(); - let strings = strings.as_any().downcast_ref::().unwrap(); - - assert_eq!(strings.value(0), "Elgin, Scotland, the UK"); - assert_eq!(strings.value(4), "Eastbourne, East Sussex, UK"); - assert_eq!(strings.value(29), "Uckfield, East Sussex, UK"); -} - -#[test] -fn test_nulls() { - let schema = Schema::new(vec![ - Field::new("c_int", DataType::UInt64, false), - Field::new("c_float", DataType::Float32, true), - Field::new("c_string", DataType::Utf8, false), - ]); - - let file = File::open("test/data/null_test.csv").unwrap(); - - let mut csv = Reader::new(file, Arc::new(schema), true, None, 1024, None, None, None); - let batch = csv.next().unwrap().unwrap(); - - assert!(!batch.column(1).is_null(0)); - assert!(!batch.column(1).is_null(1)); - assert!(batch.column(1).is_null(2)); - assert!(!batch.column(1).is_null(3)); - assert!(!batch.column(1).is_null(4)); -} - -#[test] -fn test_nulls_with_inference() { - let file = File::open("test/data/various_types.csv").unwrap(); - - let builder = ReaderBuilder::new() - .infer_schema(None) - .has_header(true) - .with_delimiter(b'|') - .with_batch_size(512) - .with_projection(vec![0, 1, 2, 3, 4, 5]); - - let mut csv = builder.build(file).unwrap(); - let batch = csv.next().unwrap().unwrap(); - - assert_eq!(7, batch.num_rows()); - assert_eq!(6, batch.num_columns()); - - let schema = batch.schema(); - - assert_eq!(&DataType::Int64, schema.field(0).data_type()); - assert_eq!(&DataType::Float64, schema.field(1).data_type()); - assert_eq!(&DataType::Float64, schema.field(2).data_type()); - assert_eq!(&DataType::Boolean, schema.field(3).data_type()); - assert_eq!(&DataType::Date32, schema.field(4).data_type()); - assert_eq!(&DataType::Date64, schema.field(5).data_type()); - - let names: Vec<&str> = schema.fields().iter().map(|x| x.name().as_str()).collect(); - assert_eq!( - names, - vec![ - "c_int", - "c_float", - "c_string", - "c_bool", - "c_date", - "c_datetime" - ] - ); - - assert!(schema.field(0).is_nullable()); - assert!(schema.field(1).is_nullable()); - assert!(schema.field(2).is_nullable()); - assert!(schema.field(3).is_nullable()); - assert!(schema.field(4).is_nullable()); - assert!(schema.field(5).is_nullable()); - - assert!(!batch.column(1).is_null(0)); - assert!(!batch.column(1).is_null(1)); - assert!(batch.column(1).is_null(2)); - assert!(!batch.column(1).is_null(3)); - assert!(!batch.column(1).is_null(4)); -} - -#[test] -fn test_parse_invalid_csv() { - let file = File::open("test/data/various_types_invalid.csv").unwrap(); - - let schema = Schema::new(vec![ - Field::new("c_int", DataType::UInt64, false), - Field::new("c_float", DataType::Float32, false), - Field::new("c_string", DataType::Utf8, false), - Field::new("c_bool", DataType::Boolean, false), - ]); - - let builder = ReaderBuilder::new() - .with_schema(Arc::new(schema)) - .has_header(true) - .with_delimiter(b'|') - .with_batch_size(512) - .with_projection(vec![0, 1, 2, 3]); - - let mut csv = builder.build(file).unwrap(); - match csv.next() { - Some(e) => match e { - Err(e) => assert_eq!( - "ParseError(\"Error while parsing value 4.x4 for column 1 at line 4\")", - format!("{:?}", e) - ), - Ok(_) => panic!("should have failed"), - }, - None => panic!("should have failed"), - } -} diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 0ca2ab91a5e8..fad1a5a7d1dd 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -3,6 +3,7 @@ testing/* target/* dev/release/rat_exclude_files.txt arrow/test/data/* +arrow-csv/test/data/* arrow-json/test/data/* arrow/test/dependency/* arrow-integration-test/data/*