diff --git a/crates/datasources/src/excel/errors.rs b/crates/datasources/src/excel/errors.rs index 306bfb7b8..e370ee03f 100644 --- a/crates/datasources/src/excel/errors.rs +++ b/crates/datasources/src/excel/errors.rs @@ -11,6 +11,8 @@ use crate::object_store::errors::ObjectStoreSourceError; pub enum ExcelError { #[error("Failed to load XLSX: {0}")] Load(String), + #[error("Cannot parse cell value")] + Parse, #[error("Failed to create record batch: {0}")] CreateRecordBatch(#[from] ArrowError), #[error(transparent)] diff --git a/crates/datasources/src/excel/mod.rs b/crates/datasources/src/excel/mod.rs index 43fb969a3..7e8e3390e 100644 --- a/crates/datasources/src/excel/mod.rs +++ b/crates/datasources/src/excel/mod.rs @@ -3,7 +3,14 @@ use std::io::Cursor; use std::sync::Arc; use calamine::{DataType as CalamineDataType, Range, Reader, Sheets}; -use datafusion::arrow::array::{ArrayRef, BooleanArray, Date64Array, PrimitiveArray, StringArray}; +use datafusion::arrow::array::{ + ArrayRef, + BooleanArray, + Date64Array, + NullArray, + PrimitiveArray, + StringArray, +}; use datafusion::arrow::datatypes::{DataType, Field, Float64Type, Int64Type, Schema}; use datafusion::arrow::record_batch::RecordBatch; use object_store::{ObjectMeta, ObjectStore}; @@ -121,11 +128,11 @@ fn xlsx_sheet_value_to_record_batch( .collect::(), ) as ArrayRef, DataType::Int64 => Arc::new( - rows.map(|r| r.get(i).and_then(|v| v.get_int())) + rows.map(|r| r.get(i).and_then(|v| v.as_i64())) .collect::>(), ) as ArrayRef, DataType::Float64 => Arc::new( - rows.map(|r| r.get(i).and_then(|v| v.get_float())) + rows.map(|r| r.get(i).and_then(|v| v.as_f64())) .collect::>(), ) as ArrayRef, DataType::Date64 => { @@ -140,8 +147,9 @@ fn xlsx_sheet_value_to_record_batch( } Arc::new(arr.finish()) } + DataType::Null => Arc::new(NullArray::new(rows.len())), _ => Arc::new( - rows.map(|r| r.get(i).map(|v| v.get_string().unwrap_or("null"))) + rows.map(|r| r.get(i).map(|v| v.as_string().unwrap_or_default())) .collect::(), ) as ArrayRef, } @@ -158,27 +166,30 @@ pub fn infer_schema( ) -> Result { let mut col_types: HashMap<&str, HashSet> = HashMap::new(); let mut rows = r.rows(); - let col_names: Vec = rows - .next() - .unwrap() - .iter() - .enumerate() - .map( - |(i, c)| match (has_header, c.get_string().map(|s| s.to_string())) { - (true, Some(s)) => Ok(s), - (true, None) => Err(ExcelError::Load("failed to parse header".to_string())), - (false, _) => Ok(format!("col{}", i)), - }, - ) - .collect::>()?; + let col_names: Vec = if has_header { + rows.next() + .unwrap() + .iter() + .enumerate() + .map( + |(i, c)| match (has_header, c.get_string().map(|s| s.to_string())) { + (true, Some(s)) => Ok(s), + (true, None) => Err(ExcelError::Load("failed to parse header".to_string())), + (false, _) => Ok(format!("col{}", i)), + }, + ) + .collect::>()? + } else { + (0..r.rows().next().unwrap_or_default().len()) + .map(|n| format!("{}", n)) + .collect() + }; + for row in rows.take(infer_schema_length) { for (i, col_val) in row.iter().enumerate() { let col_name = col_names.get(i).unwrap(); if let Ok(col_type) = infer_value_type(col_val) { - if col_type == DataType::Null { - continue; - } let entry = col_types.entry(col_name).or_default(); entry.insert(col_type); } else { @@ -196,8 +207,27 @@ pub fn infer_schema( set }); - let dt = set.iter().next().cloned().unwrap_or(DataType::Utf8); - Field::new(col_name.replace(' ', "_"), dt, true) + let field_name = col_name.replace(' ', "_"); + + if set.len() == 1 { + Field::new( + field_name, + set.iter().next().cloned().unwrap_or(DataType::Utf8), + true, + ) + } else if set.contains(&DataType::Utf8) { + Field::new(field_name, DataType::Utf8, true) + } else if set.contains(&DataType::Float64) { + Field::new(field_name, DataType::Float64, true) + } else if set.contains(&DataType::Int64) { + Field::new(field_name, DataType::Int64, true) + } else if set.contains(&DataType::Boolean) { + Field::new(field_name, DataType::Boolean, true) + } else if set.contains(&DataType::Null) { + Field::new(field_name, DataType::Null, true) + } else { + Field::new(field_name, DataType::Utf8, true) + } }) .collect(); @@ -210,11 +240,13 @@ fn infer_value_type(v: &calamine::Data) -> Result { calamine::Data::Float(_) => Ok(DataType::Float64), calamine::Data::Bool(_) => Ok(DataType::Boolean), calamine::Data::String(_) => Ok(DataType::Utf8), + // TODO: parsing value errors that we get from the calamine + // library, could either become nulls or could be + // errors. right now they are errors, and this should probably + // be configurable, however... calamine::Data::Error(e) => Err(ExcelError::Load(e.to_string())), calamine::Data::DateTime(_) => Ok(DataType::Date64), calamine::Data::Empty => Ok(DataType::Null), - _ => Err(ExcelError::Load( - "Failed to parse the cell value".to_owned(), - )), + _ => Err(ExcelError::Parse), } } diff --git a/testdata/sqllogictests/xlsx.slt b/testdata/sqllogictests/xlsx.slt index 62a12482a..adc3839f2 100644 --- a/testdata/sqllogictests/xlsx.slt +++ b/testdata/sqllogictests/xlsx.slt @@ -118,10 +118,10 @@ create external table bad_report from excel options(location='./invalid_path/ran # should default to has_header=true and the first sheet of the file, if not specified statement ok -create external table html_table from excel options(location='https://github.com/GlareDB/glaredb/raw/main/testdata/xlsx/multiple_sheets.xlsx'); +create external table http_table from excel options(location='https://github.com/GlareDB/glaredb/raw/main/testdata/xlsx/multiple_sheets.xlsx'); query -select "Resources", "Cost", "Revenue" from html_table; +select "Resources", "Cost", "Revenue" from http_table; ---- 1 10 100 2 20 200 @@ -131,7 +131,7 @@ select "Resources", "Cost", "Revenue" from html_table; # should default to has_header=true and the first sheet of the file, if not specified query -select "Resources", "Cost", "Revenue" from read_excel('https://github.com/GlareDB/glaredb/raw/main/testdata/xlsx/multiple_sheets.xlsx'); +select "Resources", "Cost", "Revenue" from read_excel('./testdata/xlsx/multiple_sheets.xlsx'); ---- 1 10 100 2 20 200 @@ -139,10 +139,20 @@ select "Resources", "Cost", "Revenue" from read_excel('https://github.com/GlareD 4 40 400 5 50 500 -query -select * from read_excel('https://github.com/GlareDB/glaredb/raw/main/testdata/xlsx/multiple_sheets.xlsx', sheet_name => 'other', has_header => false); +query T +select * from read_excel('./testdata/xlsx/multiple_sheets.xlsx', sheet_name => 'other', has_header => false); ---- -NULL +HEADING 1 2 -3 \ No newline at end of file +3 + +query +select * from read_excel('./testdata/xlsx/multiple_sheets.xlsx', sheet_name => 'multiple_data_types', has_header => true); +---- +1 1 foo foo +2 2 bar bar +3 3 baz baz +foo 4.0 4 4.0 +5 5.0 5 5.0 +bar (empty) (empty) (empty) diff --git a/testdata/xlsx/multiple_sheets.xlsx b/testdata/xlsx/multiple_sheets.xlsx index 6031f1c3a..eafe2e72d 100644 Binary files a/testdata/xlsx/multiple_sheets.xlsx and b/testdata/xlsx/multiple_sheets.xlsx differ