Skip to content

Commit

Permalink
fix: excel data type handling (#3051)
Browse files Browse the repository at this point in the history
update test .xlsx file
add excel tests to deal with columns with multiple data types

Closes #3052

---------

Co-authored-by: sam kleinman <[email protected]>
  • Loading branch information
talagluck and tychoish authored Jun 28, 2024
1 parent 2f9fff4 commit cd4127e
Show file tree
Hide file tree
Showing 4 changed files with 76 additions and 32 deletions.
2 changes: 2 additions & 0 deletions crates/datasources/src/excel/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ use crate::object_store::errors::ObjectStoreSourceError;
pub enum ExcelError {
#[error("Failed to load XLSX: {0}")]
Load(String),
#[error("Cannot parse cell value")]
Parse,
#[error("Failed to create record batch: {0}")]
CreateRecordBatch(#[from] ArrowError),
#[error(transparent)]
Expand Down
82 changes: 57 additions & 25 deletions crates/datasources/src/excel/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,14 @@ use std::io::Cursor;
use std::sync::Arc;

use calamine::{DataType as CalamineDataType, Range, Reader, Sheets};
use datafusion::arrow::array::{ArrayRef, BooleanArray, Date64Array, PrimitiveArray, StringArray};
use datafusion::arrow::array::{
ArrayRef,
BooleanArray,
Date64Array,
NullArray,
PrimitiveArray,
StringArray,
};
use datafusion::arrow::datatypes::{DataType, Field, Float64Type, Int64Type, Schema};
use datafusion::arrow::record_batch::RecordBatch;
use object_store::{ObjectMeta, ObjectStore};
Expand Down Expand Up @@ -121,11 +128,11 @@ fn xlsx_sheet_value_to_record_batch(
.collect::<BooleanArray>(),
) as ArrayRef,
DataType::Int64 => Arc::new(
rows.map(|r| r.get(i).and_then(|v| v.get_int()))
rows.map(|r| r.get(i).and_then(|v| v.as_i64()))
.collect::<PrimitiveArray<Int64Type>>(),
) as ArrayRef,
DataType::Float64 => Arc::new(
rows.map(|r| r.get(i).and_then(|v| v.get_float()))
rows.map(|r| r.get(i).and_then(|v| v.as_f64()))
.collect::<PrimitiveArray<Float64Type>>(),
) as ArrayRef,
DataType::Date64 => {
Expand All @@ -140,8 +147,9 @@ fn xlsx_sheet_value_to_record_batch(
}
Arc::new(arr.finish())
}
DataType::Null => Arc::new(NullArray::new(rows.len())),
_ => Arc::new(
rows.map(|r| r.get(i).map(|v| v.get_string().unwrap_or("null")))
rows.map(|r| r.get(i).map(|v| v.as_string().unwrap_or_default()))
.collect::<StringArray>(),
) as ArrayRef,
}
Expand All @@ -158,27 +166,30 @@ pub fn infer_schema(
) -> Result<Schema, ExcelError> {
let mut col_types: HashMap<&str, HashSet<DataType>> = HashMap::new();
let mut rows = r.rows();
let col_names: Vec<String> = rows
.next()
.unwrap()
.iter()
.enumerate()
.map(
|(i, c)| match (has_header, c.get_string().map(|s| s.to_string())) {
(true, Some(s)) => Ok(s),
(true, None) => Err(ExcelError::Load("failed to parse header".to_string())),
(false, _) => Ok(format!("col{}", i)),
},
)
.collect::<Result<_, _>>()?;
let col_names: Vec<String> = if has_header {
rows.next()
.unwrap()
.iter()
.enumerate()
.map(
|(i, c)| match (has_header, c.get_string().map(|s| s.to_string())) {
(true, Some(s)) => Ok(s),
(true, None) => Err(ExcelError::Load("failed to parse header".to_string())),
(false, _) => Ok(format!("col{}", i)),
},
)
.collect::<Result<_, _>>()?
} else {
(0..r.rows().next().unwrap_or_default().len())
.map(|n| format!("{}", n))
.collect()
};


for row in rows.take(infer_schema_length) {
for (i, col_val) in row.iter().enumerate() {
let col_name = col_names.get(i).unwrap();
if let Ok(col_type) = infer_value_type(col_val) {
if col_type == DataType::Null {
continue;
}
let entry = col_types.entry(col_name).or_default();
entry.insert(col_type);
} else {
Expand All @@ -196,8 +207,27 @@ pub fn infer_schema(
set
});

let dt = set.iter().next().cloned().unwrap_or(DataType::Utf8);
Field::new(col_name.replace(' ', "_"), dt, true)
let field_name = col_name.replace(' ', "_");

if set.len() == 1 {
Field::new(
field_name,
set.iter().next().cloned().unwrap_or(DataType::Utf8),
true,
)
} else if set.contains(&DataType::Utf8) {
Field::new(field_name, DataType::Utf8, true)
} else if set.contains(&DataType::Float64) {
Field::new(field_name, DataType::Float64, true)
} else if set.contains(&DataType::Int64) {
Field::new(field_name, DataType::Int64, true)
} else if set.contains(&DataType::Boolean) {
Field::new(field_name, DataType::Boolean, true)
} else if set.contains(&DataType::Null) {
Field::new(field_name, DataType::Null, true)
} else {
Field::new(field_name, DataType::Utf8, true)
}
})
.collect();

Expand All @@ -210,11 +240,13 @@ fn infer_value_type(v: &calamine::Data) -> Result<DataType, ExcelError> {
calamine::Data::Float(_) => Ok(DataType::Float64),
calamine::Data::Bool(_) => Ok(DataType::Boolean),
calamine::Data::String(_) => Ok(DataType::Utf8),
// TODO: parsing value errors that we get from the calamine
// library, could either become nulls or could be
// errors. right now they are errors, and this should probably
// be configurable, however...
calamine::Data::Error(e) => Err(ExcelError::Load(e.to_string())),
calamine::Data::DateTime(_) => Ok(DataType::Date64),
calamine::Data::Empty => Ok(DataType::Null),
_ => Err(ExcelError::Load(
"Failed to parse the cell value".to_owned(),
)),
_ => Err(ExcelError::Parse),
}
}
24 changes: 17 additions & 7 deletions testdata/sqllogictests/xlsx.slt
Original file line number Diff line number Diff line change
Expand Up @@ -118,10 +118,10 @@ create external table bad_report from excel options(location='./invalid_path/ran

# should default to has_header=true and the first sheet of the file, if not specified
statement ok
create external table html_table from excel options(location='https://github.com/GlareDB/glaredb/raw/main/testdata/xlsx/multiple_sheets.xlsx');
create external table http_table from excel options(location='https://github.com/GlareDB/glaredb/raw/main/testdata/xlsx/multiple_sheets.xlsx');

query
select "Resources", "Cost", "Revenue" from html_table;
select "Resources", "Cost", "Revenue" from http_table;
----
1 10 100
2 20 200
Expand All @@ -131,18 +131,28 @@ select "Resources", "Cost", "Revenue" from html_table;

# should default to has_header=true and the first sheet of the file, if not specified
query
select "Resources", "Cost", "Revenue" from read_excel('https://github.com/GlareDB/glaredb/raw/main/testdata/xlsx/multiple_sheets.xlsx');
select "Resources", "Cost", "Revenue" from read_excel('./testdata/xlsx/multiple_sheets.xlsx');
----
1 10 100
2 20 200
3 30 300
4 40 400
5 50 500

query
select * from read_excel('https://github.com/GlareDB/glaredb/raw/main/testdata/xlsx/multiple_sheets.xlsx', sheet_name => 'other', has_header => false);
query T
select * from read_excel('./testdata/xlsx/multiple_sheets.xlsx', sheet_name => 'other', has_header => false);
----
NULL
HEADING
1
2
3
3

query
select * from read_excel('./testdata/xlsx/multiple_sheets.xlsx', sheet_name => 'multiple_data_types', has_header => true);
----
1 1 foo foo
2 2 bar bar
3 3 baz baz
foo 4.0 4 4.0
5 5.0 5 5.0
bar (empty) (empty) (empty)
Binary file modified testdata/xlsx/multiple_sheets.xlsx
Binary file not shown.

0 comments on commit cd4127e

Please sign in to comment.