From 4efcc547f9db2801b28e67eef1838cb84354cf49 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 5 Aug 2023 14:02:20 +0800 Subject: [PATCH 1/4] first draft Signed-off-by: jayzhan211 --- .../tests/sqllogictests/test_files/array.slt | 24 +++++- .../physical-expr/src/array_expressions.rs | 74 ++++++++++++++----- 2 files changed, 77 insertions(+), 21 deletions(-) diff --git a/datafusion/core/tests/sqllogictests/test_files/array.slt b/datafusion/core/tests/sqllogictests/test_files/array.slt index 4088414296f9..8c786e0b1cba 100644 --- a/datafusion/core/tests/sqllogictests/test_files/array.slt +++ b/datafusion/core/tests/sqllogictests/test_files/array.slt @@ -19,10 +19,8 @@ ## Array Expressions Tests ############# - ### Tables - statement ok CREATE TABLE values( a INT, @@ -830,6 +828,28 @@ select array_slice(make_array(1, 2, 3, 4, 5), column2, column3), array_slice(col [1, 2, 3, 4, 5] [43, 44, 45, 46] [41, 42, 43, 44, 45] [5] [, 54, 55, 56, 57, 58, 59, 60] [55] +# make_array with nulls +query ???? +select make_array(make_array('a','b'), null), + make_array(make_array('a','b'), null, make_array('c','d')), + make_array(null, make_array('a','b'), null), + make_array(null, make_array('a','b'), null, null, make_array('c','d')); +---- +[[a, b], ] [[a, b], , [c, d]] [, [a, b], ] [, [a, b], , , [c, d]] + +query ? +select make_array(column5, null, column5) from arrays_values_without_nulls; +---- +[[2, 3], , [2, 3]] +[[4, 5], , [4, 5]] +[[6, 7], , [6, 7]] +[[8, 9], , [8, 9]] + +query ? +select make_array(['a','b'], null); +---- +[[a, b], ] + ## array_append (aliases: `list_append`, `array_push_back`, `list_push_back`) # array_append scalar function #1 diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index 74eca0b8cd74..61eb51f4c754 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -212,6 +212,12 @@ fn compute_array_dims(arr: Option) -> Result>>> } } +#[derive(Debug)] +enum ListOrNull<'a> { + List(&'a dyn Array), + Null, +} + /// Convert one or more [`ArrayRef`] of the same type into a /// `ListArray` /// @@ -238,18 +244,18 @@ fn compute_array_dims(arr: Option) -> Result>>> /// /// Calling `array(col1, col2)` where col1 and col2 are lists /// would return a single new `ListArray`, where each row was a list -/// of the corresponding elements of col1 and col2 flattened. +/// of the corresponding elements of col1 and col2. /// /// ``` text -/// ┌──────────────┐ ┌──────────────┐ ┌────────────────────────┐ -/// │ ┌──────────┐ │ │ ┌──────────┐ │ │ ┌────────────────────┐ │ -/// │ │ [A, X] │ │ │ │ [] │ │ │ │ [A, X] │ │ -/// │ ├──────────┤ │ │ ├──────────┤ │ │ ├────────────────────┤ │ -/// │ │[NULL, Y] │ │ │ │[Q, R, S] │ │───────▶│ │ [NULL, Y, Q, R, S] │ │ -/// │ ├──────────┤ │ │ ├──────────┤ │ │ ├────────────────────┤ │ -/// │ │ [C, Z] │ │ │ │ NULL │ │ │ │ [C, Z, NULL] │ │ -/// │ └──────────┘ │ │ └──────────┘ │ │ └────────────────────┘ │ -/// └──────────────┘ └──────────────┘ └────────────────────────┘ +/// ┌──────────────┐ ┌──────────────┐ ┌─────────────────────────────┐ +/// │ ┌──────────┐ │ │ ┌──────────┐ │ │ ┌────────────────────────┐ │ +/// │ │ [A, X] │ │ │ │ [] │ │ │ │ [[A, X], []] │ │ +/// │ ├──────────┤ │ │ ├──────────┤ │ │ ├────────────────────────┤ │ +/// │ │[NULL, Y] │ │ │ │[Q, R, S] │ │───────▶│ │ [[NULL, Y], [Q, R, S]] │ │ +/// │ ├──────────┤ │ │ ├──────────┤ │ │ ├────────────────────────│ │ +/// │ │ [C, Z] │ │ │ │ NULL │ │ │ │ [[C, Z], NULL] │ │ +/// │ └──────────┘ │ │ └──────────┘ │ │ └────────────────────────┘ │ +/// └──────────────┘ └──────────────┘ └─────────────────────────────┘ /// col1 col2 output /// ``` fn array_array(args: &[ArrayRef], data_type: DataType) -> Result { @@ -260,23 +266,53 @@ fn array_array(args: &[ArrayRef], data_type: DataType) -> Result { let res = match data_type { DataType::List(..) => { - let arrays = - downcast_vec!(args, ListArray).collect::>>()?; - - // Assume number of rows is the same for all arrays - let row_count = arrays[0].len(); + let mut arrays = vec![]; + let mut row_count = 0; + + for arg in args { + let list_arr = arg.as_list_opt::(); + if let Some(list_arr) = list_arr { + // Assume number of rows is the same for all arrays + row_count = list_arr.len(); + arrays.push(ListOrNull::List(list_arr)); + } else if arg.as_any().downcast_ref::().is_some() { + arrays.push(ListOrNull::Null); + } else { + return Err(DataFusionError::Internal( + "Unsupported argument type for array".to_string(), + )); + } + } - let capacity = Capacities::Array(arrays.iter().map(|a| a.len()).sum()); - let array_data = arrays.iter().map(|a| a.to_data()).collect::>(); + let mut total_capacity = 0; + let mut array_data = vec![]; + for arr in arrays.iter() { + if let ListOrNull::List(arr) = arr { + total_capacity += arr.len(); + array_data.push(arr.to_data()); + } + } + let capacity = Capacities::Array(total_capacity); let array_data = array_data.iter().collect(); + let mut mutable = MutableArrayData::with_capacities(array_data, true, capacity); for i in 0..row_count { - for (j, _) in arrays.iter().enumerate() { - mutable.extend(j, i, i + 1); + let mut nulls = 0; + for (j, arr) in arrays.iter().enumerate() { + match arr { + ListOrNull::List(_) => { + mutable.extend(j - nulls, i, i + 1); + } + ListOrNull::Null => { + mutable.extend_nulls(1); + nulls += 1; + } + } } } + let list_data_type = DataType::List(Arc::new(Field::new("item", data_type, true))); From 1d960edbaedd8b61882055f6ce08001a1d52191c Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 5 Aug 2023 15:02:08 +0800 Subject: [PATCH 2/4] fix nulls Signed-off-by: jayzhan211 --- .../tests/sqllogictests/test_files/array.slt | 20 ++++++++++-------- .../physical-expr/src/array_expressions.rs | 21 +++++++++++++++---- 2 files changed, 28 insertions(+), 13 deletions(-) diff --git a/datafusion/core/tests/sqllogictests/test_files/array.slt b/datafusion/core/tests/sqllogictests/test_files/array.slt index 8c786e0b1cba..992886387e4b 100644 --- a/datafusion/core/tests/sqllogictests/test_files/array.slt +++ b/datafusion/core/tests/sqllogictests/test_files/array.slt @@ -513,7 +513,7 @@ select make_array(1, 2, NULL), make_array(make_array(NULL, 2), make_array(NULL, query ??? select make_array(NULL), make_array(NULL, NULL, NULL), make_array(make_array(NULL, NULL), make_array(NULL, NULL)); ---- -[] [] [[], []] +[] [, , ] [[, ], [, ]] # make_array with 1 columns query ??? @@ -829,13 +829,15 @@ select array_slice(make_array(1, 2, 3, 4, 5), column2, column3), array_slice(col [5] [, 54, 55, 56, 57, 58, 59, 60] [55] # make_array with nulls -query ???? +query ?????? select make_array(make_array('a','b'), null), make_array(make_array('a','b'), null, make_array('c','d')), make_array(null, make_array('a','b'), null), - make_array(null, make_array('a','b'), null, null, make_array('c','d')); + make_array(null, make_array('a','b'), null, null, make_array('c','d')), + make_array(['a', 'bc', 'def'], null, make_array('rust')), + make_array([1,2,3], null, make_array(4,5,6,7)); ---- -[[a, b], ] [[a, b], , [c, d]] [, [a, b], ] [, [a, b], , , [c, d]] +[[a, b], ] [[a, b], , [c, d]] [, [a, b], ] [, [a, b], , , [c, d]] [[a, bc, def], , [rust]] [[1, 2, 3], , [4, 5, 6, 7]] query ? select make_array(column5, null, column5) from arrays_values_without_nulls; @@ -1698,7 +1700,7 @@ select cardinality(make_array([1, 2], [3, 4], [5, 6])), cardinality(array_fill(3 query II select cardinality(make_array()), cardinality(make_array(make_array())) ---- -NULL 0 +1 1 # cardinality with columns query III @@ -1912,7 +1914,7 @@ select array_length(array_fill(3, [3, 2, 5]), 1), array_length(array_fill(3, [3, query III select array_length(make_array()), array_length(make_array(), 1), array_length(make_array(), 2) ---- -0 0 NULL +1 1 NULL # list_length scalar function #6 (function alias `array_length`) query III @@ -1964,7 +1966,7 @@ select array_dims(array_fill(2, [1, 2, 3])), array_dims(array_fill(3, [2, 5, 4]) query ?? select array_dims(make_array()), array_dims(make_array(make_array())) ---- -NULL [1, 0] +[1] [1, 1] # list_dims scalar function #4 (function alias `array_dims`) query ??? @@ -2002,7 +2004,7 @@ select array_ndims(array_fill(1, [1, 2, 3])), array_ndims([[[[[[[[[[[[[[[[[[[[[1 query II select array_ndims(make_array()), array_ndims(make_array(make_array())) ---- -NULL 2 +1 2 # list_ndims scalar function #4 (function alias `array_ndims`) query III @@ -2013,7 +2015,7 @@ select list_ndims(make_array(1, 2, 3)), list_ndims(make_array([1, 2], [3, 4])), query II select array_ndims(make_array()), array_ndims(make_array(make_array())) ---- -NULL 2 +1 2 # array_ndims with columns query III diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index 61eb51f4c754..52cf4d68e744 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -363,21 +363,34 @@ fn array(values: &[ColumnarValue]) -> Result { }) .collect(); - let mut data_type = DataType::Null; + let mut data_type = None; for arg in &arrays { let arg_data_type = arg.data_type(); if !arg_data_type.equals_datatype(&DataType::Null) { - data_type = arg_data_type.clone(); + data_type = Some(arg_data_type.clone()); break; + } else { + data_type = Some(DataType::Null); } } match data_type { - DataType::Null => Ok(ColumnarValue::Scalar(ScalarValue::new_list( + None => Ok(ColumnarValue::Scalar(ScalarValue::new_list( Some(vec![]), DataType::Null, ))), - _ => Ok(ColumnarValue::Array(array_array( + Some(DataType::Null) => { + let nulls = arrays.len(); + let null_arr = NullArray::new(nulls); + let field = Arc::new(Field::new("item", DataType::Null, true)); + let offsets = OffsetBuffer::from_lengths([nulls]); + let values = Arc::new(null_arr) as ArrayRef; + let nulls = None; + Ok(ColumnarValue::Array(Arc::new(ListArray::new( + field, offsets, values, nulls, + )))) + } + Some(data_type) => Ok(ColumnarValue::Array(array_array( arrays.as_slice(), data_type, )?)), From e9c07b4ed90a1b09eb3a32896bd979e73a0c557b Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sat, 5 Aug 2023 21:12:10 +0800 Subject: [PATCH 3/4] move rust test to sql logic test Signed-off-by: jayzhan211 --- .../tests/sqllogictests/test_files/array.slt | 31 ++++------- .../physical-expr/src/array_expressions.rs | 55 ------------------- 2 files changed, 10 insertions(+), 76 deletions(-) diff --git a/datafusion/core/tests/sqllogictests/test_files/array.slt b/datafusion/core/tests/sqllogictests/test_files/array.slt index 992886387e4b..d7c9da7fe045 100644 --- a/datafusion/core/tests/sqllogictests/test_files/array.slt +++ b/datafusion/core/tests/sqllogictests/test_files/array.slt @@ -598,10 +598,8 @@ select array_element(make_array(1, 2, 3, 4, 5), 0), array_element(make_array('h' NULL NULL # array_element scalar function #4 (with NULL) -query error +query error select array_element(make_array(1, 2, 3, 4, 5), NULL), array_element(make_array('h', 'e', 'l', 'l', 'o'), NULL); ----- -NULL NULL # array_element scalar function #5 (with negative index) query IT @@ -708,16 +706,12 @@ select array_slice(make_array(1, 2, 3, 4, 5), 0, 4), array_slice(make_array('h', [1, 2, 3, 4] [h, e, l] # array_slice scalar function #8 (with NULL and positive number) -query error +query error select array_slice(make_array(1, 2, 3, 4, 5), NULL, 4), array_slice(make_array('h', 'e', 'l', 'l', 'o'), NULL, 3); ----- -[1, 2, 3, 4] [h, e, l] # array_slice scalar function #9 (with positive number and NULL) -query error +query error select array_slice(make_array(1, 2, 3, 4, 5), 2, NULL), array_slice(make_array('h', 'e', 'l', 'l', 'o'), 3, NULL); ----- -[2, 3, 4, 5] [l, l, o] # array_slice scalar function #10 (with zero-zero) query ?? @@ -726,10 +720,8 @@ select array_slice(make_array(1, 2, 3, 4, 5), 0, 0), array_slice(make_array('h', [] [] # array_slice scalar function #11 (with NULL-NULL) -query error +query error select array_slice(make_array(1, 2, 3, 4, 5), NULL), array_slice(make_array('h', 'e', 'l', 'l', 'o'), NULL); ----- -[] [] # array_slice scalar function #12 (with zero and negative number) query ?? @@ -738,16 +730,12 @@ select array_slice(make_array(1, 2, 3, 4, 5), 0, -4), array_slice(make_array('h' [1] [h, e] # array_slice scalar function #13 (with negative number and NULL) -query error +query error select array_slice(make_array(1, 2, 3, 4, 5), 2, NULL), array_slice(make_array('h', 'e', 'l', 'l', 'o'), 3, NULL); ----- -[2, 3, 4, 5] [l, l, o] # array_slice scalar function #14 (with NULL and negative number) -query error +query error select array_slice(make_array(1, 2, 3, 4, 5), NULL, -4), array_slice(make_array('h', 'e', 'l', 'l', 'o'), NULL, -3); ----- -[1] [h, e] # array_slice scalar function #15 (with negative indexes) query ?? @@ -829,15 +817,16 @@ select array_slice(make_array(1, 2, 3, 4, 5), column2, column3), array_slice(col [5] [, 54, 55, 56, 57, 58, 59, 60] [55] # make_array with nulls -query ?????? +query ??????? select make_array(make_array('a','b'), null), make_array(make_array('a','b'), null, make_array('c','d')), make_array(null, make_array('a','b'), null), make_array(null, make_array('a','b'), null, null, make_array('c','d')), make_array(['a', 'bc', 'def'], null, make_array('rust')), - make_array([1,2,3], null, make_array(4,5,6,7)); + make_array([1,2,3], null, make_array(4,5,6,7)), + make_array(null, 1, null, 2, null, 3, null, null, 4, 5); ---- -[[a, b], ] [[a, b], , [c, d]] [, [a, b], ] [, [a, b], , , [c, d]] [[a, bc, def], , [rust]] [[1, 2, 3], , [4, 5, 6, 7]] +[[a, b], ] [[a, b], , [c, d]] [, [a, b], ] [, [a, b], , , [c, d]] [[a, bc, def], , [rust]] [[1, 2, 3], , [4, 5, 6, 7]] [, 1, , 2, , 3, , , 4, 5] query ? select make_array(column5, null, column5) from arrays_values_without_nulls; diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index 52cf4d68e744..e1d66c9fb87d 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -2096,61 +2096,6 @@ mod tests { ); } - #[test] - fn test_array_with_nulls() { - // make_array(NULL, 1, NULL, 2, NULL, 3, NULL, NULL, 4, 5) = [NULL, 1, NULL, 2, NULL, 3, NULL, NULL, 4, 5] - let args = [ - ColumnarValue::Scalar(ScalarValue::Null), - ColumnarValue::Scalar(ScalarValue::Int64(Some(1))), - ColumnarValue::Scalar(ScalarValue::Null), - ColumnarValue::Scalar(ScalarValue::Int64(Some(2))), - ColumnarValue::Scalar(ScalarValue::Null), - ColumnarValue::Scalar(ScalarValue::Int64(Some(3))), - ColumnarValue::Scalar(ScalarValue::Null), - ColumnarValue::Scalar(ScalarValue::Null), - ColumnarValue::Scalar(ScalarValue::Int64(Some(4))), - ColumnarValue::Scalar(ScalarValue::Int64(Some(5))), - ]; - let array = array(&args) - .expect("failed to initialize function array") - .into_array(1); - let result = as_list_array(&array).expect("failed to initialize function array"); - assert_eq!(result.len(), 1); - assert_eq!( - &[0, 1, 0, 2, 0, 3, 0, 0, 4, 5], - result - .value(0) - .as_any() - .downcast_ref::() - .unwrap() - .values() - ) - } - - #[test] - fn test_array_all_nulls() { - // make_array(NULL, NULL, NULL) = [] - let args = [ - ColumnarValue::Scalar(ScalarValue::Null), - ColumnarValue::Scalar(ScalarValue::Null), - ColumnarValue::Scalar(ScalarValue::Null), - ]; - let array = array(&args) - .expect("failed to initialize function array") - .into_array(1); - let result = as_list_array(&array).expect("failed to initialize function array"); - assert_eq!(result.len(), 1); - assert_eq!( - 0, - result - .value(0) - .as_any() - .downcast_ref::() - .unwrap() - .null_count() - ) - } - #[test] fn test_array_element() { // array_element([1, 2, 3, 4], 1) = 1 From 6fe4f7ff7c7a4a29efd694f7e19dc08e1583f3a3 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Sun, 6 Aug 2023 18:21:28 +0800 Subject: [PATCH 4/4] differentitate empty array and null array Signed-off-by: jayzhan211 --- .../core/tests/sqllogictests/test_files/array.slt | 10 +++++----- datafusion/physical-expr/src/array_expressions.rs | 6 ++++-- datafusion/physical-expr/src/scalar_function.rs | 6 +++++- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/datafusion/core/tests/sqllogictests/test_files/array.slt b/datafusion/core/tests/sqllogictests/test_files/array.slt index d7c9da7fe045..98646319204c 100644 --- a/datafusion/core/tests/sqllogictests/test_files/array.slt +++ b/datafusion/core/tests/sqllogictests/test_files/array.slt @@ -1689,7 +1689,7 @@ select cardinality(make_array([1, 2], [3, 4], [5, 6])), cardinality(array_fill(3 query II select cardinality(make_array()), cardinality(make_array(make_array())) ---- -1 1 +NULL 0 # cardinality with columns query III @@ -1903,7 +1903,7 @@ select array_length(array_fill(3, [3, 2, 5]), 1), array_length(array_fill(3, [3, query III select array_length(make_array()), array_length(make_array(), 1), array_length(make_array(), 2) ---- -1 1 NULL +0 0 NULL # list_length scalar function #6 (function alias `array_length`) query III @@ -1955,7 +1955,7 @@ select array_dims(array_fill(2, [1, 2, 3])), array_dims(array_fill(3, [2, 5, 4]) query ?? select array_dims(make_array()), array_dims(make_array(make_array())) ---- -[1] [1, 1] +NULL [1, 0] # list_dims scalar function #4 (function alias `array_dims`) query ??? @@ -1993,7 +1993,7 @@ select array_ndims(array_fill(1, [1, 2, 3])), array_ndims([[[[[[[[[[[[[[[[[[[[[1 query II select array_ndims(make_array()), array_ndims(make_array(make_array())) ---- -1 2 +NULL 2 # list_ndims scalar function #4 (function alias `array_ndims`) query III @@ -2004,7 +2004,7 @@ select list_ndims(make_array(1, 2, 3)), list_ndims(make_array([1, 2], [3, 4])), query II select array_ndims(make_array()), array_ndims(make_array(make_array())) ---- -1 2 +NULL 2 # array_ndims with columns query III diff --git a/datafusion/physical-expr/src/array_expressions.rs b/datafusion/physical-expr/src/array_expressions.rs index e1d66c9fb87d..81ba79abf6d1 100644 --- a/datafusion/physical-expr/src/array_expressions.rs +++ b/datafusion/physical-expr/src/array_expressions.rs @@ -375,14 +375,16 @@ fn array(values: &[ColumnarValue]) -> Result { } match data_type { + // empty array None => Ok(ColumnarValue::Scalar(ScalarValue::new_list( Some(vec![]), DataType::Null, ))), + // all nulls, set default data type as int32 Some(DataType::Null) => { let nulls = arrays.len(); - let null_arr = NullArray::new(nulls); - let field = Arc::new(Field::new("item", DataType::Null, true)); + let null_arr = Int32Array::from(vec![None; nulls]); + let field = Arc::new(Field::new("item", DataType::Int32, true)); let offsets = OffsetBuffer::from_lengths([nulls]); let values = Arc::new(null_arr) as ArrayRef; let nulls = None; diff --git a/datafusion/physical-expr/src/scalar_function.rs b/datafusion/physical-expr/src/scalar_function.rs index 25c0627aeece..df1e459efbb0 100644 --- a/datafusion/physical-expr/src/scalar_function.rs +++ b/datafusion/physical-expr/src/scalar_function.rs @@ -125,7 +125,11 @@ impl PhysicalExpr for ScalarFunctionExpr { // evaluate the arguments, if there are no arguments we'll instead pass in a null array // indicating the batch size (as a convention) let inputs = match (self.args.len(), self.name.parse::()) { - (0, Ok(scalar_fun)) if scalar_fun.supports_zero_argument() => { + // MakeArray support zero argument but has the different behavior from the array with one null. + (0, Ok(scalar_fun)) + if scalar_fun.supports_zero_argument() + && scalar_fun != BuiltinScalarFunction::MakeArray => + { vec![ColumnarValue::create_null_array(batch.num_rows())] } _ => self