Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Validate ScalarUDF output rows and fix nulls for array_has and get_field for Map #10148

Merged
merged 22 commits into from
Apr 29, 2024
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ use datafusion_common::{
assert_batches_eq, assert_batches_sorted_eq, cast::as_float64_array,
cast::as_int32_array, not_impl_err, plan_err, ExprSchema, Result, ScalarValue,
};
use datafusion_common::{exec_err, internal_err, DataFusionError};
use datafusion_common::{assert_contains, exec_err, internal_err, DataFusionError};
use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv};
use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
use datafusion_expr::{
Expand Down Expand Up @@ -168,6 +168,44 @@ async fn scalar_udf() -> Result<()> {
Ok(())
}

#[tokio::test]
async fn test_row_mismatch_error_in_scalar_udf() -> Result<()> {
    let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);

    // Input batch has 2 rows
    let batch = RecordBatch::try_new(
        Arc::new(schema.clone()),
        vec![Arc::new(Int32Array::from(vec![1, 2]))],
    )?;

    let ctx = SessionContext::new();

    ctx.register_batch("t", batch)?;

    // UDF that always returns 1 row, regardless of the input row count
    let buggy_udf = Arc::new(|_: &[ColumnarValue]| {
        Ok(ColumnarValue::Array(Arc::new(Int32Array::from(vec![0]))))
    });

    ctx.register_udf(create_udf(
        "buggy_func",
        vec![DataType::Int32],
        Arc::new(DataType::Int32),
        Volatility::Immutable,
        buggy_udf,
    ));

    // Executing the query must surface the row-count validation error
    // rather than silently producing a short result
    assert_contains!(
        ctx.sql("select buggy_func(a) from t")
            .await?
            .show()
            .await
            .err()
            .unwrap()
            .to_string(),
        "UDF returned a different number of rows than expected"
    );
    Ok(())
}

#[tokio::test]
async fn scalar_udf_zero_params() -> Result<()> {
let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
Expand Down
60 changes: 32 additions & 28 deletions datafusion/functions-array/src/array_has.rs
Original file line number Diff line number Diff line change
Expand Up @@ -288,36 +288,40 @@ fn general_array_has_dispatch<O: OffsetSizeTrait>(
} else {
array
};

for (row_idx, (arr, sub_arr)) in array.iter().zip(sub_array.iter()).enumerate() {
if let (Some(arr), Some(sub_arr)) = (arr, sub_arr) {
let arr_values = converter.convert_columns(&[arr])?;
let sub_arr_values = if comparison_type != ComparisonType::Single {
converter.convert_columns(&[sub_arr])?
} else {
converter.convert_columns(&[element.clone()])?
};

let mut res = match comparison_type {
ComparisonType::All => sub_arr_values
.iter()
.dedup()
.all(|elem| arr_values.iter().dedup().any(|x| x == elem)),
ComparisonType::Any => sub_arr_values
.iter()
.dedup()
.any(|elem| arr_values.iter().dedup().any(|x| x == elem)),
ComparisonType::Single => arr_values
.iter()
.dedup()
.any(|x| x == sub_arr_values.row(row_idx)),
};

if comparison_type == ComparisonType::Any {
res |= res;
match (arr, sub_arr) {
(Some(arr), Some(sub_arr)) => {
let arr_values = converter.convert_columns(&[arr])?;
let sub_arr_values = if comparison_type != ComparisonType::Single {
converter.convert_columns(&[sub_arr])?
} else {
converter.convert_columns(&[element.clone()])?
};

let mut res = match comparison_type {
ComparisonType::All => sub_arr_values
.iter()
.dedup()
.all(|elem| arr_values.iter().dedup().any(|x| x == elem)),
ComparisonType::Any => sub_arr_values
.iter()
.dedup()
.any(|elem| arr_values.iter().dedup().any(|x| x == elem)),
ComparisonType::Single => arr_values
.iter()
.dedup()
.any(|x| x == sub_arr_values.row(row_idx)),
};

if comparison_type == ComparisonType::Any {
res |= res;
}
boolean_builder.append_value(res);
}
// respect null input
(_, _) => {
boolean_builder.append_null();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

}

boolean_builder.append_value(res);
}
}
Ok(Arc::new(boolean_builder.finish()))
Expand Down
70 changes: 49 additions & 21 deletions datafusion/functions/src/core/getfield.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@
// specific language governing permissions and limitations
// under the License.

use arrow::array::{Scalar, StringArray};
use arrow::array::{
make_array, Array, Capacities, MutableArrayData, Scalar, StringArray,
};
use arrow::datatypes::DataType;
use datafusion_common::cast::{as_map_array, as_struct_array};
use datafusion_common::{exec_err, ExprSchema, Result, ScalarValue};
Expand Down Expand Up @@ -107,29 +109,55 @@ impl ScalarUDFImpl for GetFieldFunc {
);
}
};

match (array.data_type(), name) {
(DataType::Map(_, _), ScalarValue::Utf8(Some(k))) => {
let map_array = as_map_array(array.as_ref())?;
let key_scalar = Scalar::new(StringArray::from(vec![k.clone()]));
let keys = arrow::compute::kernels::cmp::eq(&key_scalar, map_array.keys())?;
let entries = arrow::compute::filter(map_array.entries(), &keys)?;
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

using filter will reduce the number of input rows to the number of rows that have keys matching the input key. But we want to respect the number of input rows, and give null for any rows not having the matching key

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand this

If the input is like this (two rows, each three elements)

{ a: 1, b: 2, c: 100}
{ a: 3, b: 4, c: 200}

An expression like col['c'] will still return 2 rows (but each row will have only a single element)

{ c: 100 }
{ c: 200 }

Copy link
Contributor Author

@duongcongtoai duongcongtoai Apr 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Previous implememtation

map_array.entries() has type of

pub struct StructArray {
    len: usize,
    data_type: DataType,
    nulls: Option<NullBuffer>,
    fields: Vec<ArrayRef>,
}

With the example above, the layout of field "fields" will be a vector of 2 array, where first array is a list of key, and second array is a list of value

[0]: ["a","b","c","a","b",c"]
[1]: [1,2,100,3,4,200]
                    let keys = arrow::compute::kernels::cmp::eq(&key_scalar, map_array.keys())?;

with this computation, the result is a boolean aray where "key" = "c"

[false,false,true,false,false,true]

and thus this operation will reduce the number of rows into

                    let entries = arrow::compute::filter(map_array.entries(), &keys)?;
[0]: ["c,"c"]
[1]: [100,200]

Problem

However, let's add a row where the map does not have key "c" in between

{ a: 1, b: 2, c: 100}
{ a: 1, b: 2}
{ a: 3, b: 4, c: 200}

map_array.entries() underneath is represented as

[0]: ["a,"b","c","a","b","a","b","c"]
[1]: [1,2,100,1,2,3,4,200]

                    let entries = arrow::compute::filter(map_array.entries(), &keys)?;
Now rows after filtered will be
[0]: ["c","c"]
[1]: [100,200]

and the return result will be

{ c: 100 }
{ c: 200 }

instead of

{ c: 100 }
null
{ c: 200 }

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would expect the result of evaluating col[b] on

{ a: 1, b: 2, c: 100}
{ a: 1, b: 2}
{ a: 3, b: 4, c: 200}

to be:

{ c: 100 }
null
{ c: 200 }

For example, in duckdb:

D create table foo as values (MAP {'a':1, 'b':2, 'c':100}), (MAP{ 'a':1, 'b':2}), (MAP {'a':1, 'b':2, 'c':200});
D select * from foo;
┌───────────────────────┐
│         col0          │
│ map(varchar, integer) │
├───────────────────────┤
│ {a=1, b=2, c=100}     │
│ {a=1, b=2}            │
│ {a=1, b=2, c=200}     │
└───────────────────────┘
D select col0['c'] from foo;
┌───────────┐
│ col0['c'] │
│  int32[]  │
├───────────┤
│ [100]     │
│ []        │
│ [200]     │
└───────────┘

Basically a scalar function has the invariant that each input row produces exactly 1 output row

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I also explained this in this discussion: #10148 (comment)

let entries_struct_array = as_struct_array(entries.as_ref())?;
Ok(ColumnarValue::Array(entries_struct_array.column(1).clone()))
}
(DataType::Struct(_), ScalarValue::Utf8(Some(k))) => {
let as_struct_array = as_struct_array(&array)?;
match as_struct_array.column_by_name(k) {
None => exec_err!(
"get indexed field {k} not found in struct"),
Some(col) => Ok(ColumnarValue::Array(col.clone()))
(DataType::Map(_, _), ScalarValue::Utf8(Some(k))) => {
let map_array = as_map_array(array.as_ref())?;
let key_scalar: Scalar<arrow::array::GenericByteArray<arrow::datatypes::GenericStringType<i32>>> = Scalar::new(StringArray::from(vec![k.clone()]));
let keys = arrow::compute::kernels::cmp::eq(&key_scalar, map_array.keys())?;

// note that this array has more entries than the expected output/input size
// because maparray is flatten
let original_data = map_array.entries().column(1).to_data();
let capacity = Capacities::Array(original_data.len());
let mut mutable =
MutableArrayData::with_capacities(vec![&original_data], true,
capacity);

for entry in 0..map_array.len(){
let start = map_array.value_offsets()[entry] as usize;
let end = map_array.value_offsets()[entry + 1] as usize;

let maybe_matched =
keys.slice(start, end-start).
iter().enumerate().
find(|(_, t)| t.unwrap());
if maybe_matched.is_none(){
mutable.extend_nulls(1);
continue
}
let (match_offset,_) = maybe_matched.unwrap();
mutable.extend(0, start + match_offset, start + match_offset + 1);
}
let data = mutable.freeze();
let data = make_array(data);
Ok(ColumnarValue::Array(data))
}
(DataType::Struct(_), ScalarValue::Utf8(Some(k))) => {
let as_struct_array = as_struct_array(&array)?;
match as_struct_array.column_by_name(k) {
None => exec_err!("get indexed field {k} not found in struct"),
Some(col) => Ok(ColumnarValue::Array(col.clone())),
}
(DataType::Struct(_), name) => exec_err!(
"get indexed field is only possible on struct with utf8 indexes. \
Tried with {name:?} index"),
(dt, name) => exec_err!(
"get indexed field is only possible on lists with int64 indexes or struct \
with utf8 indexes. Tried {dt:?} with {name:?} index"),
}
(DataType::Struct(_), name) => exec_err!(
"get indexed field is only possible on struct with utf8 indexes. \
Tried with {name:?} index"
),
(dt, name) => exec_err!(
"get indexed field is only possible on lists with int64 indexes or struct \
with utf8 indexes. Tried {dt:?} with {name:?} index"
),
}
}
}
17 changes: 16 additions & 1 deletion datafusion/physical-expr/src/scalar_function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,22 @@ impl PhysicalExpr for ScalarFunctionExpr {

// evaluate the function
match self.fun {
ScalarFunctionDefinition::UDF(ref fun) => fun.invoke(&inputs),
ScalarFunctionDefinition::UDF(ref fun) => {
let output = fun.invoke(&inputs)?;
// Only arrow_typeof can bypass this rule
if fun.name() != "arrow_typeof" {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should introduce validate_number_of_rows to ScalarUDFImpl with the default value is true.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i added to the trait and the wrapper

let output_size = match &output {
ColumnarValue::Array(array) => array.len(),
ColumnarValue::Scalar(_) => 1,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think this logic is correct -- specifically ColumnarValue::Scalar represents (logically) a single value for all the rows.

Thus I think if the function returns ColumnarValue::Scalar that implicitly can be any number of rows.

Therefore I think the check could be something like

if let ColumnarValue::Array(array) = &output {
  if output_size != array.len() {
    return internal_err...
  }
}

I'll try and make a PR to improve the documentation on ColumnarValue to make this clearer: https://docs.rs/datafusion/latest/datafusion/logical_expr/enum.ColumnarValue.html

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ah yes, so we don't need to check the length if the output is a scalar value, we actually had a discussion here: #10148 (comment)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is a PR to improve the docs #10265

Copy link
Contributor Author

@duongcongtoai duongcongtoai Apr 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oh then perhaps the current implementation of arrow_typeof can be wrong, if we have a map like this:

{id:1,name:"bob"}
{id:"string_id",name:"alice"}

then arrow_typeof(map.id) will returns

int64
int64

Is it possible to have this kind of input, because a map's schema can be dynamic per row

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. I agree that we can check with Array only.

Copy link
Contributor

@jayzhan211 jayzhan211 Apr 27, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is map key allowed to have different types, which db has the similar model?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Aha i 'v read duckdb's map definition again

A dictionary of multiple named values, each key having the same type and each value having the same type. Keys and values can be any type and can be different types from one another

I am new to this kind of project, but I think we don't have and should not have that use case 🤔. Otherwise it will affect a lot of execution logic

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In Duckdb, map keys can be different value for each row, but the types should not be different.
In OLAP which has the columnar format (we use arrow), it does not make sense to have different types on different rows.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MAPs must have a single type for all keys, and a single type for all values

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

👍

};
if output_size != batch.num_rows() {
return internal_err!("UDF returned a different number of rows than expected. Expected: {}, Got: {}",
batch.num_rows(), output_size);
}
}

Ok(output)
}
ScalarFunctionDefinition::Name(_) => {
internal_err!(
"Name function must be resolved to one of the other variants prior to physical planning"
Expand Down
15 changes: 9 additions & 6 deletions datafusion/sqllogictest/test_files/array.slt
Original file line number Diff line number Diff line change
Expand Up @@ -5169,8 +5169,9 @@ false false false true
true false true false
true false false true
false true false false
false false false false
false false false false
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The test result does not look correct, because it ignores some null rows in between

NULL NULL false false
false false NULL false
false false false NULL

query BBBB
select array_has(arrow_cast(column1, 'LargeList(List(Int64))'), make_array(5, 6)),
Expand All @@ -5183,8 +5184,9 @@ false false false true
true false true false
true false false true
false true false false
false false false false
false false false false
NULL NULL false false
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I double checked and the arrays table has 7 rows, so I agree the correct answer has 7 output rows as well

statement ok
CREATE TABLE arrays
AS VALUES
(make_array(make_array(NULL, 2),make_array(3, NULL)), make_array(1.1, 2.2, 3.3), make_array('L', 'o', 'r', 'e', 'm')),
(make_array(make_array(3, 4),make_array(5, 6)), make_array(NULL, 5.5, 6.6), make_array('i', 'p', NULL, 'u', 'm')),
(make_array(make_array(5, 6),make_array(7, 8)), make_array(7.7, 8.8, 9.9), make_array('d', NULL, 'l', 'o', 'r')),
(make_array(make_array(7, NULL),make_array(9, 10)), make_array(10.1, NULL, 12.2), make_array('s', 'i', 't')),
(NULL, make_array(13.3, 14.4, 15.5), make_array('a', 'm', 'e', 't')),
(make_array(make_array(11, 12),make_array(13, 14)), NULL, make_array(',')),
(make_array(make_array(15, 16),make_array(NULL, 18)), make_array(16.6, 17.7, 18.8), NULL)
;

false false NULL false
false false false NULL

query BBBB
select array_has(column1, make_array(5, 6)),
Expand All @@ -5197,8 +5199,9 @@ false false false true
true false true false
true false false true
false true false false
false false false false
false false false false
NULL NULL false false
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Likewise, I agree this should have 7 output rows

statement ok
CREATE TABLE fixed_size_arrays
AS VALUES
(arrow_cast(make_array(make_array(NULL, 2),make_array(3, NULL)), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array(1.1, 2.2, 3.3), 'FixedSizeList(3, Float64)'), arrow_cast(make_array('L', 'o', 'r', 'e', 'm'), 'FixedSizeList(5, Utf8)')),
(arrow_cast(make_array(make_array(3, 4),make_array(5, 6)), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array(NULL, 5.5, 6.6), 'FixedSizeList(3, Float64)'), arrow_cast(make_array('i', 'p', NULL, 'u', 'm'), 'FixedSizeList(5, Utf8)')),
(arrow_cast(make_array(make_array(5, 6),make_array(7, 8)), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array(7.7, 8.8, 9.9), 'FixedSizeList(3, Float64)'), arrow_cast(make_array('d', NULL, 'l', 'o', 'r'), 'FixedSizeList(5, Utf8)')),
(arrow_cast(make_array(make_array(7, NULL),make_array(9, 10)), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array(10.1, NULL, 12.2), 'FixedSizeList(3, Float64)'), arrow_cast(make_array('s', 'i', 't', 'a', 'b'), 'FixedSizeList(5, Utf8)')),
(NULL, arrow_cast(make_array(13.3, 14.4, 15.5), 'FixedSizeList(3, Float64)'), arrow_cast(make_array('a', 'm', 'e', 't', 'x'), 'FixedSizeList(5, Utf8)')),
(arrow_cast(make_array(make_array(11, 12),make_array(13, 14)), 'FixedSizeList(2, List(Int64))'), NULL, arrow_cast(make_array(',','a','b','c','d'), 'FixedSizeList(5, Utf8)')),
(arrow_cast(make_array(make_array(15, 16),make_array(NULL, 18)), 'FixedSizeList(2, List(Int64))'), arrow_cast(make_array(16.6, 17.7, 18.8), 'FixedSizeList(3, Float64)'), NULL)
;

false false NULL false
false false false NULL

query BBBBBBBBBBBBB
select array_has_all(make_array(1,2,3), make_array(1,3)),
Expand Down
1 change: 1 addition & 0 deletions datafusion/sqllogictest/test_files/map.slt
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ DELETE 24
query T
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to remind myself what this data looked like. Here it is for anyone else who may be interested

DataFusion CLI v37.1.0
> select * from 'datafusion/core/tests/data/parquet_map.parquet';
+----------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| ints           | strings                                                                                                                                                                                               | timestamp            |
+----------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| {bytes: 38906} | {host: 198.194.132.41, method: GET, protocol: HTTP/1.0, referer: https://some.com/this/endpoint/prints/money, request: /observability/metrics/production, status: 400, user-identifier: shaneIxD}     | 06/Oct/2023:17:53:45 |
| {bytes: 44606} | {host: 140.115.224.194, method: PATCH, protocol: HTTP/1.0, referer: https://we.org/user/booperbot124, request: /booper/bopper/mooper/mopper, status: 304, user-identifier: jesseddy}                  | 06/Oct/2023:17:53:45 |
| {bytes: 23517} | {host: 63.69.43.67, method: GET, protocol: HTTP/2.0, referer: https://random.net/booper/bopper/mooper/mopper, request: /booper/bopper/mooper/mopper, status: 550, user-identifier: jesseddy}          | 06/Oct/2023:17:53:45 |
| {bytes: 44876} | {host: 69.4.253.156, method: PATCH, protocol: HTTP/1.1, referer: https://some.net/booper/bopper/mooper/mopper, request: /user/booperbot124, status: 403, user-identifier: Karimmove}                  | 06/Oct/2023:17:53:45 |
| {bytes: 34122} | {host: 239.152.196.123, method: DELETE, protocol: HTTP/2.0, referer: https://for.com/observability/metrics/production, request: /apps/deploy, status: 403, user-identifier: meln1ks}                  | 06/Oct/2023:17:53:45 |
| {bytes: 37438} | {host: 95.243.186.123, method: DELETE, protocol: HTTP/1.1, referer: https://make.de/wp-admin, request: /wp-admin, status: 550, user-identifier: Karimmove}                                            | 06/Oct/2023:17:53:45 |
| {bytes: 45784} | {host: 66.142.251.66, method: PUT, protocol: HTTP/2.0, referer: https://some.org/apps/deploy, request: /secret-info/open-sesame, status: 403, user-identifier: benefritz}                             | 06/Oct/2023:17:53:45 |
| {bytes: 27788} | {host: 157.85.140.215, method: GET, protocol: HTTP/1.1, referer: https://random.de/booper/bopper/mooper/mopper, request: /booper/bopper/mooper/mopper, status: 401, user-identifier: devankoshal}     | 06/Oct/2023:17:53:45 |
| {bytes: 5344}  | {host: 62.191.179.3, method: POST, protocol: HTTP/1.0, referer: https://random.org/booper/bopper/mooper/mopper, request: /observability/metrics/production, status: 400, user-identifier: jesseddy}   | 06/Oct/2023:17:53:45 |
| {bytes: 9136}  | {host: 237.213.221.20, method: PUT, protocol: HTTP/2.0, referer: https://some.us/this/endpoint/prints/money, request: /observability/metrics/production, status: 304, user-identifier: ahmadajmi}     | 06/Oct/2023:17:53:46 |
| {bytes: 5640}  | {host: 38.148.115.2, method: GET, protocol: HTTP/1.0, referer: https://for.net/apps/deploy, request: /do-not-access/needs-work, status: 301, user-identifier: benefritz}                              | 06/Oct/2023:17:53:46 |
...

SELECT strings['not_found'] FROM data LIMIT 1;
----
NULL
Copy link
Contributor

@jayzhan211 jayzhan211 Apr 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not familiar with Map. Why should we return null here?
Without the change in Map, what is the error like?

Copy link
Contributor Author

@duongcongtoai duongcongtoai Apr 26, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It will throw the validation error I added in this PR. I think the correct behavior is to return null for every input row not having the associated key. I took a look at DuckDB and Spark and they also have this behavior

import org.apache.spark.sql.functions._
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("Spark SQL Map Example").getOrCreate()
import spark.implicits._

val data = Seq(
  ("Alice", Map("age" -> "25", "email" -> "[email protected]")),
  ("Bob", Map("age" -> "30", "email" -> "[email protected]")),
  ("Carol", Map("age" -> "35", "email" -> "[email protected]"))
)
val df = data.toDF("name", "attributes")
val result = df.select($"name", $"attributes.email".as("email"),$"attributes.notfound".as("should_be_null"))
// Show the DataFrame
result.show(false)

+-----+-----------------+--------------+
|name |email            |should_be_null|
+-----+-----------------+--------------+
|Alice|[email protected]|NULL          |
|Bob  |[email protected]  |NULL          |
|Carol|[email protected]|NULL          |
+-----+-----------------+--------------+

And also, a similar implementation in Datafusion is array_element also return null if the index goes out of range

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is how it works on main:

> select strings['not_found'] from 'datafusion/core/tests/data/parquet_map.parquet';
0 row(s) fetched.
Elapsed 0.006 seconds.

Here is how it works on this PR (aka has a single row for each input row)

DataFusion CLI v37.1.0
> select strings['not_found'] from '../datafusion/core/tests/data/parquet_map.parquet';
+----------------------------------------------------------------------+
| ../datafusion/core/tests/data/parquet_map.parquet.strings[not_found] |
+----------------------------------------------------------------------+
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
|                                                                      |
| .                                                                    |
| .                                                                    |
| .                                                                    |
+----------------------------------------------------------------------+
209 row(s) fetched. (First 40 displayed. Use --maxrows to adjust)
Elapsed 0.033 seconds.


statement ok
drop table data;
Expand Down