From c272704c225e9158764fccc22f5f83d77c2c6685 Mon Sep 17 00:00:00 2001 From: CookiePieWw <1035325592@qq.com> Date: Tue, 10 Sep 2024 16:19:39 +0000 Subject: [PATCH 1/4] feat: add respectiv get_by_path udf for json type --- src/common/function/src/scalars/json.rs | 7 + .../function/src/scalars/json/get_by_path.rs | 453 ++++++++++++++++++ .../standalone/common/function/json.result | 129 +++++ .../cases/standalone/common/function/json.sql | 35 ++ 4 files changed, 624 insertions(+) create mode 100644 src/common/function/src/scalars/json/get_by_path.rs create mode 100644 tests/cases/standalone/common/function/json.result create mode 100644 tests/cases/standalone/common/function/json.sql diff --git a/src/common/function/src/scalars/json.rs b/src/common/function/src/scalars/json.rs index 3812b33f235f..3428b601d342 100644 --- a/src/common/function/src/scalars/json.rs +++ b/src/common/function/src/scalars/json.rs @@ -13,9 +13,11 @@ // limitations under the License. use std::sync::Arc; +mod get_by_path; mod json_to_string; mod to_json; +use get_by_path::{GetByPathBool, GetByPathFloat, GetByPathInt, GetByPathString}; use json_to_string::JsonToStringFunction; use to_json::ToJsonFunction; @@ -27,5 +29,10 @@ impl JsonFunction { pub fn register(registry: &FunctionRegistry) { registry.register(Arc::new(JsonToStringFunction)); registry.register(Arc::new(ToJsonFunction)); + + registry.register(Arc::new(GetByPathInt)); + registry.register(Arc::new(GetByPathFloat)); + registry.register(Arc::new(GetByPathString)); + registry.register(Arc::new(GetByPathBool)); } } diff --git a/src/common/function/src/scalars/json/get_by_path.rs b/src/common/function/src/scalars/json/get_by_path.rs new file mode 100644 index 000000000000..26e16783c1d7 --- /dev/null +++ b/src/common/function/src/scalars/json/get_by_path.rs @@ -0,0 +1,453 @@ +// Copyright 2023 Greptime Team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use std::fmt::{self, Display}; + +use common_query::error::{InvalidFuncArgsSnafu, Result, UnsupportedInputDataTypeSnafu}; +use common_query::prelude::Signature; +use datafusion::logical_expr::Volatility; +use datatypes::data_type::ConcreteDataType; +use datatypes::prelude::VectorRef; +use datatypes::scalars::ScalarVectorBuilder; +use datatypes::vectors::{ + BooleanVectorBuilder, Float64VectorBuilder, Int64VectorBuilder, MutableVector, + StringVectorBuilder, +}; +use snafu::ensure; + +use crate::function::{Function, FunctionContext}; + +fn get_json_by_path(json: &[u8], path: &str) -> Option> { + let json_path = jsonb::jsonpath::parse_json_path(path.as_bytes()); + match json_path { + Ok(json_path) => { + let mut sub_jsonb = Vec::new(); + let mut sub_offsets = Vec::new(); + match jsonb::get_by_path(json, json_path, &mut sub_jsonb, &mut sub_offsets) { + Ok(_) => Some(sub_jsonb), + Err(_) => None, + } + } + _ => None, + } +} + +/// Get the value from the JSONB by the given path and return it as specified type. +/// If the path does not exist or the value is not the type specified, return `NULL`. +macro_rules! get_by_path { + // e.g. name = GetByPathInt, type = Int64, rust_type = i64, doc = "Get the value from the JSONB by the given path and return it as an integer." + ($name: ident, $type: ident, $rust_type: ident, $doc:expr) => { + paste::paste! { + #[doc = $doc] + #[derive(Clone, Debug, Default)] + pub struct $name; + + impl Function for $name { + fn name(&self) -> &str { + stringify!([<$name:snake>]) + } + + fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result { + Ok(ConcreteDataType::[<$type:snake _datatype>]()) + } + + fn signature(&self) -> Signature { + Signature::exact( + vec![ + ConcreteDataType::json_datatype(), + ConcreteDataType::string_datatype(), + ], + Volatility::Immutable, + ) + } + + fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result { + ensure!( + columns.len() == 2, + InvalidFuncArgsSnafu { + err_msg: format!( + "The length of the args is not correct, expect exactly two, have: {}", + columns.len() + ), + } + ); + let jsons = &columns[0]; + let strings = &columns[1]; + + let size = jsons.len(); + let datatype = jsons.data_type(); + let mut results = [<$type VectorBuilder>]::with_capacity(size); + + match datatype { + // JSON data type uses binary vector + ConcreteDataType::Binary(_) => { + for i in 0..size { + let json = jsons.get_ref(i); + let string = strings.get_ref(i); + + let json = json.as_binary(); + let string = string.as_string(); + let result = match (json, string) { + (Ok(Some(json)), Ok(Some(string))) => { + get_json_by_path(json, string) + .and_then(|json| { jsonb::[](&json).ok() }) + } + _ => None, + }; + + results.push(result); + } + } + _ => { + return UnsupportedInputDataTypeSnafu { + function: stringify!([<$name:snake>]), + datatypes: columns.iter().map(|c| c.data_type()).collect::>(), + } + .fail(); + } + } + + Ok(results.to_vector()) + } + } + + impl Display for $name { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", stringify!([<$name:snake>]).to_ascii_uppercase()) + } + } + } + }; +} + +get_by_path!( + GetByPathInt, + Int64, + i64, + "Get the value from the JSONB by the given path and return it as an integer." +); + +get_by_path!( + GetByPathFloat, + Float64, + f64, + "Get the value from the JSONB by the given path and return it as a float." +); + +get_by_path!( + GetByPathBool, + Boolean, + bool, + "Get the value from the JSONB by the given path and return it as a boolean." +); + +/// Get the value from the JSONB by the given path and return it as a string. +#[derive(Clone, Debug, Default)] +pub struct GetByPathString; + +impl Function for GetByPathString { + fn name(&self) -> &str { + "get_by_path_string" + } + + fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result { + Ok(ConcreteDataType::string_datatype()) + } + + fn signature(&self) -> Signature { + Signature::exact( + vec![ + ConcreteDataType::json_datatype(), + ConcreteDataType::string_datatype(), + ], + Volatility::Immutable, + ) + } + + fn eval(&self, _func_ctx: FunctionContext, columns: &[VectorRef]) -> Result { + ensure!( + columns.len() == 2, + InvalidFuncArgsSnafu { + err_msg: format!( + "The length of the args is not correct, expect exactly two, have: {}", + columns.len() + ), + } + ); + let jsons = &columns[0]; + let strings = &columns[1]; + + let size = jsons.len(); + let datatype = jsons.data_type(); + let mut results = StringVectorBuilder::with_capacity(size); + + match datatype { + // JSON data type uses binary vector + ConcreteDataType::Binary(_) => { + for i in 0..size { + let json = jsons.get_ref(i); + let string = strings.get_ref(i); + + let json = json.as_binary(); + let string = string.as_string(); + let result = match (json, string) { + (Ok(Some(json)), Ok(Some(string))) => get_json_by_path(json, string) + .and_then(|json| jsonb::to_str(&json).ok()), + _ => None, + }; + + results.push(result.as_deref()); + } + } + _ => { + return UnsupportedInputDataTypeSnafu { + function: "get_by_path_string", + datatypes: columns.iter().map(|c| c.data_type()).collect::>(), + } + .fail(); + } + } + + Ok(results.to_vector()) + } +} + +impl Display for GetByPathString { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "{}", "get_by_path_string".to_ascii_uppercase()) + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use common_query::prelude::TypeSignature; + use datatypes::scalars::ScalarVector; + use datatypes::vectors::{BinaryVector, StringVector}; + + use super::*; + + #[test] + fn test_get_by_path_int() { + let get_by_path_int = GetByPathInt; + + assert_eq!("get_by_path_int", get_by_path_int.name()); + assert_eq!( + ConcreteDataType::int64_datatype(), + get_by_path_int + .return_type(&[ + ConcreteDataType::json_datatype(), + ConcreteDataType::string_datatype() + ]) + .unwrap() + ); + + assert!(matches!(get_by_path_int.signature(), + Signature { + type_signature: TypeSignature::Exact(valid_types), + volatility: Volatility::Immutable + } if valid_types == vec![ConcreteDataType::json_datatype(), ConcreteDataType::string_datatype()] + )); + + let json_strings = [ + r#"{"a": {"b": 2}, "b": 2, "c": 3}"#, + r#"{"a": 4, "b": {"c": 6}, "c": 6}"#, + r#"{"a": 7, "b": 8, "c": {"a": 7}}"#, + ]; + let paths = vec!["$.a.b", "$.a", "$.c"]; + let results = [Some(2), Some(4), None]; + + let jsonbs = json_strings + .iter() + .map(|s| { + let value = jsonb::parse_value(s.as_bytes()).unwrap(); + value.to_vec() + }) + .collect::>(); + + let json_vector = BinaryVector::from_vec(jsonbs); + let path_vector = StringVector::from_vec(paths); + let args: Vec = vec![Arc::new(json_vector), Arc::new(path_vector)]; + let vector = get_by_path_int + .eval(FunctionContext::default(), &args) + .unwrap(); + + assert_eq!(3, vector.len()); + for (i, gt) in results.iter().enumerate() { + let result = vector.get_ref(i); + let result = result.as_i64().unwrap(); + assert_eq!(*gt, result); + } + } + + #[test] + fn test_get_by_path_float() { + let get_by_path_float = GetByPathFloat; + + assert_eq!("get_by_path_float", get_by_path_float.name()); + assert_eq!( + ConcreteDataType::float64_datatype(), + get_by_path_float + .return_type(&[ + ConcreteDataType::json_datatype(), + ConcreteDataType::string_datatype() + ]) + .unwrap() + ); + + assert!(matches!(get_by_path_float.signature(), + Signature { + type_signature: TypeSignature::Exact(valid_types), + volatility: Volatility::Immutable + } if valid_types == vec![ConcreteDataType::json_datatype(), ConcreteDataType::string_datatype()] + )); + + let json_strings = [ + r#"{"a": {"b": 2.1}, "b": 2.2, "c": 3.3}"#, + r#"{"a": 4.4, "b": {"c": 6.6}, "c": 6.6}"#, + r#"{"a": 7.7, "b": 8.8, "c": {"a": 7.7}}"#, + ]; + let paths = vec!["$.a.b", "$.a", "$.c"]; + let results = [Some(2.1), Some(4.4), None]; + + let jsonbs = json_strings + .iter() + .map(|s| { + let value = jsonb::parse_value(s.as_bytes()).unwrap(); + value.to_vec() + }) + .collect::>(); + + let json_vector = BinaryVector::from_vec(jsonbs); + let path_vector = StringVector::from_vec(paths); + let args: Vec = vec![Arc::new(json_vector), Arc::new(path_vector)]; + let vector = get_by_path_float + .eval(FunctionContext::default(), &args) + .unwrap(); + + assert_eq!(3, vector.len()); + for (i, gt) in results.iter().enumerate() { + let result = vector.get_ref(i); + let result = result.as_f64().unwrap(); + assert_eq!(*gt, result); + } + } + + #[test] + fn test_get_by_path_boolean() { + let get_by_path_bool = GetByPathBool; + + assert_eq!("get_by_path_bool", get_by_path_bool.name()); + assert_eq!( + ConcreteDataType::boolean_datatype(), + get_by_path_bool + .return_type(&[ + ConcreteDataType::json_datatype(), + ConcreteDataType::string_datatype() + ]) + .unwrap() + ); + + assert!(matches!(get_by_path_bool.signature(), + Signature { + type_signature: TypeSignature::Exact(valid_types), + volatility: Volatility::Immutable + } if valid_types == vec![ConcreteDataType::json_datatype(), ConcreteDataType::string_datatype()] + )); + + let json_strings = [ + r#"{"a": {"b": true}, "b": false, "c": true}"#, + r#"{"a": false, "b": {"c": true}, "c": false}"#, + r#"{"a": true, "b": false, "c": {"a": true}}"#, + ]; + let paths = vec!["$.a.b", "$.a", "$.c"]; + let results = [Some(true), Some(false), None]; + + let jsonbs = json_strings + .iter() + .map(|s| { + let value = jsonb::parse_value(s.as_bytes()).unwrap(); + value.to_vec() + }) + .collect::>(); + + let json_vector = BinaryVector::from_vec(jsonbs); + let path_vector = StringVector::from_vec(paths); + let args: Vec = vec![Arc::new(json_vector), Arc::new(path_vector)]; + let vector = get_by_path_bool + .eval(FunctionContext::default(), &args) + .unwrap(); + + assert_eq!(3, vector.len()); + for (i, gt) in results.iter().enumerate() { + let result = vector.get_ref(i); + let result = result.as_boolean().unwrap(); + assert_eq!(*gt, result); + } + } + + #[test] + fn test_get_by_path_string() { + let get_by_path_string = GetByPathString; + + assert_eq!("get_by_path_string", get_by_path_string.name()); + assert_eq!( + ConcreteDataType::string_datatype(), + get_by_path_string + .return_type(&[ + ConcreteDataType::json_datatype(), + ConcreteDataType::string_datatype() + ]) + .unwrap() + ); + + assert!(matches!(get_by_path_string.signature(), + Signature { + type_signature: TypeSignature::Exact(valid_types), + volatility: Volatility::Immutable + } if valid_types == vec![ConcreteDataType::json_datatype(), ConcreteDataType::string_datatype()] + )); + + let json_strings = [ + r#"{"a": {"b": "a"}, "b": "b", "c": "c"}"#, + r#"{"a": "d", "b": {"c": "e"}, "c": "f"}"#, + r#"{"a": "g", "b": "h", "c": {"a": "g"}}"#, + ]; + let paths = vec!["$.a.b", "$.a", ""]; + let results = [Some("a"), Some("d"), None]; + + let jsonbs = json_strings + .iter() + .map(|s| { + let value = jsonb::parse_value(s.as_bytes()).unwrap(); + value.to_vec() + }) + .collect::>(); + + let json_vector = BinaryVector::from_vec(jsonbs); + let path_vector = StringVector::from_vec(paths); + let args: Vec = vec![Arc::new(json_vector), Arc::new(path_vector)]; + let vector = get_by_path_string + .eval(FunctionContext::default(), &args) + .unwrap(); + + assert_eq!(3, vector.len()); + for (i, gt) in results.iter().enumerate() { + let result = vector.get_ref(i); + let result = result.as_string().unwrap(); + assert_eq!(*gt, result); + } + } +} diff --git a/tests/cases/standalone/common/function/json.result b/tests/cases/standalone/common/function/json.result new file mode 100644 index 000000000000..66c7ce75eeb4 --- /dev/null +++ b/tests/cases/standalone/common/function/json.result @@ -0,0 +1,129 @@ +-- get_by_path functions -- +SELECT get_by_path_int(to_json('{"a": {"b": {"c": 1}}}'), 'a.b.c'); + ++------------------------------------------------------------------------+ +| get_by_path_int(to_json(Utf8("{"a": {"b": {"c": 1}}}")),Utf8("a.b.c")) | ++------------------------------------------------------------------------+ +| 1 | ++------------------------------------------------------------------------+ + +SELECT get_by_path_float(to_json('{"a": {"b": {"c": 1.234}}}'), 'a.b.c'); + ++------------------------------------------------------------------------------+ +| get_by_path_float(to_json(Utf8("{"a": {"b": {"c": 1.234}}}")),Utf8("a.b.c")) | ++------------------------------------------------------------------------------+ +| 1.234 | ++------------------------------------------------------------------------------+ + +SELECT get_by_path_string(to_json('{"a": {"b": {"c": "foo"}}}'), 'a.b.c'); + ++-------------------------------------------------------------------------------+ +| get_by_path_string(to_json(Utf8("{"a": {"b": {"c": "foo"}}}")),Utf8("a.b.c")) | ++-------------------------------------------------------------------------------+ +| foo | ++-------------------------------------------------------------------------------+ + +SELECT get_by_path_bool(to_json('{"a": {"b": {"c": true}}}'), 'a.b.c'); + ++----------------------------------------------------------------------------+ +| get_by_path_bool(to_json(Utf8("{"a": {"b": {"c": true}}}")),Utf8("a.b.c")) | ++----------------------------------------------------------------------------+ +| true | ++----------------------------------------------------------------------------+ + +SELECT get_by_path_int(to_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); + ++-----------------------------------------------------------------------------+ +| get_by_path_int(to_json(Utf8("{"a": {"b": {"c": {"d": 1}}}}")),Utf8("a.b")) | ++-----------------------------------------------------------------------------+ +| | ++-----------------------------------------------------------------------------+ + +SELECT get_by_path_string(to_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); + ++--------------------------------------------------------------------------------+ +| get_by_path_string(to_json(Utf8("{"a": {"b": {"c": {"d": 1}}}}")),Utf8("a.b")) | ++--------------------------------------------------------------------------------+ +| | ++--------------------------------------------------------------------------------+ + +-- test functions with table rows -- +CREATE TABLE jsons(j JSON, ts timestamp time index); + +Affected Rows: 0 + +INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": 1}}}'), 1); + +Affected Rows: 1 + +INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": 1.234}}}'), 2); + +Affected Rows: 1 + +INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": "foo"}}}'), 3); + +Affected Rows: 1 + +INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": true}}}'), 4); + +Affected Rows: 1 + +SELECT get_by_path_int(j, 'a.b.c') FROM jsons; + ++----------------------------------------+ +| get_by_path_int(jsons.j,Utf8("a.b.c")) | ++----------------------------------------+ +| 1 | +| | +| | +| 1 | ++----------------------------------------+ + +SELECT get_by_path_float(j, 'a.b.c') FROM jsons; + ++------------------------------------------+ +| get_by_path_float(jsons.j,Utf8("a.b.c")) | ++------------------------------------------+ +| 1.0 | +| 1.234 | +| | +| 1.0 | ++------------------------------------------+ + +SELECT get_by_path_string(j, 'a.b.c') FROM jsons; + ++-------------------------------------------+ +| get_by_path_string(jsons.j,Utf8("a.b.c")) | ++-------------------------------------------+ +| 1 | +| 1.234 | +| foo | +| true | ++-------------------------------------------+ + +SELECT get_by_path_bool(j, 'a.b.c') FROM jsons; + ++-----------------------------------------+ +| get_by_path_bool(jsons.j,Utf8("a.b.c")) | ++-----------------------------------------+ +| | +| | +| | +| true | ++-----------------------------------------+ + +SELECT get_by_path_int(j, 'd') FROM jsons; + ++------------------------------------+ +| get_by_path_int(jsons.j,Utf8("d")) | ++------------------------------------+ +| | +| | +| | +| | ++------------------------------------+ + +DROP TABLE jsons; + +Affected Rows: 0 + diff --git a/tests/cases/standalone/common/function/json.sql b/tests/cases/standalone/common/function/json.sql new file mode 100644 index 000000000000..5f902a4c223b --- /dev/null +++ b/tests/cases/standalone/common/function/json.sql @@ -0,0 +1,35 @@ +-- get_by_path functions -- +SELECT get_by_path_int(to_json('{"a": {"b": {"c": 1}}}'), 'a.b.c'); + +SELECT get_by_path_float(to_json('{"a": {"b": {"c": 1.234}}}'), 'a.b.c'); + +SELECT get_by_path_string(to_json('{"a": {"b": {"c": "foo"}}}'), 'a.b.c'); + +SELECT get_by_path_bool(to_json('{"a": {"b": {"c": true}}}'), 'a.b.c'); + +SELECT get_by_path_int(to_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); + +SELECT get_by_path_string(to_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); + +-- test functions with table rows -- +CREATE TABLE jsons(j JSON, ts timestamp time index); + +INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": 1}}}'), 1); + +INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": 1.234}}}'), 2); + +INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": "foo"}}}'), 3); + +INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": true}}}'), 4); + +SELECT get_by_path_int(j, 'a.b.c') FROM jsons; + +SELECT get_by_path_float(j, 'a.b.c') FROM jsons; + +SELECT get_by_path_string(j, 'a.b.c') FROM jsons; + +SELECT get_by_path_bool(j, 'a.b.c') FROM jsons; + +SELECT get_by_path_int(j, 'd') FROM jsons; + +DROP TABLE jsons; From 8d9c71d2409f134a8629ca9ce3cc69e571cf7a6a Mon Sep 17 00:00:00 2001 From: Yohan Wal <59358312+CookiePieWw@users.noreply.github.com> Date: Wed, 11 Sep 2024 01:14:29 +0800 Subject: [PATCH 2/4] Apply review comments Co-authored-by: Weny Xu --- .../function/src/scalars/json/get_by_path.rs | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/common/function/src/scalars/json/get_by_path.rs b/src/common/function/src/scalars/json/get_by_path.rs index 26e16783c1d7..8ca6211a2ecb 100644 --- a/src/common/function/src/scalars/json/get_by_path.rs +++ b/src/common/function/src/scalars/json/get_by_path.rs @@ -83,7 +83,7 @@ macro_rules! get_by_path { } ); let jsons = &columns[0]; - let strings = &columns[1]; + let paths = &columns[1]; let size = jsons.len(); let datatype = jsons.data_type(); @@ -94,13 +94,13 @@ macro_rules! get_by_path { ConcreteDataType::Binary(_) => { for i in 0..size { let json = jsons.get_ref(i); - let string = strings.get_ref(i); + let path = strings.get_ref(i); let json = json.as_binary(); - let string = string.as_string(); - let result = match (json, string) { - (Ok(Some(json)), Ok(Some(string))) => { - get_json_by_path(json, string) + let path = string.as_string(); + let result = match (json, path) { + (Ok(Some(json)), Ok(Some(path))) => { + get_json_by_path(json, path) .and_then(|json| { jsonb::[](&json).ok() }) } _ => None, @@ -197,12 +197,12 @@ impl Function for GetByPathString { ConcreteDataType::Binary(_) => { for i in 0..size { let json = jsons.get_ref(i); - let string = strings.get_ref(i); + let path = strings.get_ref(i); let json = json.as_binary(); - let string = string.as_string(); - let result = match (json, string) { - (Ok(Some(json)), Ok(Some(string))) => get_json_by_path(json, string) + let path = string.as_string(); + let result = match (json, path) { + (Ok(Some(json)), Ok(Some(path))) => get_json_by_path(json, path) .and_then(|json| jsonb::to_str(&json).ok()), _ => None, }; From 62ddd5069baa6376de2f33292d41304a077cda52 Mon Sep 17 00:00:00 2001 From: CookiePieWw <1035325592@qq.com> Date: Tue, 10 Sep 2024 17:24:13 +0000 Subject: [PATCH 3/4] fix: fix compile error --- .../function/src/scalars/json/get_by_path.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/common/function/src/scalars/json/get_by_path.rs b/src/common/function/src/scalars/json/get_by_path.rs index 8ca6211a2ecb..80ca5b253881 100644 --- a/src/common/function/src/scalars/json/get_by_path.rs +++ b/src/common/function/src/scalars/json/get_by_path.rs @@ -94,10 +94,10 @@ macro_rules! get_by_path { ConcreteDataType::Binary(_) => { for i in 0..size { let json = jsons.get_ref(i); - let path = strings.get_ref(i); + let path = paths.get_ref(i); let json = json.as_binary(); - let path = string.as_string(); + let path = path.as_string(); let result = match (json, path) { (Ok(Some(json)), Ok(Some(path))) => { get_json_by_path(json, path) @@ -186,7 +186,7 @@ impl Function for GetByPathString { } ); let jsons = &columns[0]; - let strings = &columns[1]; + let paths = &columns[1]; let size = jsons.len(); let datatype = jsons.data_type(); @@ -197,13 +197,14 @@ impl Function for GetByPathString { ConcreteDataType::Binary(_) => { for i in 0..size { let json = jsons.get_ref(i); - let path = strings.get_ref(i); + let path = paths.get_ref(i); let json = json.as_binary(); - let path = string.as_string(); + let path = path.as_string(); let result = match (json, path) { - (Ok(Some(json)), Ok(Some(path))) => get_json_by_path(json, path) - .and_then(|json| jsonb::to_str(&json).ok()), + (Ok(Some(json)), Ok(Some(path))) => { + get_json_by_path(json, path).and_then(|json| jsonb::to_str(&json).ok()) + } _ => None, }; From b3b268a54f4f0c79bfef87fd90a8acbb875211a9 Mon Sep 17 00:00:00 2001 From: CookiePieWw <1035325592@qq.com> Date: Wed, 11 Sep 2024 07:47:11 +0000 Subject: [PATCH 4/4] refactor: change name of UDFs, add some tests --- src/common/function/src/scalars/json.rs | 12 +- .../json/{get_by_path.rs => json_get.rs} | 76 +++---- .../standalone/common/function/json.result | 195 ++++++++++++------ .../cases/standalone/common/function/json.sql | 45 ++-- 4 files changed, 209 insertions(+), 119 deletions(-) rename src/common/function/src/scalars/json/{get_by_path.rs => json_get.rs} (90%) diff --git a/src/common/function/src/scalars/json.rs b/src/common/function/src/scalars/json.rs index 3428b601d342..26d63d3b45e1 100644 --- a/src/common/function/src/scalars/json.rs +++ b/src/common/function/src/scalars/json.rs @@ -13,11 +13,11 @@ // limitations under the License. use std::sync::Arc; -mod get_by_path; +mod json_get; mod json_to_string; mod to_json; -use get_by_path::{GetByPathBool, GetByPathFloat, GetByPathInt, GetByPathString}; +use json_get::{JsonGetBool, JsonGetFloat, JsonGetInt, JsonGetString}; use json_to_string::JsonToStringFunction; use to_json::ToJsonFunction; @@ -30,9 +30,9 @@ impl JsonFunction { registry.register(Arc::new(JsonToStringFunction)); registry.register(Arc::new(ToJsonFunction)); - registry.register(Arc::new(GetByPathInt)); - registry.register(Arc::new(GetByPathFloat)); - registry.register(Arc::new(GetByPathString)); - registry.register(Arc::new(GetByPathBool)); + registry.register(Arc::new(JsonGetInt)); + registry.register(Arc::new(JsonGetFloat)); + registry.register(Arc::new(JsonGetString)); + registry.register(Arc::new(JsonGetBool)); } } diff --git a/src/common/function/src/scalars/json/get_by_path.rs b/src/common/function/src/scalars/json/json_get.rs similarity index 90% rename from src/common/function/src/scalars/json/get_by_path.rs rename to src/common/function/src/scalars/json/json_get.rs index 80ca5b253881..78ddc1d2642c 100644 --- a/src/common/function/src/scalars/json/get_by_path.rs +++ b/src/common/function/src/scalars/json/json_get.rs @@ -45,8 +45,8 @@ fn get_json_by_path(json: &[u8], path: &str) -> Option> { /// Get the value from the JSONB by the given path and return it as specified type. /// If the path does not exist or the value is not the type specified, return `NULL`. -macro_rules! get_by_path { - // e.g. name = GetByPathInt, type = Int64, rust_type = i64, doc = "Get the value from the JSONB by the given path and return it as an integer." +macro_rules! json_get { + // e.g. name = JsonGetInt, type = Int64, rust_type = i64, doc = "Get the value from the JSONB by the given path and return it as an integer." ($name: ident, $type: ident, $rust_type: ident, $doc:expr) => { paste::paste! { #[doc = $doc] @@ -131,22 +131,22 @@ macro_rules! get_by_path { }; } -get_by_path!( - GetByPathInt, +json_get!( + JsonGetInt, Int64, i64, "Get the value from the JSONB by the given path and return it as an integer." ); -get_by_path!( - GetByPathFloat, +json_get!( + JsonGetFloat, Float64, f64, "Get the value from the JSONB by the given path and return it as a float." ); -get_by_path!( - GetByPathBool, +json_get!( + JsonGetBool, Boolean, bool, "Get the value from the JSONB by the given path and return it as a boolean." @@ -154,11 +154,11 @@ get_by_path!( /// Get the value from the JSONB by the given path and return it as a string. #[derive(Clone, Debug, Default)] -pub struct GetByPathString; +pub struct JsonGetString; -impl Function for GetByPathString { +impl Function for JsonGetString { fn name(&self) -> &str { - "get_by_path_string" + "json_get_string" } fn return_type(&self, _input_types: &[ConcreteDataType]) -> Result { @@ -213,7 +213,7 @@ impl Function for GetByPathString { } _ => { return UnsupportedInputDataTypeSnafu { - function: "get_by_path_string", + function: "json_get_string", datatypes: columns.iter().map(|c| c.data_type()).collect::>(), } .fail(); @@ -224,9 +224,9 @@ impl Function for GetByPathString { } } -impl Display for GetByPathString { +impl Display for JsonGetString { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{}", "get_by_path_string".to_ascii_uppercase()) + write!(f, "{}", "json_get_string".to_ascii_uppercase()) } } @@ -241,13 +241,13 @@ mod tests { use super::*; #[test] - fn test_get_by_path_int() { - let get_by_path_int = GetByPathInt; + fn test_json_get_int() { + let json_get_int = JsonGetInt; - assert_eq!("get_by_path_int", get_by_path_int.name()); + assert_eq!("json_get_int", json_get_int.name()); assert_eq!( ConcreteDataType::int64_datatype(), - get_by_path_int + json_get_int .return_type(&[ ConcreteDataType::json_datatype(), ConcreteDataType::string_datatype() @@ -255,7 +255,7 @@ mod tests { .unwrap() ); - assert!(matches!(get_by_path_int.signature(), + assert!(matches!(json_get_int.signature(), Signature { type_signature: TypeSignature::Exact(valid_types), volatility: Volatility::Immutable @@ -281,7 +281,7 @@ mod tests { let json_vector = BinaryVector::from_vec(jsonbs); let path_vector = StringVector::from_vec(paths); let args: Vec = vec![Arc::new(json_vector), Arc::new(path_vector)]; - let vector = get_by_path_int + let vector = json_get_int .eval(FunctionContext::default(), &args) .unwrap(); @@ -294,13 +294,13 @@ mod tests { } #[test] - fn test_get_by_path_float() { - let get_by_path_float = GetByPathFloat; + fn test_json_get_float() { + let json_get_float = JsonGetFloat; - assert_eq!("get_by_path_float", get_by_path_float.name()); + assert_eq!("json_get_float", json_get_float.name()); assert_eq!( ConcreteDataType::float64_datatype(), - get_by_path_float + json_get_float .return_type(&[ ConcreteDataType::json_datatype(), ConcreteDataType::string_datatype() @@ -308,7 +308,7 @@ mod tests { .unwrap() ); - assert!(matches!(get_by_path_float.signature(), + assert!(matches!(json_get_float.signature(), Signature { type_signature: TypeSignature::Exact(valid_types), volatility: Volatility::Immutable @@ -334,7 +334,7 @@ mod tests { let json_vector = BinaryVector::from_vec(jsonbs); let path_vector = StringVector::from_vec(paths); let args: Vec = vec![Arc::new(json_vector), Arc::new(path_vector)]; - let vector = get_by_path_float + let vector = json_get_float .eval(FunctionContext::default(), &args) .unwrap(); @@ -347,13 +347,13 @@ mod tests { } #[test] - fn test_get_by_path_boolean() { - let get_by_path_bool = GetByPathBool; + fn test_json_get_bool() { + let json_get_bool = JsonGetBool; - assert_eq!("get_by_path_bool", get_by_path_bool.name()); + assert_eq!("json_get_bool", json_get_bool.name()); assert_eq!( ConcreteDataType::boolean_datatype(), - get_by_path_bool + json_get_bool .return_type(&[ ConcreteDataType::json_datatype(), ConcreteDataType::string_datatype() @@ -361,7 +361,7 @@ mod tests { .unwrap() ); - assert!(matches!(get_by_path_bool.signature(), + assert!(matches!(json_get_bool.signature(), Signature { type_signature: TypeSignature::Exact(valid_types), volatility: Volatility::Immutable @@ -387,7 +387,7 @@ mod tests { let json_vector = BinaryVector::from_vec(jsonbs); let path_vector = StringVector::from_vec(paths); let args: Vec = vec![Arc::new(json_vector), Arc::new(path_vector)]; - let vector = get_by_path_bool + let vector = json_get_bool .eval(FunctionContext::default(), &args) .unwrap(); @@ -400,13 +400,13 @@ mod tests { } #[test] - fn test_get_by_path_string() { - let get_by_path_string = GetByPathString; + fn test_json_get_string() { + let json_get_string = JsonGetString; - assert_eq!("get_by_path_string", get_by_path_string.name()); + assert_eq!("json_get_string", json_get_string.name()); assert_eq!( ConcreteDataType::string_datatype(), - get_by_path_string + json_get_string .return_type(&[ ConcreteDataType::json_datatype(), ConcreteDataType::string_datatype() @@ -414,7 +414,7 @@ mod tests { .unwrap() ); - assert!(matches!(get_by_path_string.signature(), + assert!(matches!(json_get_string.signature(), Signature { type_signature: TypeSignature::Exact(valid_types), volatility: Volatility::Immutable @@ -440,7 +440,7 @@ mod tests { let json_vector = BinaryVector::from_vec(jsonbs); let path_vector = StringVector::from_vec(paths); let args: Vec = vec![Arc::new(json_vector), Arc::new(path_vector)]; - let vector = get_by_path_string + let vector = json_get_string .eval(FunctionContext::default(), &args) .unwrap(); diff --git a/tests/cases/standalone/common/function/json.result b/tests/cases/standalone/common/function/json.result index 66c7ce75eeb4..f2a59b9d7066 100644 --- a/tests/cases/standalone/common/function/json.result +++ b/tests/cases/standalone/common/function/json.result @@ -1,52 +1,52 @@ --- get_by_path functions -- -SELECT get_by_path_int(to_json('{"a": {"b": {"c": 1}}}'), 'a.b.c'); +-- json_get functions -- +SELECT json_get_int(to_json('{"a": {"b": {"c": 1}}}'), 'a.b.c'); -+------------------------------------------------------------------------+ -| get_by_path_int(to_json(Utf8("{"a": {"b": {"c": 1}}}")),Utf8("a.b.c")) | -+------------------------------------------------------------------------+ -| 1 | -+------------------------------------------------------------------------+ ++---------------------------------------------------------------------+ +| json_get_int(to_json(Utf8("{"a": {"b": {"c": 1}}}")),Utf8("a.b.c")) | ++---------------------------------------------------------------------+ +| 1 | ++---------------------------------------------------------------------+ -SELECT get_by_path_float(to_json('{"a": {"b": {"c": 1.234}}}'), 'a.b.c'); +SELECT json_get_float(to_json('{"a": {"b": {"c": 1.234}}}'), 'a:b.c'); -+------------------------------------------------------------------------------+ -| get_by_path_float(to_json(Utf8("{"a": {"b": {"c": 1.234}}}")),Utf8("a.b.c")) | -+------------------------------------------------------------------------------+ -| 1.234 | -+------------------------------------------------------------------------------+ ++---------------------------------------------------------------------------+ +| json_get_float(to_json(Utf8("{"a": {"b": {"c": 1.234}}}")),Utf8("a:b.c")) | ++---------------------------------------------------------------------------+ +| 1.234 | ++---------------------------------------------------------------------------+ -SELECT get_by_path_string(to_json('{"a": {"b": {"c": "foo"}}}'), 'a.b.c'); +SELECT json_get_string(to_json('{"a": {"b": {"c": "foo"}}}'), 'a.b:c'); -+-------------------------------------------------------------------------------+ -| get_by_path_string(to_json(Utf8("{"a": {"b": {"c": "foo"}}}")),Utf8("a.b.c")) | -+-------------------------------------------------------------------------------+ -| foo | -+-------------------------------------------------------------------------------+ ++----------------------------------------------------------------------------+ +| json_get_string(to_json(Utf8("{"a": {"b": {"c": "foo"}}}")),Utf8("a.b:c")) | ++----------------------------------------------------------------------------+ +| foo | ++----------------------------------------------------------------------------+ -SELECT get_by_path_bool(to_json('{"a": {"b": {"c": true}}}'), 'a.b.c'); +SELECT json_get_bool(to_json('{"a": {"b": {"c": true}}}'), 'a.b["c"]'); +----------------------------------------------------------------------------+ -| get_by_path_bool(to_json(Utf8("{"a": {"b": {"c": true}}}")),Utf8("a.b.c")) | +| json_get_bool(to_json(Utf8("{"a": {"b": {"c": true}}}")),Utf8("a.b["c"]")) | +----------------------------------------------------------------------------+ | true | +----------------------------------------------------------------------------+ -SELECT get_by_path_int(to_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); +SELECT json_get_int(to_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); + ++--------------------------------------------------------------------------+ +| json_get_int(to_json(Utf8("{"a": {"b": {"c": {"d": 1}}}}")),Utf8("a.b")) | ++--------------------------------------------------------------------------+ +| | ++--------------------------------------------------------------------------+ + +SELECT json_get_string(to_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); +-----------------------------------------------------------------------------+ -| get_by_path_int(to_json(Utf8("{"a": {"b": {"c": {"d": 1}}}}")),Utf8("a.b")) | +| json_get_string(to_json(Utf8("{"a": {"b": {"c": {"d": 1}}}}")),Utf8("a.b")) | +-----------------------------------------------------------------------------+ | | +-----------------------------------------------------------------------------+ -SELECT get_by_path_string(to_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); - -+--------------------------------------------------------------------------------+ -| get_by_path_string(to_json(Utf8("{"a": {"b": {"c": {"d": 1}}}}")),Utf8("a.b")) | -+--------------------------------------------------------------------------------+ -| | -+--------------------------------------------------------------------------------+ - -- test functions with table rows -- CREATE TABLE jsons(j JSON, ts timestamp time index); @@ -68,21 +68,21 @@ INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": true}}}'), 4); Affected Rows: 1 -SELECT get_by_path_int(j, 'a.b.c') FROM jsons; +SELECT json_get_int(j, 'a.b.c') FROM jsons; -+----------------------------------------+ -| get_by_path_int(jsons.j,Utf8("a.b.c")) | -+----------------------------------------+ -| 1 | -| | -| | -| 1 | -+----------------------------------------+ ++-------------------------------------+ +| json_get_int(jsons.j,Utf8("a.b.c")) | ++-------------------------------------+ +| 1 | +| | +| | +| 1 | ++-------------------------------------+ -SELECT get_by_path_float(j, 'a.b.c') FROM jsons; +SELECT json_get_float(j, 'a["b"].c') FROM jsons; +------------------------------------------+ -| get_by_path_float(jsons.j,Utf8("a.b.c")) | +| json_get_float(jsons.j,Utf8("a["b"].c")) | +------------------------------------------+ | 1.0 | | 1.234 | @@ -90,39 +90,108 @@ SELECT get_by_path_float(j, 'a.b.c') FROM jsons; | 1.0 | +------------------------------------------+ -SELECT get_by_path_string(j, 'a.b.c') FROM jsons; +SELECT json_get_string(j, 'a.b.c?(@ == 1)') FROM jsons; + ++-------------------------------------------------+ +| json_get_string(jsons.j,Utf8("a.b.c?(@ == 1)")) | ++-------------------------------------------------+ +| 1 | +| | +| | +| | ++-------------------------------------------------+ + +SELECT json_get_bool(j, 'a.b.c') FROM jsons; + ++--------------------------------------+ +| json_get_bool(jsons.j,Utf8("a.b.c")) | ++--------------------------------------+ +| | +| | +| | +| true | ++--------------------------------------+ + +SELECT json_get_int(j, 'a.b["c"]') FROM jsons; + ++----------------------------------------+ +| json_get_int(jsons.j,Utf8("a.b["c"]")) | ++----------------------------------------+ +| 1 | +| | +| | +| 1 | ++----------------------------------------+ + +DROP TABLE jsons; -+-------------------------------------------+ -| get_by_path_string(jsons.j,Utf8("a.b.c")) | -+-------------------------------------------+ -| 1 | -| 1.234 | -| foo | -| true | -+-------------------------------------------+ +Affected Rows: 0 -SELECT get_by_path_bool(j, 'a.b.c') FROM jsons; +-- test functions with arrays -- +CREATE TABLE jsons(j JSON, ts timestamp time index); -+-----------------------------------------+ -| get_by_path_bool(jsons.j,Utf8("a.b.c")) | -+-----------------------------------------+ -| | -| | -| | -| true | -+-----------------------------------------+ +Affected Rows: 0 -SELECT get_by_path_int(j, 'd') FROM jsons; +INSERT INTO jsons VALUES(to_json('["a", "bcde", "", "Long time ago, there is a little pig flying in the sky"]'), 1); + +Affected Rows: 1 + +INSERT INTO jsons VALUES(to_json('[true, false, false, false]'), 2); + +Affected Rows: 1 + +INSERT INTO jsons VALUES(to_json('[1, 0, -2147483649, 2147483648]'), 3); + +Affected Rows: 1 + +INSERT INTO jsons VALUES(to_json('[1.2, 3.1415926535897932384626, -3e123, 1e100]'), 4); + +Affected Rows: 1 + +SELECT json_get_int(j, '[0]') FROM jsons; + ++-----------------------------------+ +| json_get_int(jsons.j,Utf8("[0]")) | ++-----------------------------------+ +| | +| 1 | +| 1 | +| | ++-----------------------------------+ + +SELECT json_get_float(j, '[1]') FROM jsons; + ++-------------------------------------+ +| json_get_float(jsons.j,Utf8("[1]")) | ++-------------------------------------+ +| | +| 0.0 | +| 0.0 | +| 3.141592653589793 | ++-------------------------------------+ + +SELECT json_get_bool(j, '[2]') FROM jsons; +------------------------------------+ -| get_by_path_int(jsons.j,Utf8("d")) | +| json_get_bool(jsons.j,Utf8("[2]")) | +------------------------------------+ | | -| | +| false | | | | | +------------------------------------+ +SELECT json_get_string(j, '[3]') FROM jsons; + ++--------------------------------------------------------+ +| json_get_string(jsons.j,Utf8("[3]")) | ++--------------------------------------------------------+ +| Long time ago, there is a little pig flying in the sky | +| false | +| 2147483648 | +| 1e100 | ++--------------------------------------------------------+ + DROP TABLE jsons; Affected Rows: 0 diff --git a/tests/cases/standalone/common/function/json.sql b/tests/cases/standalone/common/function/json.sql index 5f902a4c223b..c6214ae0f8b9 100644 --- a/tests/cases/standalone/common/function/json.sql +++ b/tests/cases/standalone/common/function/json.sql @@ -1,15 +1,15 @@ --- get_by_path functions -- -SELECT get_by_path_int(to_json('{"a": {"b": {"c": 1}}}'), 'a.b.c'); +-- json_get functions -- +SELECT json_get_int(to_json('{"a": {"b": {"c": 1}}}'), 'a.b.c'); -SELECT get_by_path_float(to_json('{"a": {"b": {"c": 1.234}}}'), 'a.b.c'); +SELECT json_get_float(to_json('{"a": {"b": {"c": 1.234}}}'), 'a:b.c'); -SELECT get_by_path_string(to_json('{"a": {"b": {"c": "foo"}}}'), 'a.b.c'); +SELECT json_get_string(to_json('{"a": {"b": {"c": "foo"}}}'), 'a.b:c'); -SELECT get_by_path_bool(to_json('{"a": {"b": {"c": true}}}'), 'a.b.c'); +SELECT json_get_bool(to_json('{"a": {"b": {"c": true}}}'), 'a.b["c"]'); -SELECT get_by_path_int(to_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); +SELECT json_get_int(to_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); -SELECT get_by_path_string(to_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); +SELECT json_get_string(to_json('{"a": {"b": {"c": {"d": 1}}}}'), 'a.b'); -- test functions with table rows -- CREATE TABLE jsons(j JSON, ts timestamp time index); @@ -22,14 +22,35 @@ INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": "foo"}}}'), 3); INSERT INTO jsons VALUES(to_json('{"a": {"b": {"c": true}}}'), 4); -SELECT get_by_path_int(j, 'a.b.c') FROM jsons; +SELECT json_get_int(j, 'a.b.c') FROM jsons; -SELECT get_by_path_float(j, 'a.b.c') FROM jsons; +SELECT json_get_float(j, 'a["b"].c') FROM jsons; -SELECT get_by_path_string(j, 'a.b.c') FROM jsons; +SELECT json_get_string(j, 'a.b.c?(@ == 1)') FROM jsons; -SELECT get_by_path_bool(j, 'a.b.c') FROM jsons; +SELECT json_get_bool(j, 'a.b.c') FROM jsons; -SELECT get_by_path_int(j, 'd') FROM jsons; +SELECT json_get_int(j, 'a.b["c"]') FROM jsons; + +DROP TABLE jsons; + +-- test functions with arrays -- +CREATE TABLE jsons(j JSON, ts timestamp time index); + +INSERT INTO jsons VALUES(to_json('["a", "bcde", "", "Long time ago, there is a little pig flying in the sky"]'), 1); + +INSERT INTO jsons VALUES(to_json('[true, false, false, false]'), 2); + +INSERT INTO jsons VALUES(to_json('[1, 0, -2147483649, 2147483648]'), 3); + +INSERT INTO jsons VALUES(to_json('[1.2, 3.1415926535897932384626, -3e123, 1e100]'), 4); + +SELECT json_get_int(j, '[0]') FROM jsons; + +SELECT json_get_float(j, '[1]') FROM jsons; + +SELECT json_get_bool(j, '[2]') FROM jsons; + +SELECT json_get_string(j, '[3]') FROM jsons; DROP TABLE jsons;