From 888403564b735eb720bc30af515f1e0fcaca62b3 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Fri, 19 Apr 2024 14:39:47 +0800 Subject: [PATCH 1/2] feat: Expressify `str.json_path_match` --- .../src/chunked_array/strings/json_path.rs | 45 +++++++++++++++---- .../src/dsl/function_expr/strings.rs | 13 +++--- crates/polars-plan/src/dsl/string.rs | 10 +++-- py-polars/polars/expr/string.py | 3 +- py-polars/polars/series/string.py | 2 +- py-polars/src/expr/string.rs | 4 +- .../unit/namespaces/string/test_string.py | 28 ++++++++++++ 7 files changed, 83 insertions(+), 22 deletions(-) diff --git a/crates/polars-ops/src/chunked_array/strings/json_path.rs b/crates/polars-ops/src/chunked_array/strings/json_path.rs index 3b8edcaea962..b5f4d2299267 100644 --- a/crates/polars-ops/src/chunked_array/strings/json_path.rs +++ b/crates/polars-ops/src/chunked_array/strings/json_path.rs @@ -2,20 +2,21 @@ use std::borrow::Cow; use arrow::array::ValueSize; use jsonpath_lib::PathCompiled; +use polars_core::prelude::arity::{broadcast_try_binary_elementwise, unary_elementwise}; use serde_json::Value; use super::*; -pub fn extract_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option> { +pub fn extract_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option { serde_json::from_str(json_str).ok().and_then(|value| { // TODO: a lot of heap allocations here. Improve json path by adding a take? let result = expr.select(&value).ok()?; let first = *result.first()?; match first { - Value::String(s) => Some(Cow::Owned(s.clone())), + Value::String(s) => Some(s.clone()), Value::Null => None, - v => Some(Cow::Owned(v.to_string())), + v => Some(v.to_string()), } }) } @@ -41,12 +42,38 @@ pub fn select_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option - fn json_path_match(&self, json_path: &str) -> PolarsResult { - let pat = PathCompiled::compile(json_path) - .map_err(|e| polars_err!(ComputeError: "error compiling JSONpath expression {}", e))?; - Ok(self - .as_string() - .apply(|opt_s| opt_s.and_then(|s| extract_json(&pat, s)))) + fn json_path_match(&self, json_path: &StringChunked) -> PolarsResult { + let ca = self.as_string(); + match (ca.len(), json_path.len()) { + (_, 1) => { + // SAFETY: `json_path` was verified to have exactly 1 element. + let opt_path = unsafe { json_path.get_unchecked(0) }; + let out = if let Some(path) = opt_path { + let pat = PathCompiled::compile(path).map_err( + |e| polars_err!(ComputeError: "error compiling JSON path expression {}", e), + )?; + unary_elementwise(ca, |opt_s| opt_s.and_then(|s| extract_json(&pat, s))) + } else { + StringChunked::full_null(ca.name(), ca.len()) + }; + Ok(out) + }, + (len_ca, len_path) if len_ca == 1 || len_ca == len_path => { + broadcast_try_binary_elementwise(ca, json_path, |opt_str, opt_path| { + match (opt_str, opt_path) { + (Some(str_val), Some(path)) => { + PathCompiled::compile(path) + .map_err(|e| polars_err!(ComputeError: "error compiling JSON path expression {}", e)) + .map(|path| extract_json(&path, str_val)) + }, + _ => Ok(None), + } + }) + }, + (len_ca, len_path) => { + polars_bail!(ComputeError: "The length of `ca` and `json_path` should either 1 or the same, but `{}`, `{}` founded", len_ca, len_path) + }, + } } /// Returns the inferred DataType for JSON values for each row diff --git a/crates/polars-plan/src/dsl/function_expr/strings.rs b/crates/polars-plan/src/dsl/function_expr/strings.rs index 813907f51d23..4c932f8a131f 100644 --- a/crates/polars-plan/src/dsl/function_expr/strings.rs +++ b/crates/polars-plan/src/dsl/function_expr/strings.rs @@ -63,7 +63,7 @@ pub enum StringFunction { infer_schema_len: Option, }, #[cfg(feature = "extract_jsonpath")] - JsonPathMatch(String), + JsonPathMatch, #[cfg(feature = "regex")] Replace { // negative is replace all @@ -149,7 +149,7 @@ impl StringFunction { #[cfg(feature = "extract_jsonpath")] JsonDecode { dtype, .. } => mapper.with_opt_dtype(dtype.clone()), #[cfg(feature = "extract_jsonpath")] - JsonPathMatch(_) => mapper.with_dtype(DataType::String), + JsonPathMatch => mapper.with_dtype(DataType::String), LenBytes => mapper.with_dtype(DataType::UInt32), LenChars => mapper.with_dtype(DataType::UInt32), #[cfg(feature = "regex")] @@ -221,7 +221,7 @@ impl Display for StringFunction { #[cfg(feature = "extract_jsonpath")] JsonDecode { .. } => "json_decode", #[cfg(feature = "extract_jsonpath")] - JsonPathMatch(_) => "json_path_match", + JsonPathMatch => "json_path_match", LenBytes => "len_bytes", Lowercase => "lowercase", LenChars => "len_chars", @@ -374,7 +374,7 @@ impl From for SpecialEq> { infer_schema_len, } => map!(strings::json_decode, dtype.clone(), infer_schema_len), #[cfg(feature = "extract_jsonpath")] - JsonPathMatch(pat) => map!(strings::json_path_match, &pat), + JsonPathMatch => map_as_slice!(strings::json_path_match), #[cfg(feature = "find_many")] ContainsMany { ascii_case_insensitive, @@ -994,7 +994,8 @@ pub(super) fn json_decode( } #[cfg(feature = "extract_jsonpath")] -pub(super) fn json_path_match(s: &Series, pat: &str) -> PolarsResult { - let ca = s.str()?; +pub(super) fn json_path_match(s: &[Series]) -> PolarsResult { + let ca = s[0].str()?; + let pat = s[1].str()?; Ok(ca.json_path_match(pat)?.into_series()) } diff --git a/crates/polars-plan/src/dsl/string.rs b/crates/polars-plan/src/dsl/string.rs index 131bea09d371..e5aa3fc58119 100644 --- a/crates/polars-plan/src/dsl/string.rs +++ b/crates/polars-plan/src/dsl/string.rs @@ -562,8 +562,12 @@ impl StringNameSpace { } #[cfg(feature = "extract_jsonpath")] - pub fn json_path_match(self, pat: String) -> Expr { - self.0 - .map_private(FunctionExpr::StringExpr(StringFunction::JsonPathMatch(pat))) + pub fn json_path_match(self, pat: Expr) -> Expr { + self.0.map_many_private( + FunctionExpr::StringExpr(StringFunction::JsonPathMatch), + &[pat], + false, + false, + ) } } diff --git a/py-polars/polars/expr/string.py b/py-polars/polars/expr/string.py index af8acf399f43..47319b9f2893 100644 --- a/py-polars/polars/expr/string.py +++ b/py-polars/polars/expr/string.py @@ -1304,7 +1304,7 @@ def json_decode( dtype = py_type_to_dtype(dtype) return wrap_expr(self._pyexpr.str_json_decode(dtype, infer_schema_length)) - def json_path_match(self, json_path: str) -> Expr: + def json_path_match(self, json_path: IntoExprColumn) -> Expr: """ Extract the first match of JSON string with the provided JSONPath expression. @@ -1345,6 +1345,7 @@ def json_path_match(self, json_path: str) -> Expr: │ {"a":true} ┆ true │ └────────────┴─────────┘ """ + json_path = parse_as_expression(json_path, str_as_lit=True) return wrap_expr(self._pyexpr.str_json_path_match(json_path)) def decode(self, encoding: TransferEncoding, *, strict: bool = True) -> Expr: diff --git a/py-polars/polars/series/string.py b/py-polars/polars/series/string.py index c21217477dfb..9979d1041622 100644 --- a/py-polars/polars/series/string.py +++ b/py-polars/polars/series/string.py @@ -730,7 +730,7 @@ def json_decode( ] """ - def json_path_match(self, json_path: str) -> Series: + def json_path_match(self, json_path: IntoExprColumn) -> Series: """ Extract the first match of json string with provided JSONPath expression. diff --git a/py-polars/src/expr/string.rs b/py-polars/src/expr/string.rs index bcf17614de37..eb623c0dce8e 100644 --- a/py-polars/src/expr/string.rs +++ b/py-polars/src/expr/string.rs @@ -237,8 +237,8 @@ impl PyExpr { } #[cfg(feature = "extract_jsonpath")] - fn str_json_path_match(&self, pat: String) -> Self { - self.inner.clone().str().json_path_match(pat).into() + fn str_json_path_match(&self, pat: Self) -> Self { + self.inner.clone().str().json_path_match(pat.inner).into() } fn str_extract(&self, pat: Self, group_index: usize) -> Self { diff --git a/py-polars/tests/unit/namespaces/string/test_string.py b/py-polars/tests/unit/namespaces/string/test_string.py index 07765257a133..8d5562bcc5a0 100644 --- a/py-polars/tests/unit/namespaces/string/test_string.py +++ b/py-polars/tests/unit/namespaces/string/test_string.py @@ -745,6 +745,34 @@ def test_jsonpath_single() -> None: assert_series_equal(s.str.json_path_match("$.a"), expected) +def test_json_path_match() -> None: + df = pl.DataFrame( + { + "str": [ + '{"a":"1"}', + None, + '{"b":2}', + '{"a":2.1, "b": "hello"}', + '{"a":true}', + ], + "pat": ["$.a", "$.a", "$.b", "$.b", None], + } + ) + out = df.select( + all_expr=pl.col("str").str.json_path_match(pl.col("pat")), + str_expr=pl.col("str").str.json_path_match("$.a"), + pat_expr=pl.lit('{"a": 1.1, "b": 10}').str.json_path_match(pl.col("pat")), + ) + expected = pl.DataFrame( + { + "all_expr": ["1", None, "2", "hello", None], + "str_expr": ["1", None, None, "2.1", "true"], + "pat_expr": ["1.1", "1.1", "10", "10", None], + } + ) + assert_frame_equal(out, expected) + + def test_extract_regex() -> None: s = pl.Series( [ From d3e783c4bd13456db6f86a19f5956513ac752536 Mon Sep 17 00:00:00 2001 From: Weijie Guo Date: Fri, 19 Apr 2024 16:38:10 +0800 Subject: [PATCH 2/2] clippy --- crates/polars-ops/src/chunked_array/strings/json_path.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/polars-ops/src/chunked_array/strings/json_path.rs b/crates/polars-ops/src/chunked_array/strings/json_path.rs index b5f4d2299267..6c54de338676 100644 --- a/crates/polars-ops/src/chunked_array/strings/json_path.rs +++ b/crates/polars-ops/src/chunked_array/strings/json_path.rs @@ -7,7 +7,7 @@ use serde_json::Value; use super::*; -pub fn extract_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option { +pub fn extract_json(expr: &PathCompiled, json_str: &str) -> Option { serde_json::from_str(json_str).ok().and_then(|value| { // TODO: a lot of heap allocations here. Improve json path by adding a take? let result = expr.select(&value).ok()?;