Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Expressify str.json_path_match #15764

Merged
merged 2 commits into from
Apr 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 36 additions & 9 deletions crates/polars-ops/src/chunked_array/strings/json_path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,21 @@ use std::borrow::Cow;

use arrow::array::ValueSize;
use jsonpath_lib::PathCompiled;
use polars_core::prelude::arity::{broadcast_try_binary_elementwise, unary_elementwise};
use serde_json::Value;

use super::*;

pub fn extract_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option<Cow<'a, str>> {
pub fn extract_json(expr: &PathCompiled, json_str: &str) -> Option<String> {
serde_json::from_str(json_str).ok().and_then(|value| {
// TODO: a lot of heap allocations here. Improve json path by adding a take?
let result = expr.select(&value).ok()?;
let first = *result.first()?;

match first {
Value::String(s) => Some(Cow::Owned(s.clone())),
Value::String(s) => Some(s.clone()),
Value::Null => None,
v => Some(Cow::Owned(v.to_string())),
v => Some(v.to_string()),
}
})
}
Expand All @@ -41,12 +42,38 @@ pub fn select_json<'a>(expr: &PathCompiled, json_str: &'a str) -> Option<Cow<'a,
pub trait Utf8JsonPathImpl: AsString {
/// Extract json path, first match
/// Refer to <https://goessner.net/articles/JsonPath/>
fn json_path_match(&self, json_path: &str) -> PolarsResult<StringChunked> {
let pat = PathCompiled::compile(json_path)
.map_err(|e| polars_err!(ComputeError: "error compiling JSONpath expression {}", e))?;
Ok(self
.as_string()
.apply(|opt_s| opt_s.and_then(|s| extract_json(&pat, s))))
fn json_path_match(&self, json_path: &StringChunked) -> PolarsResult<StringChunked> {
let ca = self.as_string();
match (ca.len(), json_path.len()) {
(_, 1) => {
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We should not merge this branch with the following `broadcast_try_binary_elementwise` branch, because here we only need to compile the json_path once.

// SAFETY: `json_path` was verified to have exactly 1 element.
let opt_path = unsafe { json_path.get_unchecked(0) };
let out = if let Some(path) = opt_path {
let pat = PathCompiled::compile(path).map_err(
|e| polars_err!(ComputeError: "error compiling JSON path expression {}", e),
)?;
unary_elementwise(ca, |opt_s| opt_s.and_then(|s| extract_json(&pat, s)))
} else {
StringChunked::full_null(ca.name(), ca.len())
};
Ok(out)
},
(len_ca, len_path) if len_ca == 1 || len_ca == len_path => {
broadcast_try_binary_elementwise(ca, json_path, |opt_str, opt_path| {
match (opt_str, opt_path) {
(Some(str_val), Some(path)) => {
PathCompiled::compile(path)
Copy link
Collaborator Author

@reswqa reswqa Apr 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In theory, this could be further optimized with `FastFixedCache`, but `PathCompiled<'a>`, unlike `Regex`, has a lifetime annotation, which makes the borrow checker unhappy. We should be able to find a way to make it work, but I think it makes sense to leave this for the next perf PR.

.map_err(|e| polars_err!(ComputeError: "error compiling JSON path expression {}", e))
.map(|path| extract_json(&path, str_val))
},
_ => Ok(None),
}
})
},
(len_ca, len_path) => {
polars_bail!(ComputeError: "The length of `ca` and `json_path` should either 1 or the same, but `{}`, `{}` founded", len_ca, len_path)
},
}
}

/// Returns the inferred DataType for JSON values for each row
Expand Down
13 changes: 7 additions & 6 deletions crates/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ pub enum StringFunction {
infer_schema_len: Option<usize>,
},
#[cfg(feature = "extract_jsonpath")]
JsonPathMatch(String),
JsonPathMatch,
#[cfg(feature = "regex")]
Replace {
// negative is replace all
Expand Down Expand Up @@ -149,7 +149,7 @@ impl StringFunction {
#[cfg(feature = "extract_jsonpath")]
JsonDecode { dtype, .. } => mapper.with_opt_dtype(dtype.clone()),
#[cfg(feature = "extract_jsonpath")]
JsonPathMatch(_) => mapper.with_dtype(DataType::String),
JsonPathMatch => mapper.with_dtype(DataType::String),
LenBytes => mapper.with_dtype(DataType::UInt32),
LenChars => mapper.with_dtype(DataType::UInt32),
#[cfg(feature = "regex")]
Expand Down Expand Up @@ -221,7 +221,7 @@ impl Display for StringFunction {
#[cfg(feature = "extract_jsonpath")]
JsonDecode { .. } => "json_decode",
#[cfg(feature = "extract_jsonpath")]
JsonPathMatch(_) => "json_path_match",
JsonPathMatch => "json_path_match",
LenBytes => "len_bytes",
Lowercase => "lowercase",
LenChars => "len_chars",
Expand Down Expand Up @@ -374,7 +374,7 @@ impl From<StringFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
infer_schema_len,
} => map!(strings::json_decode, dtype.clone(), infer_schema_len),
#[cfg(feature = "extract_jsonpath")]
JsonPathMatch(pat) => map!(strings::json_path_match, &pat),
JsonPathMatch => map_as_slice!(strings::json_path_match),
#[cfg(feature = "find_many")]
ContainsMany {
ascii_case_insensitive,
Expand Down Expand Up @@ -994,7 +994,8 @@ pub(super) fn json_decode(
}

#[cfg(feature = "extract_jsonpath")]
pub(super) fn json_path_match(s: &Series, pat: &str) -> PolarsResult<Series> {
let ca = s.str()?;
pub(super) fn json_path_match(s: &[Series]) -> PolarsResult<Series> {
let ca = s[0].str()?;
let pat = s[1].str()?;
Ok(ca.json_path_match(pat)?.into_series())
}
10 changes: 7 additions & 3 deletions crates/polars-plan/src/dsl/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -562,8 +562,12 @@ impl StringNameSpace {
}

#[cfg(feature = "extract_jsonpath")]
pub fn json_path_match(self, pat: String) -> Expr {
self.0
.map_private(FunctionExpr::StringExpr(StringFunction::JsonPathMatch(pat)))
pub fn json_path_match(self, pat: Expr) -> Expr {
self.0.map_many_private(
FunctionExpr::StringExpr(StringFunction::JsonPathMatch),
&[pat],
false,
false,
)
}
}
3 changes: 2 additions & 1 deletion py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -1304,7 +1304,7 @@ def json_decode(
dtype = py_type_to_dtype(dtype)
return wrap_expr(self._pyexpr.str_json_decode(dtype, infer_schema_length))

def json_path_match(self, json_path: str) -> Expr:
def json_path_match(self, json_path: IntoExprColumn) -> Expr:
"""
Extract the first match of JSON string with the provided JSONPath expression.

Expand Down Expand Up @@ -1345,6 +1345,7 @@ def json_path_match(self, json_path: str) -> Expr:
│ {"a":true} ┆ true │
└────────────┴─────────┘
"""
json_path = parse_as_expression(json_path, str_as_lit=True)
return wrap_expr(self._pyexpr.str_json_path_match(json_path))

def decode(self, encoding: TransferEncoding, *, strict: bool = True) -> Expr:
Expand Down
2 changes: 1 addition & 1 deletion py-polars/polars/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -730,7 +730,7 @@ def json_decode(
]
"""

def json_path_match(self, json_path: str) -> Series:
def json_path_match(self, json_path: IntoExprColumn) -> Series:
"""
Extract the first match of json string with provided JSONPath expression.

Expand Down
4 changes: 2 additions & 2 deletions py-polars/src/expr/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -237,8 +237,8 @@ impl PyExpr {
}

#[cfg(feature = "extract_jsonpath")]
fn str_json_path_match(&self, pat: String) -> Self {
self.inner.clone().str().json_path_match(pat).into()
fn str_json_path_match(&self, pat: Self) -> Self {
self.inner.clone().str().json_path_match(pat.inner).into()
}

fn str_extract(&self, pat: Self, group_index: usize) -> Self {
Expand Down
28 changes: 28 additions & 0 deletions py-polars/tests/unit/namespaces/string/test_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -745,6 +745,34 @@ def test_jsonpath_single() -> None:
assert_series_equal(s.str.json_path_match("$.a"), expected)


def test_json_path_match() -> None:
    """Check `str.json_path_match` with expression-valued and literal JSONPath arguments."""
    json_docs = [
        '{"a":"1"}',
        None,
        '{"b":2}',
        '{"a":2.1, "b": "hello"}',
        '{"a":true}',
    ]
    json_paths = ["$.a", "$.a", "$.b", "$.b", None]
    frame = pl.DataFrame({"str": json_docs, "pat": json_paths})

    # all_expr: per-row document with per-row path.
    # str_expr: per-row document with a single literal path.
    # pat_expr: a single literal document with per-row paths.
    result = frame.select(
        all_expr=pl.col("str").str.json_path_match(pl.col("pat")),
        str_expr=pl.col("str").str.json_path_match("$.a"),
        pat_expr=pl.lit('{"a": 1.1, "b": 10}').str.json_path_match(pl.col("pat")),
    )

    # A null document or a null path yields a null result in the expected frame.
    want = pl.DataFrame(
        {
            "all_expr": ["1", None, "2", "hello", None],
            "str_expr": ["1", None, None, "2.1", "true"],
            "pat_expr": ["1.1", "1.1", "10", "10", None],
        }
    )
    assert_frame_equal(result, want)


def test_extract_regex() -> None:
s = pl.Series(
[
Expand Down
Loading