Skip to content

Commit

Permalink
feat(rust, python): add infer schema len to json_extract (pola-rs#9478)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored and c-peters committed Jul 14, 2023
1 parent d711519 commit e573e8e
Show file tree
Hide file tree
Showing 12 changed files with 87 additions and 31 deletions.
7 changes: 6 additions & 1 deletion polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,12 @@ decompress = ["polars-io/decompress"]
decompress-fast = ["polars-io/decompress-fast"]
mode = ["polars-core/mode", "polars-lazy/mode"]
take_opt_iter = ["polars-core/take_opt_iter"]
extract_jsonpath = ["polars-core/strings", "polars-ops/extract_jsonpath", "polars-ops/strings"]
extract_jsonpath = [
"polars-core/strings",
"polars-ops/extract_jsonpath",
"polars-ops/strings",
"polars-lazy/extract_jsonpath",
]
string_encoding = ["polars-ops/string_encoding", "polars-core/strings"]
binary_encoding = ["polars-ops/binary_encoding"]
groupby_list = ["polars-core/groupby_list"]
Expand Down
3 changes: 2 additions & 1 deletion polars/polars-json/src/ndjson/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,5 +144,6 @@ pub fn infer_iter<A: AsRef<str>>(rows: impl Iterator<Item = A>) -> PolarsResult<
}

let v: Vec<&DataType> = data_types.iter().collect();
Ok(crate::json::infer_schema::coerce_data_type(&v))
dbg!(&v);
dbg!(Ok(crate::json::infer_schema::coerce_data_type(&v)))
}
1 change: 1 addition & 0 deletions polars/polars-lazy/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ list_take = ["polars-ops/list_take", "polars-plan/list_take"]
list_count = ["polars-ops/list_count", "polars-plan/list_count"]

true_div = ["polars-plan/true_div"]
extract_jsonpath = ["polars-plan/extract_jsonpath", "polars-ops/extract_jsonpath"]

# operations
approx_unique = ["polars-plan/approx_unique"]
Expand Down
1 change: 1 addition & 0 deletions polars/polars-lazy/polars-plan/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ timezones = ["chrono-tz", "polars-time/timezones", "polars-core/timezones", "reg
binary_encoding = ["polars-ops/binary_encoding"]
true_div = []
nightly = ["polars-utils/nightly", "polars-ops/nightly"]
extract_jsonpath = []

# operations
approx_unique = ["polars-ops/approx_unique"]
Expand Down
5 changes: 5 additions & 0 deletions polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -556,6 +556,11 @@ impl From<StringFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
Explode => map!(strings::explode),
#[cfg(feature = "dtype-decimal")]
ToDecimal(infer_len) => map!(strings::to_decimal, infer_len),
#[cfg(feature = "extract_jsonpath")]
JsonExtract {
dtype,
infer_schema_len,
} => map!(strings::json_extract, dtype.clone(), infer_schema_len),
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -339,4 +339,10 @@ impl<'a> FieldsMapper<'a> {
}
Ok(first)
}

#[cfg(feature = "extract_jsonpath")]
pub(super) fn with_opt_dtype(&self, dtype: Option<DataType>) -> PolarsResult<Field> {
let dtype = dtype.unwrap_or(DataType::Unknown);
self.with_dtype(dtype)
}
}
19 changes: 19 additions & 0 deletions polars/polars-lazy/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ pub enum StringFunction {
Explode,
#[cfg(feature = "dtype-decimal")]
ToDecimal(usize),
#[cfg(feature = "extract_jsonpath")]
JsonExtract {
dtype: Option<DataType>,
infer_schema_len: Option<usize>,
},
}

impl StringFunction {
Expand Down Expand Up @@ -98,6 +103,8 @@ impl StringFunction {
Explode => mapper.with_same_dtype(),
#[cfg(feature = "dtype-decimal")]
ToDecimal(_) => mapper.with_dtype(DataType::Decimal(None, None)),
#[cfg(feature = "extract_jsonpath")]
JsonExtract { dtype, .. } => mapper.with_opt_dtype(dtype.clone()),
}
}
}
Expand Down Expand Up @@ -139,6 +146,8 @@ impl Display for StringFunction {
StringFunction::Explode => "explode",
#[cfg(feature = "dtype-decimal")]
StringFunction::ToDecimal(_) => "to_decimal",
#[cfg(feature = "extract_jsonpath")]
StringFunction::JsonExtract { .. } => "json_extract",
};

write!(f, "str.{s}")
Expand Down Expand Up @@ -675,3 +684,13 @@ pub(super) fn to_decimal(s: &Series, infer_len: usize) -> PolarsResult<Series> {
let ca = s.utf8()?;
ca.to_decimal(infer_len)
}

#[cfg(feature = "extract_jsonpath")]
pub(super) fn json_extract(
s: &Series,
dtype: Option<DataType>,
infer_schema_len: Option<usize>,
) -> PolarsResult<Series> {
let ca = s.utf8()?;
ca.json_extract(dtype, infer_schema_len)
}
9 changes: 9 additions & 0 deletions polars/polars-lazy/polars-plan/src/dsl/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -476,4 +476,13 @@ impl StringNameSpace {
self.0
.apply_private(FunctionExpr::StringExpr(StringFunction::Explode))
}

#[cfg(feature = "extract_jsonpath")]
pub fn json_extract(self, dtype: Option<DataType>, infer_schema_len: Option<usize>) -> Expr {
self.0
.map_private(FunctionExpr::StringExpr(StringFunction::JsonExtract {
dtype,
infer_schema_len,
}))
}
}
23 changes: 16 additions & 7 deletions polars/polars-ops/src/chunked_array/strings/json_path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,15 @@ pub trait Utf8JsonPathImpl: AsUtf8 {
}

/// Extracts a typed-JSON value for each row in the Utf8Chunked
fn json_extract(&self, dtype: Option<DataType>) -> PolarsResult<Series> {
fn json_extract(
&self,
dtype: Option<DataType>,
infer_schema_len: Option<usize>,
) -> PolarsResult<Series> {
let ca = self.as_utf8();
let dtype = match dtype {
Some(dt) => dt,
None => ca.json_infer(None)?,
None => ca.json_infer(infer_schema_len)?,
};

let buf_size = ca.get_values_size() + ca.null_count() * "null".len();
Expand All @@ -92,9 +96,14 @@ pub trait Utf8JsonPathImpl: AsUtf8 {
.apply_on_opt(|opt_s| opt_s.and_then(|s| select_json(&pat, s))))
}

fn json_path_extract(&self, json_path: &str, dtype: Option<DataType>) -> PolarsResult<Series> {
fn json_path_extract(
&self,
json_path: &str,
dtype: Option<DataType>,
infer_schema_len: Option<usize>,
) -> PolarsResult<Series> {
let selected_json = self.as_utf8().json_path_select(json_path)?;
selected_json.json_extract(dtype)
selected_json.json_extract(dtype, infer_schema_len)
}
}

Expand Down Expand Up @@ -178,11 +187,11 @@ mod tests {
let expected_dtype = expected_series.dtype().clone();

assert!(ca
.json_extract(None)
.json_extract(None, None)
.unwrap()
.series_equal_missing(&expected_series));
assert!(ca
.json_extract(Some(expected_dtype))
.json_extract(Some(expected_dtype), None)
.unwrap()
.series_equal_missing(&expected_series));
}
Expand Down Expand Up @@ -253,7 +262,7 @@ mod tests {
);

assert!(ca
.json_path_extract("$.b[:].c", None)
.json_path_extract("$.b[:].c", None, None)
.unwrap()
.into_series()
.series_equal_missing(&c_series));
Expand Down
9 changes: 7 additions & 2 deletions py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -942,7 +942,9 @@ def starts_with(self, prefix: str | Expr) -> Expr:
prefix = parse_as_expression(prefix, str_as_lit=True)
return wrap_expr(self._pyexpr.str_starts_with(prefix))

def json_extract(self, dtype: PolarsDataType | None = None) -> Expr:
def json_extract(
self, dtype: PolarsDataType | None = None, infer_schema_length: int | None = 100
) -> Expr:
"""
Parse string values as JSON.
Expand All @@ -953,6 +955,9 @@ def json_extract(self, dtype: PolarsDataType | None = None) -> Expr:
dtype
The dtype to cast the extracted value to. If None, the dtype will be
inferred from the JSON value.
infer_schema_length
How many rows to parse to determine the schema.
If ``None`` all rows are used.
Examples
--------
Expand Down Expand Up @@ -980,7 +985,7 @@ def json_extract(self, dtype: PolarsDataType | None = None) -> Expr:
"""
if dtype is not None:
dtype = py_type_to_dtype(dtype)
return wrap_expr(self._pyexpr.str_json_extract(dtype))
return wrap_expr(self._pyexpr.str_json_extract(dtype, infer_schema_length))

def json_path_match(self, json_path: str) -> Expr:
"""
Expand Down
7 changes: 6 additions & 1 deletion py-polars/polars/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,9 @@ def encode(self, encoding: TransferEncoding) -> Series:
"""

def json_extract(self, dtype: PolarsDataType | None = None) -> Series:
def json_extract(
self, dtype: PolarsDataType | None = None, infer_schema_length: int | None = 100
) -> Series:
"""
Parse string values as JSON.
Expand All @@ -570,6 +572,9 @@ def json_extract(self, dtype: PolarsDataType | None = None) -> Series:
dtype
The dtype to cast the extracted value to. If None, the dtype will be
inferred from the JSON value.
infer_schema_length
How many rows to parse to determine the schema.
If ``None`` all rows are used.
Examples
--------
Expand Down
28 changes: 9 additions & 19 deletions py-polars/src/expr/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,26 +219,16 @@ impl PyExpr {
}

#[cfg(feature = "extract_jsonpath")]
fn str_json_extract(&self, dtype: Option<Wrap<DataType>>) -> Self {
fn str_json_extract(
&self,
dtype: Option<Wrap<DataType>>,
infer_schema_len: Option<usize>,
) -> Self {
let dtype = dtype.map(|wrap| wrap.0);

let output_type = match dtype.clone() {
Some(dtype) => GetOutput::from_type(dtype),
None => GetOutput::from_type(DataType::Unknown),
};

let function = move |s: Series| {
let ca = s.utf8()?;
match ca.json_extract(dtype.clone()) {
Ok(ca) => Ok(Some(ca.into_series())),
Err(e) => Err(PolarsError::ComputeError(format!("{e:?}").into())),
}
};

self.clone()
.inner
.map(function, output_type)
.with_fmt("str.json_extract")
self.inner
.clone()
.str()
.json_extract(dtype, infer_schema_len)
.into()
}

Expand Down

0 comments on commit e573e8e

Please sign in to comment.