Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(rust, python): add infer schema len to json_extract #9478

Merged
merged 2 commits into from
Jun 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion polars/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,12 @@ decompress = ["polars-io/decompress"]
decompress-fast = ["polars-io/decompress-fast"]
mode = ["polars-core/mode", "polars-lazy/mode"]
take_opt_iter = ["polars-core/take_opt_iter"]
extract_jsonpath = ["polars-core/strings", "polars-ops/extract_jsonpath", "polars-ops/strings"]
extract_jsonpath = [
"polars-core/strings",
"polars-ops/extract_jsonpath",
"polars-ops/strings",
"polars-lazy/extract_jsonpath",
]
string_encoding = ["polars-ops/string_encoding", "polars-core/strings"]
binary_encoding = ["polars-ops/binary_encoding"]
groupby_list = ["polars-core/groupby_list"]
Expand Down
3 changes: 2 additions & 1 deletion polars/polars-json/src/ndjson/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -144,5 +144,6 @@ pub fn infer_iter<A: AsRef<str>>(rows: impl Iterator<Item = A>) -> PolarsResult<
}

let v: Vec<&DataType> = data_types.iter().collect();
Ok(crate::json::infer_schema::coerce_data_type(&v))
dbg!(&v);
dbg!(Ok(crate::json::infer_schema::coerce_data_type(&v)))
}
1 change: 1 addition & 0 deletions polars/polars-lazy/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ list_take = ["polars-ops/list_take", "polars-plan/list_take"]
list_count = ["polars-ops/list_count", "polars-plan/list_count"]

true_div = ["polars-plan/true_div"]
extract_jsonpath = ["polars-plan/extract_jsonpath", "polars-ops/extract_jsonpath"]

# operations
approx_unique = ["polars-plan/approx_unique"]
Expand Down
1 change: 1 addition & 0 deletions polars/polars-lazy/polars-plan/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ timezones = ["chrono-tz", "polars-time/timezones", "polars-core/timezones", "reg
binary_encoding = ["polars-ops/binary_encoding"]
true_div = []
nightly = ["polars-utils/nightly", "polars-ops/nightly"]
extract_jsonpath = []

# operations
approx_unique = ["polars-ops/approx_unique"]
Expand Down
5 changes: 5 additions & 0 deletions polars/polars-lazy/polars-plan/src/dsl/function_expr/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -556,6 +556,11 @@ impl From<StringFunction> for SpecialEq<Arc<dyn SeriesUdf>> {
Explode => map!(strings::explode),
#[cfg(feature = "dtype-decimal")]
ToDecimal(infer_len) => map!(strings::to_decimal, infer_len),
#[cfg(feature = "extract_jsonpath")]
JsonExtract {
dtype,
infer_schema_len,
} => map!(strings::json_extract, dtype.clone(), infer_schema_len),
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -339,4 +339,10 @@ impl<'a> FieldsMapper<'a> {
}
Ok(first)
}

#[cfg(feature = "extract_jsonpath")]
pub(super) fn with_opt_dtype(&self, dtype: Option<DataType>) -> PolarsResult<Field> {
let dtype = dtype.unwrap_or(DataType::Unknown);
self.with_dtype(dtype)
}
}
19 changes: 19 additions & 0 deletions polars/polars-lazy/polars-plan/src/dsl/function_expr/strings.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ pub enum StringFunction {
Explode,
#[cfg(feature = "dtype-decimal")]
ToDecimal(usize),
#[cfg(feature = "extract_jsonpath")]
JsonExtract {
dtype: Option<DataType>,
infer_schema_len: Option<usize>,
},
}

impl StringFunction {
Expand Down Expand Up @@ -98,6 +103,8 @@ impl StringFunction {
Explode => mapper.with_same_dtype(),
#[cfg(feature = "dtype-decimal")]
ToDecimal(_) => mapper.with_dtype(DataType::Decimal(None, None)),
#[cfg(feature = "extract_jsonpath")]
JsonExtract { dtype, .. } => mapper.with_opt_dtype(dtype.clone()),
}
}
}
Expand Down Expand Up @@ -139,6 +146,8 @@ impl Display for StringFunction {
StringFunction::Explode => "explode",
#[cfg(feature = "dtype-decimal")]
StringFunction::ToDecimal(_) => "to_decimal",
#[cfg(feature = "extract_jsonpath")]
StringFunction::JsonExtract { .. } => "json_extract",
};

write!(f, "str.{s}")
Expand Down Expand Up @@ -675,3 +684,13 @@ pub(super) fn to_decimal(s: &Series, infer_len: usize) -> PolarsResult<Series> {
let ca = s.utf8()?;
ca.to_decimal(infer_len)
}

#[cfg(feature = "extract_jsonpath")]
pub(super) fn json_extract(
s: &Series,
dtype: Option<DataType>,
infer_schema_len: Option<usize>,
) -> PolarsResult<Series> {
let ca = s.utf8()?;
ca.json_extract(dtype, infer_schema_len)
}
9 changes: 9 additions & 0 deletions polars/polars-lazy/polars-plan/src/dsl/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -476,4 +476,13 @@ impl StringNameSpace {
self.0
.apply_private(FunctionExpr::StringExpr(StringFunction::Explode))
}

#[cfg(feature = "extract_jsonpath")]
pub fn json_extract(self, dtype: Option<DataType>, infer_schema_len: Option<usize>) -> Expr {
self.0
.map_private(FunctionExpr::StringExpr(StringFunction::JsonExtract {
dtype,
infer_schema_len,
}))
}
}
23 changes: 16 additions & 7 deletions polars/polars-ops/src/chunked_array/strings/json_path.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,15 @@ pub trait Utf8JsonPathImpl: AsUtf8 {
}

/// Extracts a typed-JSON value for each row in the Utf8Chunked
fn json_extract(&self, dtype: Option<DataType>) -> PolarsResult<Series> {
fn json_extract(
&self,
dtype: Option<DataType>,
infer_schema_len: Option<usize>,
) -> PolarsResult<Series> {
let ca = self.as_utf8();
let dtype = match dtype {
Some(dt) => dt,
None => ca.json_infer(None)?,
None => ca.json_infer(infer_schema_len)?,
};

let buf_size = ca.get_values_size() + ca.null_count() * "null".len();
Expand All @@ -92,9 +96,14 @@ pub trait Utf8JsonPathImpl: AsUtf8 {
.apply_on_opt(|opt_s| opt_s.and_then(|s| select_json(&pat, s))))
}

fn json_path_extract(&self, json_path: &str, dtype: Option<DataType>) -> PolarsResult<Series> {
fn json_path_extract(
&self,
json_path: &str,
dtype: Option<DataType>,
infer_schema_len: Option<usize>,
) -> PolarsResult<Series> {
let selected_json = self.as_utf8().json_path_select(json_path)?;
selected_json.json_extract(dtype)
selected_json.json_extract(dtype, infer_schema_len)
}
}

Expand Down Expand Up @@ -178,11 +187,11 @@ mod tests {
let expected_dtype = expected_series.dtype().clone();

assert!(ca
.json_extract(None)
.json_extract(None, None)
.unwrap()
.series_equal_missing(&expected_series));
assert!(ca
.json_extract(Some(expected_dtype))
.json_extract(Some(expected_dtype), None)
.unwrap()
.series_equal_missing(&expected_series));
}
Expand Down Expand Up @@ -253,7 +262,7 @@ mod tests {
);

assert!(ca
.json_path_extract("$.b[:].c", None)
.json_path_extract("$.b[:].c", None, None)
.unwrap()
.into_series()
.series_equal_missing(&c_series));
Expand Down
9 changes: 7 additions & 2 deletions py-polars/polars/expr/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -942,7 +942,9 @@ def starts_with(self, prefix: str | Expr) -> Expr:
prefix = parse_as_expression(prefix, str_as_lit=True)
return wrap_expr(self._pyexpr.str_starts_with(prefix))

def json_extract(self, dtype: PolarsDataType | None = None) -> Expr:
def json_extract(
self, dtype: PolarsDataType | None = None, infer_schema_length: int | None = 100
) -> Expr:
"""
Parse string values as JSON.

Expand All @@ -953,6 +955,9 @@ def json_extract(self, dtype: PolarsDataType | None = None) -> Expr:
dtype
The dtype to cast the extracted value to. If None, the dtype will be
inferred from the JSON value.
infer_schema_length
How many rows to parse to determine the schema.
If ``None`` all rows are used.

Examples
--------
Expand Down Expand Up @@ -980,7 +985,7 @@ def json_extract(self, dtype: PolarsDataType | None = None) -> Expr:
"""
if dtype is not None:
dtype = py_type_to_dtype(dtype)
return wrap_expr(self._pyexpr.str_json_extract(dtype))
return wrap_expr(self._pyexpr.str_json_extract(dtype, infer_schema_length))

def json_path_match(self, json_path: str) -> Expr:
"""
Expand Down
7 changes: 6 additions & 1 deletion py-polars/polars/series/string.py
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,9 @@ def encode(self, encoding: TransferEncoding) -> Series:

"""

def json_extract(self, dtype: PolarsDataType | None = None) -> Series:
def json_extract(
self, dtype: PolarsDataType | None = None, infer_schema_length: int | None = 100
) -> Series:
"""
Parse string values as JSON.

Expand All @@ -570,6 +572,9 @@ def json_extract(self, dtype: PolarsDataType | None = None) -> Series:
dtype
The dtype to cast the extracted value to. If None, the dtype will be
inferred from the JSON value.
infer_schema_length
How many rows to parse to determine the schema.
If ``None`` all rows are used.

Examples
--------
Expand Down
28 changes: 9 additions & 19 deletions py-polars/src/expr/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,26 +219,16 @@ impl PyExpr {
}

#[cfg(feature = "extract_jsonpath")]
fn str_json_extract(&self, dtype: Option<Wrap<DataType>>) -> Self {
fn str_json_extract(
&self,
dtype: Option<Wrap<DataType>>,
infer_schema_len: Option<usize>,
) -> Self {
let dtype = dtype.map(|wrap| wrap.0);

let output_type = match dtype.clone() {
Some(dtype) => GetOutput::from_type(dtype),
None => GetOutput::from_type(DataType::Unknown),
};

let function = move |s: Series| {
let ca = s.utf8()?;
match ca.json_extract(dtype.clone()) {
Ok(ca) => Ok(Some(ca.into_series())),
Err(e) => Err(PolarsError::ComputeError(format!("{e:?}").into())),
}
};

self.clone()
.inner
.map(function, output_type)
.with_fmt("str.json_extract")
self.inner
.clone()
.str()
.json_extract(dtype, infer_schema_len)
.into()
}

Expand Down