From a3909d07f31f80f1e30108e93bf5fefcca3afcd8 Mon Sep 17 00:00:00 2001 From: Shobhit Singh Date: Tue, 5 Nov 2024 19:35:01 +0000 Subject: [PATCH] feat: support `json_extract_string_array` in the `bigquery` module --- bigframes/bigquery/__init__.py | 2 + bigframes/bigquery/_operations/json.py | 119 ++++++++++++++++--- bigframes/core/compile/scalar_op_compiler.py | 14 +++ bigframes/operations/__init__.py | 37 ++++-- tests/system/small/bigquery/test_json.py | 40 ++++++- 5 files changed, 188 insertions(+), 24 deletions(-) diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py index 0b2d2d5aeb..a39914d6e7 100644 --- a/bigframes/bigquery/__init__.py +++ b/bigframes/bigquery/__init__.py @@ -25,6 +25,7 @@ from bigframes.bigquery._operations.json import ( json_extract, json_extract_array, + json_extract_string_array, json_set, ) from bigframes.bigquery._operations.search import create_vector_index, vector_search @@ -37,6 +38,7 @@ "json_set", "json_extract", "json_extract_array", + "json_extract_string_array", "approx_top_count", "struct", "create_vector_index", diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py index d3c3c97a9c..152c93186a 100644 --- a/bigframes/bigquery/_operations/json.py +++ b/bigframes/bigquery/_operations/json.py @@ -21,14 +21,17 @@ from __future__ import annotations -from typing import Any, Sequence, Tuple +from typing import Any, cast, Optional, Sequence, Tuple, Union +import bigframes.dtypes import bigframes.operations as ops import bigframes.series as series +from . 
import array + def json_set( - series: series.Series, + input: series.Series, json_path_value_pairs: Sequence[Tuple[str, Any]], ) -> series.Series: """Produces a new JSON value within a Series by inserting or replacing values at @@ -47,7 +50,7 @@ def json_set( Name: data, dtype: string Args: - series (bigframes.series.Series): + input (bigframes.series.Series): The Series containing JSON data (as native JSON objects or JSON-formatted strings). json_path_value_pairs (Sequence[Tuple[str, Any]]): Pairs of JSON path and the new value to insert/replace. @@ -59,6 +62,7 @@ def json_set( # SQLGlot parser does not support the "create_if_missing => true" syntax, so # create_if_missing is not currently implemented. + result = input for json_path_value_pair in json_path_value_pairs: if len(json_path_value_pair) != 2: raise ValueError( @@ -67,14 +71,14 @@ def json_set( ) json_path, json_value = json_path_value_pair - series = series._apply_binary_op( + result = result._apply_binary_op( json_value, ops.JSONSet(json_path=json_path), alignment="left" ) - return series + return result def json_extract( - series: series.Series, + input: series.Series, json_path: str, ) -> series.Series: """Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON` @@ -93,7 +97,7 @@ def json_extract( dtype: string Args: - series (bigframes.series.Series): + input (bigframes.series.Series): The Series containing JSON data (as native JSON objects or JSON-formatted strings). json_path (str): The JSON path identifying the data that you want to obtain from the input. @@ -101,16 +105,16 @@ def json_extract( Returns: bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. 
""" - return series._apply_unary_op(ops.JSONExtract(json_path=json_path)) + return input._apply_unary_op(ops.JSONExtract(json_path=json_path)) def json_extract_array( - series: series.Series, + input: series.Series, json_path: str = "$", ) -> series.Series: - """Extracts a JSON array and converts it to a SQL array of JSON-formatted `STRING` or `JSON` - values. This function uses single quotes and brackets to escape invalid JSONPath - characters in JSON keys. + """Extracts a JSON array and converts it to a SQL array of JSON-formatted + `STRING` or `JSON` values. This function uses single quotes and brackets to + escape invalid JSONPath characters in JSON keys. **Examples:** @@ -124,13 +128,98 @@ def json_extract_array( 1 ['4' '5'] dtype: list[pyarrow] + >>> s = bpd.Series([ + ... '{"fruits": [{"name": "apple"}, {"name": "cherry"}]}', + ... '{"fruits": [{"name": "guava"}, {"name": "grapes"}]}' + ... ]) + >>> bbq.json_extract_array(s, "$.fruits") + 0 ['{"name":"apple"}' '{"name":"cherry"}'] + 1 ['{"name":"guava"}' '{"name":"grapes"}'] + dtype: list[pyarrow] + + >>> s = bpd.Series([ + ... '{"fruits": {"color": "red", "names": ["apple","cherry"]}}', + ... '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}' + ... ]) + >>> bbq.json_extract_array(s, "$.fruits.names") + 0 ['"apple"' '"cherry"'] + 1 ['"guava"' '"grapes"'] + dtype: list[pyarrow] + Args: - series (bigframes.series.Series): + input (bigframes.series.Series): The Series containing JSON data (as native JSON objects or JSON-formatted strings). json_path (str): The JSON path identifying the data that you want to obtain from the input. Returns: - bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING. + bigframes.series.Series: A new Series with the parsed arrays from the input. 
""" - return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path)) + return input._apply_unary_op(ops.JSONExtractArray(json_path=json_path)) + + +def json_extract_string_array( + input: series.Series, + json_path: str = "$", + value_dtype: Optional[ + Union[bigframes.dtypes.Dtype, bigframes.dtypes.DtypeString] + ] = None, +) -> series.Series: + """Extracts a JSON array and converts it to a SQL array of `STRING` values. + A `value_dtype` can be provided to further coerce the data type of the + values in the array. This function uses single quotes and brackets to escape + invalid JSONPath characters in JSON keys. + + **Examples:** + + >>> import bigframes.pandas as bpd + >>> import bigframes.bigquery as bbq + >>> bpd.options.display.progress_bar = None + + >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]']) + >>> bbq.json_extract_string_array(s) + 0 ['1' '2' '3'] + 1 ['4' '5'] + dtype: list[pyarrow] + + >>> bbq.json_extract_string_array(s, value_dtype='Int64') + 0 [1 2 3] + 1 [4 5] + dtype: list[pyarrow] + + >>> s = bpd.Series([ + ... '{"fruits": {"color": "red", "names": ["apple","cherry"]}}', + ... '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}' + ... ]) + >>> bbq.json_extract_string_array(s, "$.fruits.names") + 0 ['apple' 'cherry'] + 1 ['guava' 'grapes'] + dtype: list[pyarrow] + + Args: + input (bigframes.series.Series): + The Series containing JSON data (as native JSON objects or JSON-formatted strings). + json_path (str): + The JSON path identifying the data that you want to obtain from the input. + value_dtype (dtype, Optional): + The data type supported by BigFrames DataFrame. + + Returns: + bigframes.series.Series: A new Series with the parsed arrays from the input. 
+ """ + array_series = input._apply_unary_op( + ops.JSONExtractStringArray(json_path=json_path) + ) + if value_dtype not in [None, bigframes.dtypes.STRING_DTYPE]: + array_items_series = array_series.explode() + if value_dtype == bigframes.dtypes.BOOL_DTYPE: + array_items_series = array_items_series.str.lower() == "true" + else: + array_items_series = array_items_series.astype(value_dtype) + array_series = cast( + series.Series, + array.array_agg( + array_items_series.groupby(level=input.index.names, dropna=False) + ), + ) + return array_series diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py index 729b341e85..80e354aa8c 100644 --- a/bigframes/core/compile/scalar_op_compiler.py +++ b/bigframes/core/compile/scalar_op_compiler.py @@ -1140,6 +1140,13 @@ def json_extract_array_op_impl(x: ibis_types.Value, op: ops.JSONExtractArray): return json_extract_array(json_obj=x, json_path=op.json_path) +@scalar_op_compiler.register_unary_op(ops.JSONExtractStringArray, pass_op=True) +def json_extract_string_array_op_impl( + x: ibis_types.Value, op: ops.JSONExtractStringArray +): + return json_extract_string_array(json_obj=x, json_path=op.json_path) + + ### Binary Ops def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None): """Wraps a binary operator to generate nulls of the expected type if either input is a null scalar.""" @@ -1801,6 +1808,13 @@ def json_extract_array( """Extracts a JSON array and converts it to a SQL ARRAY of JSON-formatted STRINGs or JSON values.""" +@ibis.udf.scalar.builtin(name="json_extract_string_array") +def json_extract_string_array( + json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str +) -> ibis_dtypes.Array[ibis_dtypes.String]: + """Extracts a JSON array and converts it to a SQL ARRAY of STRINGs.""" + + @ibis.udf.scalar.builtin(name="ML.DISTANCE") def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64: """Computes the distance between two vectors 
using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")""" diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py index 63127a70de..2e2e4a0552 100644 --- a/bigframes/operations/__init__.py +++ b/bigframes/operations/__init__.py @@ -25,7 +25,6 @@ from pandas.tseries.offsets import DateOffset import pyarrow as pa -import bigframes.dtypes import bigframes.dtypes as dtypes import bigframes.operations.type as op_typing @@ -526,6 +525,13 @@ class RemoteFunctionOp(UnaryOp): def output_type(self, *input_types): # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method if hasattr(self.func, "output_dtype"): + if dtypes.is_array_like(self.func.output_dtype): + # TODO(b/284515241): remove this special handling to support + # array output types once BQ remote functions support ARRAY. + # Until then, use json serialized strings at the remote function + # level, and parse that to the intended output type at the + # bigframes level. 
+ return dtypes.STRING_DTYPE return self.func.output_dtype else: raise AttributeError("output_dtype not defined") @@ -548,9 +554,9 @@ class ToDatetimeOp(UnaryOp): def output_type(self, *input_types): if input_types[0] not in ( - bigframes.dtypes.FLOAT_DTYPE, - bigframes.dtypes.INT_DTYPE, - bigframes.dtypes.STRING_DTYPE, + dtypes.FLOAT_DTYPE, + dtypes.INT_DTYPE, + dtypes.STRING_DTYPE, ): raise TypeError("expected string or numeric input") return pd.ArrowDtype(pa.timestamp("us", tz=None)) @@ -565,9 +571,9 @@ class ToTimestampOp(UnaryOp): def output_type(self, *input_types): # Must be numeric or string if input_types[0] not in ( - bigframes.dtypes.FLOAT_DTYPE, - bigframes.dtypes.INT_DTYPE, - bigframes.dtypes.STRING_DTYPE, + dtypes.FLOAT_DTYPE, + dtypes.INT_DTYPE, + dtypes.STRING_DTYPE, ): raise TypeError("expected string or numeric input") return pd.ArrowDtype(pa.timestamp("us", tz="UTC")) @@ -699,6 +705,23 @@ def output_type(self, *input_types): ) +@dataclasses.dataclass(frozen=True) +class JSONExtractStringArray(UnaryOp): + name: typing.ClassVar[str] = "json_extract_string_array" + json_path: str + + def output_type(self, *input_types): + input_type = input_types[0] + if not dtypes.is_json_like(input_type): + raise TypeError( + "Input type must be a valid JSON object or JSON-formatted string type." 
+ + f" Received type: {input_type}" + ) + return pd.ArrowDtype( + pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE)) + ) + + # Binary Ops fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE) maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE) diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py index 68356f4a15..75b9345107 100644 --- a/tests/system/small/bigquery/test_json.py +++ b/tests/system/small/bigquery/test_json.py @@ -19,6 +19,7 @@ import pytest import bigframes.bigquery as bbq +import bigframes.dtypes import bigframes.pandas as bpd @@ -142,9 +143,9 @@ def test_json_extract_w_invalid_series_type(): def test_json_extract_array_from_json_strings(): - s = bpd.Series(['{"a": [1, 2, 3]}', '{"a": []}', '{"a": [4,5]}']) + s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}']) actual = bbq.json_extract_array(s, "$.a") - expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]]) + expected = bpd.Series([['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"']]) pd.testing.assert_series_equal( actual.to_pandas(), expected.to_pandas(), @@ -164,3 +165,38 @@ def test_json_extract_array_from_array_strings(): def test_json_extract_array_w_invalid_series_type(): with pytest.raises(TypeError): bbq.json_extract_array(bpd.Series([1, 2])) + + +def test_json_extract_string_array_from_json_strings(): + s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}']) + actual = bbq.json_extract_string_array(s, "$.a") + expected = bpd.Series([["ab", "2", "3 xy"], [], ["4", "5"]]) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + ) + + +def test_json_extract_string_array_from_array_strings(): + s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"]) + actual = bbq.json_extract_string_array(s) + expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]]) + pd.testing.assert_series_equal( + actual.to_pandas(), + 
expected.to_pandas(), + ) + + +def test_json_extract_string_array_as_float_array_from_array_strings(): + s = bpd.Series(["[1, 2.5, 3]", "[]", "[4,5]"]) + actual = bbq.json_extract_string_array(s, value_dtype=bigframes.dtypes.FLOAT_DTYPE) + expected = bpd.Series([[1, 2.5, 3], [], [4, 5]]) + pd.testing.assert_series_equal( + actual.to_pandas(), + expected.to_pandas(), + ) + + +def test_json_extract_string_array_w_invalid_series_type(): + with pytest.raises(TypeError): + bbq.json_extract_string_array(bpd.Series([1, 2]))