Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: support json_extract_string_array in the bigquery module #1131

Merged
merged 2 commits into from
Nov 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions bigframes/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
from bigframes.bigquery._operations.json import (
json_extract,
json_extract_array,
json_extract_string_array,
json_set,
)
from bigframes.bigquery._operations.search import create_vector_index, vector_search
Expand All @@ -37,6 +38,7 @@
"json_set",
"json_extract",
"json_extract_array",
"json_extract_string_array",
"approx_top_count",
"struct",
"create_vector_index",
Expand Down
119 changes: 104 additions & 15 deletions bigframes/bigquery/_operations/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,17 @@

from __future__ import annotations

from typing import Any, Sequence, Tuple
from typing import Any, cast, Optional, Sequence, Tuple, Union

import bigframes.dtypes
import bigframes.operations as ops
import bigframes.series as series

from . import array


def json_set(
series: series.Series,
input: series.Series,
json_path_value_pairs: Sequence[Tuple[str, Any]],
) -> series.Series:
"""Produces a new JSON value within a Series by inserting or replacing values at
Expand All @@ -47,7 +50,7 @@ def json_set(
Name: data, dtype: string

Args:
series (bigframes.series.Series):
input (bigframes.series.Series):
The Series containing JSON data (as native JSON objects or JSON-formatted strings).
json_path_value_pairs (Sequence[Tuple[str, Any]]):
Pairs of JSON path and the new value to insert/replace.
Expand All @@ -59,6 +62,7 @@ def json_set(
# SQLGlot parser does not support the "create_if_missing => true" syntax, so
# create_if_missing is not currently implemented.

result = input
for json_path_value_pair in json_path_value_pairs:
if len(json_path_value_pair) != 2:
raise ValueError(
Expand All @@ -67,14 +71,14 @@ def json_set(
)

json_path, json_value = json_path_value_pair
series = series._apply_binary_op(
result = result._apply_binary_op(
json_value, ops.JSONSet(json_path=json_path), alignment="left"
)
return series
return result


def json_extract(
series: series.Series,
input: series.Series,
json_path: str,
) -> series.Series:
"""Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON`
Expand All @@ -93,24 +97,24 @@ def json_extract(
dtype: string

Args:
series (bigframes.series.Series):
input (bigframes.series.Series):
The Series containing JSON data (as native JSON objects or JSON-formatted strings).
json_path (str):
The JSON path identifying the data that you want to obtain from the input.

Returns:
bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING.
"""
return series._apply_unary_op(ops.JSONExtract(json_path=json_path))
return input._apply_unary_op(ops.JSONExtract(json_path=json_path))


def json_extract_array(
series: series.Series,
input: series.Series,
json_path: str = "$",
) -> series.Series:
"""Extracts a JSON array and converts it to a SQL array of JSON-formatted `STRING` or `JSON`
values. This function uses single quotes and brackets to escape invalid JSONPath
characters in JSON keys.
"""Extracts a JSON array and converts it to a SQL array of JSON-formatted
`STRING` or `JSON` values. This function uses single quotes and brackets to
escape invalid JSONPath characters in JSON keys.

**Examples:**

Expand All @@ -124,13 +128,98 @@ def json_extract_array(
1 ['4' '5']
dtype: list<item: string>[pyarrow]

>>> s = bpd.Series([
... '{"fruits": [{"name": "apple"}, {"name": "cherry"}]}',
... '{"fruits": [{"name": "guava"}, {"name": "grapes"}]}'
... ])
>>> bbq.json_extract_array(s, "$.fruits")
0 ['{"name":"apple"}' '{"name":"cherry"}']
1 ['{"name":"guava"}' '{"name":"grapes"}']
dtype: list<item: string>[pyarrow]

>>> s = bpd.Series([
... '{"fruits": {"color": "red", "names": ["apple","cherry"]}}',
... '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}'
... ])
>>> bbq.json_extract_array(s, "$.fruits.names")
0 ['"apple"' '"cherry"']
1 ['"guava"' '"grapes"']
dtype: list<item: string>[pyarrow]

Args:
series (bigframes.series.Series):
input (bigframes.series.Series):
The Series containing JSON data (as native JSON objects or JSON-formatted strings).
json_path (str):
The JSON path identifying the data that you want to obtain from the input.

Returns:
bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING.
bigframes.series.Series: A new Series with the parsed arrays from the input.
"""
return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path))
return input._apply_unary_op(ops.JSONExtractArray(json_path=json_path))


def json_extract_string_array(
    input: series.Series,
    json_path: str = "$",
    value_dtype: Optional[
        Union[bigframes.dtypes.Dtype, bigframes.dtypes.DtypeString]
    ] = None,
) -> series.Series:
    """Extracts a JSON array and converts it to a SQL array of `STRING` values.

    The array found at `json_path` is first extracted as an array of strings.
    When a `value_dtype` other than the string dtype is given, every array is
    exploded into individual items, each item is converted to `value_dtype`,
    and the items are aggregated back into arrays grouped by the index levels
    of `input`. This function uses single quotes and brackets to escape
    invalid JSONPath characters in JSON keys.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq
        >>> bpd.options.display.progress_bar = None

        >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]'])
        >>> bbq.json_extract_string_array(s)
        0 ['1' '2' '3']
        1 ['4' '5']
        dtype: list<item: string>[pyarrow]

        >>> bbq.json_extract_string_array(s, value_dtype='Int64')
        0 [1 2 3]
        1 [4 5]
        dtype: list<item: int64>[pyarrow]

        >>> s = bpd.Series([
        ...   '{"fruits": {"color": "red", "names": ["apple","cherry"]}}',
        ...   '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}'
        ... ])
        >>> bbq.json_extract_string_array(s, "$.fruits.names")
        0 ['apple' 'cherry']
        1 ['guava' 'grapes']
        dtype: list<item: string>[pyarrow]

    Args:
        input (bigframes.series.Series):
            The Series containing JSON data (as native JSON objects or
            JSON-formatted strings).
        json_path (str):
            The JSON path identifying the array to extract from each input
            value. Defaults to ``"$"``.
        value_dtype (dtype, Optional):
            Element type to convert the extracted strings to. With ``None`` or
            the string dtype the items are returned as strings. With the
            boolean dtype an item becomes the result of comparing its
            lower-cased text to ``"true"``; any other dtype is applied with
            ``astype``.

    Returns:
        bigframes.series.Series: A new Series with the parsed arrays from the input.
    """
    string_arrays = input._apply_unary_op(
        ops.JSONExtractStringArray(json_path=json_path)
    )

    # Nothing to coerce: the extracted items are already strings.
    if value_dtype in [None, bigframes.dtypes.STRING_DTYPE]:
        return string_arrays

    # Flatten to one item per row so the element-wise conversion can run.
    items = string_arrays.explode()
    if value_dtype == bigframes.dtypes.BOOL_DTYPE:
        # Booleans are derived from the text itself rather than via astype.
        items = items.str.lower() == "true"
    else:
        items = items.astype(value_dtype)

    # Rebuild the arrays by grouping the items on the original index levels.
    # NOTE(review): rows that share identical index values would presumably be
    # merged into one array here -- confirm against callers with non-unique
    # indexes.
    regrouped = items.groupby(level=input.index.names, dropna=False)
    return cast(series.Series, array.array_agg(regrouped))
14 changes: 14 additions & 0 deletions bigframes/core/compile/scalar_op_compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1140,6 +1140,13 @@ def json_extract_array_op_impl(x: ibis_types.Value, op: ops.JSONExtractArray):
return json_extract_array(json_obj=x, json_path=op.json_path)


@scalar_op_compiler.register_unary_op(ops.JSONExtractStringArray, pass_op=True)
def json_extract_string_array_op_impl(
    x: ibis_types.Value, op: ops.JSONExtractStringArray
):
    """Compile a ``JSONExtractStringArray`` op by calling the
    ``json_extract_string_array`` builtin with the op's JSON path."""
    path = op.json_path
    return json_extract_string_array(json_obj=x, json_path=path)


### Binary Ops
def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
"""Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""
Expand Down Expand Up @@ -1801,6 +1808,13 @@ def json_extract_array(
"""Extracts a JSON array and converts it to a SQL ARRAY of JSON-formatted STRINGs or JSON values."""


@ibis.udf.scalar.builtin(name="json_extract_string_array")
def json_extract_string_array(
    json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str
) -> ibis_dtypes.Array[ibis_dtypes.String]:
    """Extracts a JSON array and converts it to a SQL ARRAY of STRINGs.

    This is a signature-only declaration: the function has no Python body and
    is registered through ``ibis.udf.scalar.builtin`` under the name
    ``json_extract_string_array``. It takes the JSON value and the JSON path
    and is typed as returning an array of strings.
    """


@ibis.udf.scalar.builtin(name="ML.DISTANCE")
def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64:
"""Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")"""
37 changes: 30 additions & 7 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@
from pandas.tseries.offsets import DateOffset
import pyarrow as pa

import bigframes.dtypes
import bigframes.dtypes as dtypes
import bigframes.operations.type as op_typing

Expand Down Expand Up @@ -526,6 +525,13 @@ class RemoteFunctionOp(UnaryOp):
def output_type(self, *input_types):
    """Return the dtype produced by the wrapped remote function.

    Raises:
        AttributeError: If the wrapped function has no ``output_dtype``.
    """
    # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method
    if not hasattr(self.func, "output_dtype"):
        raise AttributeError("output_dtype not defined")

    if dtypes.is_array_like(self.func.output_dtype):
        # TODO(b/284515241): remove this special handling to support
        # array output types once BQ remote functions support ARRAY.
        # Until then, use json serialized strings at the remote function
        # level, and parse that to the intended output type at the
        # bigframes level.
        return dtypes.STRING_DTYPE

    return self.func.output_dtype
Expand All @@ -548,9 +554,9 @@ class ToDatetimeOp(UnaryOp):

def output_type(self, *input_types):
if input_types[0] not in (
bigframes.dtypes.FLOAT_DTYPE,
bigframes.dtypes.INT_DTYPE,
bigframes.dtypes.STRING_DTYPE,
dtypes.FLOAT_DTYPE,
dtypes.INT_DTYPE,
dtypes.STRING_DTYPE,
):
raise TypeError("expected string or numeric input")
return pd.ArrowDtype(pa.timestamp("us", tz=None))
Expand All @@ -565,9 +571,9 @@ class ToTimestampOp(UnaryOp):
def output_type(self, *input_types):
# Must be numeric or string
if input_types[0] not in (
bigframes.dtypes.FLOAT_DTYPE,
bigframes.dtypes.INT_DTYPE,
bigframes.dtypes.STRING_DTYPE,
dtypes.FLOAT_DTYPE,
dtypes.INT_DTYPE,
dtypes.STRING_DTYPE,
):
raise TypeError("expected string or numeric input")
return pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
Expand Down Expand Up @@ -699,6 +705,23 @@ def output_type(self, *input_types):
)


@dataclasses.dataclass(frozen=True)
class JSONExtractStringArray(UnaryOp):
    """Unary op that extracts the JSON array at ``json_path`` as an array of
    strings."""

    name: typing.ClassVar[str] = "json_extract_string_array"
    # JSON path of the array to extract from the input value.
    json_path: str

    def output_type(self, *input_types):
        """Return the list-of-string Arrow dtype for a JSON-like input.

        Raises:
            TypeError: If the first input type is not JSON-like.
        """
        input_type = input_types[0]
        if dtypes.is_json_like(input_type):
            item_type = dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE)
            return pd.ArrowDtype(pa.list_(item_type))

        raise TypeError(
            "Input type must be an valid JSON object or JSON-formatted string type."
            + f" Received type: {input_type}"
        )


# Binary Ops
fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE)
maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE)
Expand Down
40 changes: 38 additions & 2 deletions tests/system/small/bigquery/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
import pytest

import bigframes.bigquery as bbq
import bigframes.dtypes
import bigframes.pandas as bpd


Expand Down Expand Up @@ -142,9 +143,9 @@ def test_json_extract_w_invalid_series_type():


def test_json_extract_array_from_json_strings():
s = bpd.Series(['{"a": [1, 2, 3]}', '{"a": []}', '{"a": [4,5]}'])
s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}'])
actual = bbq.json_extract_array(s, "$.a")
expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]])
expected = bpd.Series([['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"']])
pd.testing.assert_series_equal(
actual.to_pandas(),
expected.to_pandas(),
Expand All @@ -164,3 +165,38 @@ def test_json_extract_array_from_array_strings():
def test_json_extract_array_w_invalid_series_type():
    """json_extract_array rejects a Series that does not hold JSON-like data."""
    integer_series = bpd.Series([1, 2])
    with pytest.raises(TypeError):
        bbq.json_extract_array(integer_series)


def test_json_extract_string_array_from_json_strings():
    """String arrays nested under a key are returned without JSON quoting."""
    json_rows = ['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}']
    s = bpd.Series(json_rows)

    actual = bbq.json_extract_string_array(s, "$.a")
    expected = bpd.Series([["ab", "2", "3 xy"], [], ["4", "5"]])

    actual_pd = actual.to_pandas()
    expected_pd = expected.to_pandas()
    pd.testing.assert_series_equal(actual_pd, expected_pd)


def test_json_extract_string_array_from_array_strings():
    """Top-level JSON arrays of numbers come back as arrays of strings."""
    array_rows = ["[1, 2, 3]", "[]", "[4,5]"]
    s = bpd.Series(array_rows)

    actual = bbq.json_extract_string_array(s)
    expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]])

    actual_pd = actual.to_pandas()
    expected_pd = expected.to_pandas()
    pd.testing.assert_series_equal(actual_pd, expected_pd)


def test_json_extract_string_array_as_float_array_from_array_strings():
    """Passing a float value_dtype converts the extracted items to floats."""
    array_rows = ["[1, 2.5, 3]", "[]", "[4,5]"]
    s = bpd.Series(array_rows)

    float_dtype = bigframes.dtypes.FLOAT_DTYPE
    actual = bbq.json_extract_string_array(s, value_dtype=float_dtype)
    expected = bpd.Series([[1, 2.5, 3], [], [4, 5]])

    actual_pd = actual.to_pandas()
    expected_pd = expected.to_pandas()
    pd.testing.assert_series_equal(actual_pd, expected_pd)


def test_json_extract_string_array_w_invalid_series_type():
    """json_extract_string_array rejects a Series that does not hold JSON-like data."""
    integer_series = bpd.Series([1, 2])
    with pytest.raises(TypeError):
        bbq.json_extract_string_array(integer_series)