From a3909d07f31f80f1e30108e93bf5fefcca3afcd8 Mon Sep 17 00:00:00 2001
From: Shobhit Singh <shobs@google.com>
Date: Tue, 5 Nov 2024 19:35:01 +0000
Subject: [PATCH] feat: support `json_extract_string_array` in the `bigquery`
 module

---
 bigframes/bigquery/__init__.py               |   2 +
 bigframes/bigquery/_operations/json.py       | 119 ++++++++++++++++---
 bigframes/core/compile/scalar_op_compiler.py |  14 +++
 bigframes/operations/__init__.py             |  37 ++++--
 tests/system/small/bigquery/test_json.py     |  40 ++++++-
 5 files changed, 188 insertions(+), 24 deletions(-)

diff --git a/bigframes/bigquery/__init__.py b/bigframes/bigquery/__init__.py
index 0b2d2d5aeb..a39914d6e7 100644
--- a/bigframes/bigquery/__init__.py
+++ b/bigframes/bigquery/__init__.py
@@ -25,6 +25,7 @@
 from bigframes.bigquery._operations.json import (
     json_extract,
     json_extract_array,
+    json_extract_string_array,
     json_set,
 )
 from bigframes.bigquery._operations.search import create_vector_index, vector_search
@@ -37,6 +38,7 @@
     "json_set",
     "json_extract",
     "json_extract_array",
+    "json_extract_string_array",
     "approx_top_count",
     "struct",
     "create_vector_index",
diff --git a/bigframes/bigquery/_operations/json.py b/bigframes/bigquery/_operations/json.py
index d3c3c97a9c..152c93186a 100644
--- a/bigframes/bigquery/_operations/json.py
+++ b/bigframes/bigquery/_operations/json.py
@@ -21,14 +21,17 @@
 
 from __future__ import annotations
 
-from typing import Any, Sequence, Tuple
+from typing import Any, cast, Optional, Sequence, Tuple, Union
 
+import bigframes.dtypes
 import bigframes.operations as ops
 import bigframes.series as series
 
+from . import array
+
 
 def json_set(
-    series: series.Series,
+    input: series.Series,
     json_path_value_pairs: Sequence[Tuple[str, Any]],
 ) -> series.Series:
     """Produces a new JSON value within a Series by inserting or replacing values at
@@ -47,7 +50,7 @@ def json_set(
             Name: data, dtype: string
 
     Args:
-        series (bigframes.series.Series):
+        input (bigframes.series.Series):
             The Series containing JSON data (as native JSON objects or JSON-formatted strings).
         json_path_value_pairs (Sequence[Tuple[str, Any]]):
             Pairs of JSON path and the new value to insert/replace.
@@ -59,6 +62,7 @@ def json_set(
     # SQLGlot parser does not support the "create_if_missing => true" syntax, so
     # create_if_missing is not currently implemented.
 
+    result = input
     for json_path_value_pair in json_path_value_pairs:
         if len(json_path_value_pair) != 2:
             raise ValueError(
@@ -67,14 +71,14 @@ def json_set(
             )
 
         json_path, json_value = json_path_value_pair
-        series = series._apply_binary_op(
+        result = result._apply_binary_op(
             json_value, ops.JSONSet(json_path=json_path), alignment="left"
         )
-    return series
+    return result
 
 
 def json_extract(
-    series: series.Series,
+    input: series.Series,
     json_path: str,
 ) -> series.Series:
     """Extracts a JSON value and converts it to a SQL JSON-formatted `STRING` or `JSON`
@@ -93,7 +97,7 @@ def json_extract(
         dtype: string
 
     Args:
-        series (bigframes.series.Series):
+        input (bigframes.series.Series):
             The Series containing JSON data (as native JSON objects or JSON-formatted strings).
         json_path (str):
             The JSON path identifying the data that you want to obtain from the input.
@@ -101,16 +105,16 @@ def json_extract(
     Returns:
         bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING.
     """
-    return series._apply_unary_op(ops.JSONExtract(json_path=json_path))
+    return input._apply_unary_op(ops.JSONExtract(json_path=json_path))
 
 
 def json_extract_array(
-    series: series.Series,
+    input: series.Series,
     json_path: str = "$",
 ) -> series.Series:
-    """Extracts a JSON array and converts it to a SQL array of JSON-formatted `STRING` or `JSON`
-    values. This function uses single quotes and brackets to escape invalid JSONPath
-    characters in JSON keys.
+    """Extracts a JSON array and converts it to a SQL array of JSON-formatted
+    `STRING` or `JSON` values. This function uses single quotes and brackets to
+    escape invalid JSONPath characters in JSON keys.
 
     **Examples:**
 
@@ -124,13 +128,98 @@ def json_extract_array(
         1        ['4' '5']
         dtype: list<item: string>[pyarrow]
 
+        >>> s = bpd.Series([
+        ...   '{"fruits": [{"name": "apple"}, {"name": "cherry"}]}',
+        ...   '{"fruits": [{"name": "guava"}, {"name": "grapes"}]}'
+        ... ])
+        >>> bbq.json_extract_array(s, "$.fruits")
+        0    ['{"name":"apple"}' '{"name":"cherry"}']
+        1    ['{"name":"guava"}' '{"name":"grapes"}']
+        dtype: list<item: string>[pyarrow]
+
+        >>> s = bpd.Series([
+        ...   '{"fruits": {"color": "red",   "names": ["apple","cherry"]}}',
+        ...   '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}'
+        ... ])
+        >>> bbq.json_extract_array(s, "$.fruits.names")
+        0    ['"apple"' '"cherry"']
+        1    ['"guava"' '"grapes"']
+        dtype: list<item: string>[pyarrow]
+
     Args:
-        series (bigframes.series.Series):
+        input (bigframes.series.Series):
             The Series containing JSON data (as native JSON objects or JSON-formatted strings).
         json_path (str):
             The JSON path identifying the data that you want to obtain from the input.
 
     Returns:
-        bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING.
+        bigframes.series.Series: A new Series with the parsed arrays from the input.
     """
-    return series._apply_unary_op(ops.JSONExtractArray(json_path=json_path))
+    return input._apply_unary_op(ops.JSONExtractArray(json_path=json_path))
+
+
+def json_extract_string_array(
+    input: series.Series,
+    json_path: str = "$",
+    value_dtype: Optional[
+        Union[bigframes.dtypes.Dtype, bigframes.dtypes.DtypeString]
+    ] = None,
+) -> series.Series:
+    """Extracts a JSON array and converts it to a SQL array of `STRING` values.
+    A `value_dtype` can be provided to further coerce the data type of the
+    values in the array. This function uses single quotes and brackets to escape
+    invalid JSONPath characters in JSON keys.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]'])
+        >>> bbq.json_extract_string_array(s)
+        0    ['1' '2' '3']
+        1        ['4' '5']
+        dtype: list<item: string>[pyarrow]
+
+        >>> bbq.json_extract_string_array(s, value_dtype='Int64')
+        0    [1 2 3]
+        1      [4 5]
+        dtype: list<item: int64>[pyarrow]
+
+        >>> s = bpd.Series([
+        ...   '{"fruits": {"color": "red",   "names": ["apple","cherry"]}}',
+        ...   '{"fruits": {"color": "green", "names": ["guava", "grapes"]}}'
+        ... ])
+        >>> bbq.json_extract_string_array(s, "$.fruits.names")
+        0    ['apple' 'cherry']
+        1    ['guava' 'grapes']
+        dtype: list<item: string>[pyarrow]
+
+    Args:
+        input (bigframes.series.Series):
+            The Series containing JSON data (as native JSON objects or JSON-formatted strings).
+        json_path (str):
+            The JSON path identifying the data that you want to obtain from the input.
+        value_dtype (dtype, Optional):
+            Dtype to coerce each array element to; None or string keeps them as strings.
+
+    Returns:
+        bigframes.series.Series: A new Series with the parsed arrays from the input.
+    """
+    array_series = input._apply_unary_op(
+        ops.JSONExtractStringArray(json_path=json_path)
+    )
+    if value_dtype not in [None, bigframes.dtypes.STRING_DTYPE]:
+        array_items_series = array_series.explode()  # one row per element
+        if value_dtype == bigframes.dtypes.BOOL_DTYPE:  # case-insensitive "true" test
+            array_items_series = array_items_series.str.lower() == "true"
+        else:
+            array_items_series = array_items_series.astype(value_dtype)
+        array_series = cast(
+            series.Series,
+            array.array_agg(  # regroup by the original row index
+                array_items_series.groupby(level=input.index.names, dropna=False)
+            ),
+        )
+    return array_series
diff --git a/bigframes/core/compile/scalar_op_compiler.py b/bigframes/core/compile/scalar_op_compiler.py
index 729b341e85..80e354aa8c 100644
--- a/bigframes/core/compile/scalar_op_compiler.py
+++ b/bigframes/core/compile/scalar_op_compiler.py
@@ -1140,6 +1140,13 @@ def json_extract_array_op_impl(x: ibis_types.Value, op: ops.JSONExtractArray):
     return json_extract_array(json_obj=x, json_path=op.json_path)
 
 
+@scalar_op_compiler.register_unary_op(ops.JSONExtractStringArray, pass_op=True)
+def json_extract_string_array_op_impl(
+    x: ibis_types.Value, op: ops.JSONExtractStringArray
+):  # op carries the json_path that is forwarded below
+    return json_extract_string_array(json_obj=x, json_path=op.json_path)
+
+
 ### Binary Ops
 def short_circuit_nulls(type_override: typing.Optional[ibis_dtypes.DataType] = None):
     """Wraps a binary operator to generate nulls of the expected type if either input is a null scalar."""
@@ -1801,6 +1808,13 @@ def json_extract_array(
     """Extracts a JSON array and converts it to a SQL ARRAY of JSON-formatted STRINGs or JSON values."""
 
 
+@ibis.udf.scalar.builtin(name="json_extract_string_array")
+def json_extract_string_array(
+    json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.str
+) -> ibis_dtypes.Array[ibis_dtypes.String]:  # declaration only; no Python body
+    """Extracts a JSON array and converts it to a SQL ARRAY of STRINGs."""
+
+
 @ibis.udf.scalar.builtin(name="ML.DISTANCE")
 def vector_distance(vector1, vector2, type: str) -> ibis_dtypes.Float64:
     """Computes the distance between two vectors using specified type ("EUCLIDEAN", "MANHATTAN", or "COSINE")"""
diff --git a/bigframes/operations/__init__.py b/bigframes/operations/__init__.py
index 63127a70de..2e2e4a0552 100644
--- a/bigframes/operations/__init__.py
+++ b/bigframes/operations/__init__.py
@@ -25,7 +25,6 @@
 from pandas.tseries.offsets import DateOffset
 import pyarrow as pa
 
-import bigframes.dtypes
 import bigframes.dtypes as dtypes
 import bigframes.operations.type as op_typing
 
@@ -526,6 +525,13 @@ class RemoteFunctionOp(UnaryOp):
     def output_type(self, *input_types):
         # This property should be set to a valid Dtype by the @remote_function decorator or read_gbq_function method
         if hasattr(self.func, "output_dtype"):
+            if dtypes.is_array_like(self.func.output_dtype):
+                # TODO(b/284515241): remove this special handling to support
+                # array output types once BQ remote functions support ARRAY.
+                # Until then, use json serialized strings at the remote function
+                # level, and parse that to the intended output type at the
+                # bigframes level.
+                return dtypes.STRING_DTYPE
             return self.func.output_dtype
         else:
             raise AttributeError("output_dtype not defined")
@@ -548,9 +554,9 @@ class ToDatetimeOp(UnaryOp):
 
     def output_type(self, *input_types):
         if input_types[0] not in (
-            bigframes.dtypes.FLOAT_DTYPE,
-            bigframes.dtypes.INT_DTYPE,
-            bigframes.dtypes.STRING_DTYPE,
+            dtypes.FLOAT_DTYPE,
+            dtypes.INT_DTYPE,
+            dtypes.STRING_DTYPE,
         ):
             raise TypeError("expected string or numeric input")
         return pd.ArrowDtype(pa.timestamp("us", tz=None))
@@ -565,9 +571,9 @@ class ToTimestampOp(UnaryOp):
     def output_type(self, *input_types):
         # Must be numeric or string
         if input_types[0] not in (
-            bigframes.dtypes.FLOAT_DTYPE,
-            bigframes.dtypes.INT_DTYPE,
-            bigframes.dtypes.STRING_DTYPE,
+            dtypes.FLOAT_DTYPE,
+            dtypes.INT_DTYPE,
+            dtypes.STRING_DTYPE,
         ):
             raise TypeError("expected string or numeric input")
         return pd.ArrowDtype(pa.timestamp("us", tz="UTC"))
@@ -699,6 +705,23 @@ def output_type(self, *input_types):
         )
 
 
+@dataclasses.dataclass(frozen=True)
+class JSONExtractStringArray(UnaryOp):
+    name: typing.ClassVar[str] = "json_extract_string_array"
+    json_path: str  # JSONPath of the array to extract
+
+    def output_type(self, *input_types):
+        input_type = input_types[0]  # only the first input type is checked
+        if not dtypes.is_json_like(input_type):
+            raise TypeError(
+                "Input type must be an valid JSON object or JSON-formatted string type."
+                + f" Received type: {input_type}"
+            )
+        return pd.ArrowDtype(  # always a list of the string dtype
+            pa.list_(dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE))
+        )
+
+
 # Binary Ops
 fillna_op = create_binary_op(name="fillna", type_signature=op_typing.COERCE)
 maximum_op = create_binary_op(name="maximum", type_signature=op_typing.COERCE)
diff --git a/tests/system/small/bigquery/test_json.py b/tests/system/small/bigquery/test_json.py
index 68356f4a15..75b9345107 100644
--- a/tests/system/small/bigquery/test_json.py
+++ b/tests/system/small/bigquery/test_json.py
@@ -19,6 +19,7 @@
 import pytest
 
 import bigframes.bigquery as bbq
+import bigframes.dtypes
 import bigframes.pandas as bpd
 
 
@@ -142,9 +143,9 @@ def test_json_extract_w_invalid_series_type():
 
 
 def test_json_extract_array_from_json_strings():
-    s = bpd.Series(['{"a": [1, 2, 3]}', '{"a": []}', '{"a": [4,5]}'])
+    s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}'])
     actual = bbq.json_extract_array(s, "$.a")
-    expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]])
+    expected = bpd.Series([['"ab"', '"2"', '"3 xy"'], [], ['"4"', '"5"']])
     pd.testing.assert_series_equal(
         actual.to_pandas(),
         expected.to_pandas(),
@@ -164,3 +165,38 @@ def test_json_extract_array_from_array_strings():
 def test_json_extract_array_w_invalid_series_type():
     with pytest.raises(TypeError):
         bbq.json_extract_array(bpd.Series([1, 2]))
+
+
+def test_json_extract_string_array_from_json_strings():
+    s = bpd.Series(['{"a": ["ab", "2", "3 xy"]}', '{"a": []}', '{"a": ["4","5"]}'])
+    actual = bbq.json_extract_string_array(s, "$.a")
+    expected = bpd.Series([["ab", "2", "3 xy"], [], ["4", "5"]])  # unquoted values
+    pd.testing.assert_series_equal(
+        actual.to_pandas(),
+        expected.to_pandas(),
+    )
+
+
+def test_json_extract_string_array_from_array_strings():
+    s = bpd.Series(["[1, 2, 3]", "[]", "[4,5]"])
+    actual = bbq.json_extract_string_array(s)  # default json_path "$"
+    expected = bpd.Series([["1", "2", "3"], [], ["4", "5"]])
+    pd.testing.assert_series_equal(
+        actual.to_pandas(),
+        expected.to_pandas(),
+    )
+
+
+def test_json_extract_string_array_as_float_array_from_array_strings():
+    s = bpd.Series(["[1, 2.5, 3]", "[]", "[4,5]"])
+    actual = bbq.json_extract_string_array(s, value_dtype=bigframes.dtypes.FLOAT_DTYPE)
+    expected = bpd.Series([[1, 2.5, 3], [], [4, 5]])  # empty array stays empty
+    pd.testing.assert_series_equal(
+        actual.to_pandas(),
+        expected.to_pandas(),
+    )
+
+
+def test_json_extract_string_array_w_invalid_series_type():
+    with pytest.raises(TypeError):
+        bbq.json_extract_string_array(bpd.Series([1, 2]))  # ints are not JSON-like