googleapis · tswast · Oct 18, 2024 · Sep 26, 2024 · Sep 26, 2024 · Oct 14, 2024
@@ -0,0 +1,13 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
@@ -0,0 +1,59 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import bigframes.operations.aggregations as agg_ops
+import bigframes.series as series
+
+"""
+Approximate functions defined from
+https://cloud.google.com/bigquery/docs/reference/standard-sql/approximate_aggregate_functions
+"""
+
+
+def approx_top_count(
+    series: series.Series,
+    number: int,
+) -> series.Series:
+    """Compute the approximate most frequent values of a Series.
+
+    The result is an array of `STRUCT` values, and `number` controls how many
+    elements that array contains.
+
+    Every `STRUCT` has two fields: `value` holds one of the input values, and
+    `count` is an `INT64` giving the number of times that value was returned.
+
+    The result is `NULL` when there are zero input rows.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> bpd.options.display.progress_bar = None
+        >>> s = bpd.Series(["apple", "apple", "pear", "pear", "pear", "banana"])
+        >>> bbq.approx_top_count(s, number=2)
+        [{'value': 'pear', 'count': 3}, {'value': 'apple', 'count': 2}]
+
+    Args:
+        series (bigframes.series.Series):
+            The Series with any data type that the `GROUP BY` clause supports.
+        number (int):
+            The number of top elements to return. Must be at least 1.
+
+    Returns:
+        bigframes.series.Series: A new Series with the result data.
+
+    Raises:
+        ValueError: If `number` is less than 1.
+    """
+    # Reject invalid sizes up front, before any aggregation is built.
+    if number < 1:
+        raise ValueError("The number of approx_top_count must be at least 1")
+
+    top_count_op = agg_ops.ApproxTopCountOp(number=number)
+    return series._apply_aggregation(top_count_op)
@@ -0,0 +1,151 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Array functions defined from
+https://cloud.google.com/bigquery/docs/reference/standard-sql/array_functions
+"""
+
+
+from __future__ import annotations
+
+import typing
+
+import bigframes_vendored.constants as constants
+
+import bigframes.core.groupby as groupby
+import bigframes.operations as ops
+import bigframes.operations.aggregations as agg_ops
+import bigframes.series as series
+
+if typing.TYPE_CHECKING:
+    import bigframes.dataframe as dataframe
+
+
+def array_length(series: series.Series) -> series.Series:
+    """Return, for every row, the number of elements in that row's array.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series([[1, 2, 8, 3], [], [3, 4]])
+        >>> bbq.array_length(s)
+        0    4
+        1    0
+        2    2
+        dtype: Int64
+
+    The function can also be passed to ``Series.apply``.
+
+        >>> s.apply(bbq.array_length, by_row=False)
+        0    4
+        1    0
+        2    2
+        dtype: Int64
+
+    Args:
+        series (bigframes.series.Series): A Series with array columns.
+
+    Returns:
+        bigframes.series.Series: A Series of integer values giving the
+            length of each element of the input Series.
+
+    """
+    length_op = ops.len_op
+    return series._apply_unary_op(length_op)
+
+
+def array_agg(
+    obj: groupby.SeriesGroupBy | groupby.DataFrameGroupBy,
+) -> series.Series | dataframe.DataFrame:
+    """Aggregate the values of each group into an array.
+
+    NULL values are omitted from the resulting arrays to avoid BigQuery
+    errors (NULLs are not allowed in arrays).
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> import numpy as np
+        >>> bpd.options.display.progress_bar = None
+
+    For a SeriesGroupBy object:
+
+        >>> lst = ['a', 'a', 'b', 'b', 'a']
+        >>> s = bpd.Series([1, 2, 3, 4, np.nan], index=lst)
+        >>> bbq.array_agg(s.groupby(level=0))
+        a    [1. 2.]
+        b    [3. 4.]
+        dtype: list<item: double>[pyarrow]
+
+    For a DataFrameGroupBy object:
+
+        >>> l = [[1, 2, 3], [1, None, 4], [2, 1, 3], [1, 2, 2]]
+        >>> df = bpd.DataFrame(l, columns=["a", "b", "c"])
+        >>> bbq.array_agg(df.groupby(by=["b"]))
+                 a      c
+        b
+        1.0    [2]    [3]
+        2.0  [1 1]  [3 2]
+        <BLANKLINE>
+        [2 rows x 2 columns]
+
+    Args:
+        obj (groupby.SeriesGroupBy | groupby.DataFrameGroupBy):
+            The GroupBy object whose groups are aggregated.
+
+    Returns:
+        bigframes.series.Series | bigframes.dataframe.DataFrame: A Series or
+            DataFrame containing aggregated array columns, and indexed by the
+            original group columns.
+
+    Raises:
+        ValueError: If `obj` is neither a SeriesGroupBy nor a
+            DataFrameGroupBy.
+    """
+    # Series groups are checked first, then DataFrame groups.
+    if isinstance(obj, groupby.SeriesGroupBy):
+        return obj._aggregate(agg_ops.ArrayAggOp())
+
+    if isinstance(obj, groupby.DataFrameGroupBy):
+        # numeric_only=False is passed so the aggregation is not limited to
+        # numeric columns.
+        return obj._aggregate_all(agg_ops.ArrayAggOp(), numeric_only=False)
+
+    raise ValueError(
+        f"Unsupported type {type(obj)} to apply `array_agg` function. {constants.FEEDBACK_LINK}"
+    )
+
+
+def array_to_string(series: series.Series, delimiter: str) -> series.Series:
+    """Join the elements of each array in a Series into one delimited string.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> import numpy as np
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series([["H", "i", "!"], ["Hello", "World"], np.nan, [], ["Hi"]])
+        >>> bbq.array_to_string(s, delimiter=", ")
+        0         H, i, !
+        1    Hello, World
+        2
+        3
+        4              Hi
+        dtype: string
+
+    Args:
+        series (bigframes.series.Series): A Series containing arrays.
+        delimiter (str): The string placed between consecutive array elements.
+
+    Returns:
+        bigframes.series.Series: A Series containing delimited strings.
+
+    """
+    to_string_op = ops.ArrayToStringOp(delimiter=delimiter)
+    return series._apply_unary_op(to_string_op)
@@ -0,0 +1,136 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+"""
+JSON functions defined from
+https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions
+"""
+
+
+from __future__ import annotations
+
+from typing import Any, Sequence, Tuple
+
+import bigframes.operations as ops
+import bigframes.series as series
+
+
+def json_set(
+    series: series.Series,
+    json_path_value_pairs: Sequence[Tuple[str, Any]],
+) -> series.Series:
+    """Build new JSON values in a Series by inserting or replacing values at
+    the given paths.
+
+    The pairs are applied one after another, in the order given; each update
+    operates on the result of the previous one.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.read_gbq("SELECT JSON '{\\\"a\\\": 1}' AS data")["data"]
+        >>> bbq.json_set(s, json_path_value_pairs=[("$.a", 100), ("$.b", "hi")])
+        0    {"a":100,"b":"hi"}
+        Name: data, dtype: string
+
+    Args:
+        series (bigframes.series.Series):
+            The Series containing JSON data (as native JSON objects or JSON-formatted strings).
+        json_path_value_pairs (Sequence[Tuple[str, Any]]):
+            Pairs of JSON path and the new value to insert/replace.
+
+    Returns:
+        bigframes.series.Series: A new Series with the transformed JSON data.
+
+    Raises:
+        ValueError: If any entry of `json_path_value_pairs` does not contain
+            exactly two items.
+
+    """
+    # SQLGlot parser does not support the "create_if_missing => true" syntax, so
+    # create_if_missing is not currently implemented.
+
+    updated = series
+    for pair in json_path_value_pairs:
+        if len(pair) != 2:
+            raise ValueError(
+                f"Incorrect format: Expected (<json_path>, <json_value>), but found: {pair}"
+            )
+
+        path, new_value = pair
+        set_op = ops.JSONSet(json_path=path)
+        updated = updated._apply_binary_op(new_value, set_op, alignment="left")
+    return updated
+
+
+def json_extract(
+    series: series.Series,
+    json_path: str,
+) -> series.Series:
+    """Extract the JSON value found at `json_path` and convert it to a SQL
+    JSON-formatted `STRING` or `JSON` value.
+
+    This function uses single quotes and brackets to escape invalid JSONPath
+    characters in JSON keys.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series(['{"class": {"students": [{"id": 5}, {"id": 12}]}}'])
+        >>> bbq.json_extract(s, json_path="$.class")
+        0    {"students":[{"id":5},{"id":12}]}
+        dtype: string
+
+    Args:
+        series (bigframes.series.Series):
+            The Series containing JSON data (as native JSON objects or JSON-formatted strings).
+        json_path (str):
+            The JSON path identifying the data that you want to obtain from the input.
+
+    Returns:
+        bigframes.series.Series: A new Series with the JSON or JSON-formatted STRING.
+    """
+    extract_op = ops.JSONExtract(json_path=json_path)
+    return series._apply_unary_op(extract_op)
+
+
+def json_extract_array(
+    series: series.Series,
+    json_path: str = "$",
+) -> series.Series:
+    """Extract the JSON array found at `json_path` and convert it to a SQL
+    array of JSON-formatted `STRING` or `JSON` values.
+
+    This function uses single quotes and brackets to escape invalid JSONPath
+    characters in JSON keys.
+
+    **Examples:**
+
+        >>> import bigframes.pandas as bpd
+        >>> import bigframes.bigquery as bbq
+        >>> bpd.options.display.progress_bar = None
+
+        >>> s = bpd.Series(['[1, 2, 3]', '[4, 5]'])
+        >>> bbq.json_extract_array(s)
+        0    ['1' '2' '3']
+        1        ['4' '5']
+        dtype: list<item: string>[pyarrow]
+
+    Args:
+        series (bigframes.series.Series):
+            The Series containing JSON data (as native JSON objects or JSON-formatted strings).
+        json_path (str):
+            The JSON path identifying the data that you want to obtain from the
+            input. Defaults to ``"$"``.
+
+    Returns:
+        bigframes.series.Series: A new Series whose elements are arrays of
+            JSON or JSON-formatted STRING values.
+    """
+    extract_array_op = ops.JSONExtractArray(json_path=json_path)
+    return series._apply_unary_op(extract_array_op)