GH-35515: [C++][Python] Add non decomposable aggregation UDF #35514

Merged: 19 commits, Jun 8, 2023
27 changes: 15 additions & 12 deletions cpp/src/arrow/engine/substrait/extension_set.cc
@@ -954,7 +954,9 @@ ExtensionIdRegistry::SubstraitAggregateToArrow DecodeBasicAggregate(
return Status::Invalid("Expected aggregate call ", call.id().uri, "#",
call.id().name, " to have at least one argument");
}
case 1: {
default: {
Contributor Author: This is so that we can support UDFs, which can have arity > 1.

Contributor: Yes.

// Handles all arity > 0

std::shared_ptr<compute::FunctionOptions> options = nullptr;
if (arrow_function_name == "stddev" || arrow_function_name == "variance") {
// See the following URL for the spec of stddev and variance:
@@ -981,21 +983,22 @@ ExtensionIdRegistry::SubstraitAggregateToArrow DecodeBasicAggregate(
}
fixed_arrow_func += arrow_function_name;
Contributor: This line was glued to the previous block before; it's part of the function name building process.

Contributor Author: Sorry, I am not sure what you mean here. Do you want me to change anything?

Contributor: To keep the line glued to the previous block as it was before. It's a nitpick, but it makes a potential function extraction in the future more obvious.

Contributor Author: I see. Updated.


ARROW_ASSIGN_OR_RAISE(compute::Expression arg, call.GetValueArg(0));
const FieldRef* arg_ref = arg.field_ref();
if (!arg_ref) {
return Status::Invalid("Expected an aggregate call ", call.id().uri, "#",
call.id().name, " to have a direct reference");
std::vector<FieldRef> target;
for (int i = 0; i < call.size(); i++) {
ARROW_ASSIGN_OR_RAISE(compute::Expression arg, call.GetValueArg(i));
const FieldRef* arg_ref = arg.field_ref();
Contributor: Remove const so it can be moved below.

Contributor Author: @felipecrv My understanding is that with const FieldRef* the move below has no effect and the data must be copied (because it is a pointer to const FieldRef), while with FieldRef* the move can actually move the object instead of copying it?

Contributor (@felipecrv, Jun 2, 2023): You're right. This is one of the many ways copies can creep in.

But looking at the definition of field_ref I noticed it can only return const. Removing const won't really be possible. So I recommend removing the std::move(*arg_ref) below so it doesn't give the wrong impression that a move is happening.

Contributor Author: Thanks! Removed std::move(*arg_ref).

if (!arg_ref) {
return Status::Invalid("Expected an aggregate call ", call.id().uri, "#",
call.id().name, " to have a direct reference");
}
// Copy arg_ref here because field_ref() returns const FieldRef*
target.emplace_back(*arg_ref);
}

return compute::Aggregate{std::move(fixed_arrow_func),
options ? std::move(options) : nullptr, *arg_ref, ""};
options ? std::move(options) : nullptr,
std::move(target), ""};
}
default:
break;
}
return Status::NotImplemented(
"Only nullary and unary aggregate functions are currently supported");
};
}

Expand Down
6 changes: 3 additions & 3 deletions python/pyarrow/_compute.pxd
@@ -21,11 +21,11 @@ from pyarrow.lib cimport *
from pyarrow.includes.common cimport *
from pyarrow.includes.libarrow cimport *

cdef class ScalarUdfContext(_Weakrefable):
Contributor Author: Renamed this because the Context is now used for scalar, tabular, and aggregate UDFs.

cdef class UdfContext(_Weakrefable):
cdef:
CScalarUdfContext c_context
CUdfContext c_context

cdef void init(self, const CScalarUdfContext& c_context)
cdef void init(self, const CUdfContext& c_context)


cdef class FunctionOptions(_Weakrefable):
161 changes: 119 additions & 42 deletions python/pyarrow/_compute.pyx
@@ -2559,7 +2559,7 @@ cdef CExpression _bind(Expression filter, Schema schema) except *:
deref(pyarrow_unwrap_schema(schema).get())))


cdef class ScalarUdfContext:
cdef class UdfContext:
"""
Per-invocation function context/state.

@@ -2571,7 +2571,7 @@ cdef class ScalarUdfContext:
raise TypeError("Do not call {}'s constructor directly"
.format(self.__class__.__name__))

cdef void init(self, const CScalarUdfContext &c_context):
cdef void init(self, const CUdfContext &c_context):
self.c_context = c_context

@property
@@ -2620,26 +2620,26 @@ cdef inline CFunctionDoc _make_function_doc(dict func_doc) except *:
return f_doc


cdef object box_scalar_udf_context(const CScalarUdfContext& c_context):
cdef ScalarUdfContext context = ScalarUdfContext.__new__(ScalarUdfContext)
cdef object box_udf_context(const CUdfContext& c_context):
cdef UdfContext context = UdfContext.__new__(UdfContext)
context.init(c_context)
return context


cdef _udf_callback(user_function, const CScalarUdfContext& c_context, inputs):
cdef _udf_callback(user_function, const CUdfContext& c_context, inputs):
"""
Helper callback function used to wrap the ScalarUdfContext from Python to C++
Helper callback function used to wrap the UdfContext from Python to C++
execution.
"""
context = box_scalar_udf_context(c_context)
context = box_udf_context(c_context)
return user_function(context, *inputs)


def _get_scalar_udf_context(memory_pool, batch_length):
cdef CScalarUdfContext c_context
def _get_udf_context(memory_pool, batch_length):
cdef CUdfContext c_context
c_context.pool = maybe_unbox_memory_pool(memory_pool)
c_context.batch_length = batch_length
context = box_scalar_udf_context(c_context)
context = box_udf_context(c_context)
return context


@@ -2665,11 +2665,19 @@ cdef get_register_tabular_function():
return reg


cdef get_register_aggregate_function():
cdef RegisterUdf reg = RegisterUdf.__new__(RegisterUdf)
reg.register_func = RegisterAggregateFunction
return reg


def register_scalar_function(func, function_name, function_doc, in_types, out_type,
func_registry=None):
"""
Register a user-defined scalar function.

This API is EXPERIMENTAL.

A scalar function is a function that executes elementwise
operations on arrays or scalars, i.e. a scalar function must
be computed row-by-row with no state where each output row
@@ -2684,17 +2692,18 @@ def register_scalar_function(func, function_name, function_doc, in_types, out_ty
func : callable
A callable implementing the user-defined function.
The first argument is the context argument of type
ScalarUdfContext.
UdfContext.
Then, it must take arguments equal to the number of
in_types defined. It must return an Array or Scalar
matching the out_type. It must return a Scalar if
all arguments are scalar, else it must return an Array.

To define a varargs function, pass a callable that takes
varargs. The last in_type will be the type of all varargs
*args. The last in_type will be the type of all varargs
arguments.
function_name : str
Name of the function. This name must be globally unique.
Name of the function. There should only be one function
registered with this name in the function registry.
function_doc : dict
A dictionary object with keys "summary" (str),
and "description" (str).
@@ -2738,18 +2747,97 @@ def register_scalar_function(func, function_name, function_doc, in_types, out_ty
21
]
"""
return _register_scalar_like_function(get_register_scalar_function(),
func, function_name, function_doc, in_types,
out_type, func_registry)
return _register_user_defined_function(get_register_scalar_function(),
func, function_name, function_doc, in_types,
out_type, func_registry)


def register_aggregate_function(func, function_name, function_doc, in_types, out_type,
func_registry=None):
"""
Register a user-defined non-decomposable aggregate function.

This API is EXPERIMENTAL.

A non-decomposable aggregate function is a function that executes
aggregate operations on the whole of the data that it is aggregating.
In other words, a non-decomposable aggregate function cannot be
split into consume/merge/finalize steps.

This is often used with ordered or segmented aggregation, where groups
can be emitted before accumulating all of the input data.

Parameters
----------
func : callable
A callable implementing the user-defined function.
The first argument is the context argument of type
UdfContext.
Then, it must take arguments equal to the number of
in_types defined. It must return a Scalar matching the
out_type.
To define a varargs function, pass a callable that takes
*args. The in_types need to match the types of the inputs when
the function gets called.
function_name : str
Name of the function. This name must be unique, i.e.,
there should only be one function registered with
this name in the function registry.
function_doc : dict
A dictionary object with keys "summary" (str),
and "description" (str).
in_types : Dict[str, DataType]
A dictionary mapping function argument names to
their respective DataType.
The argument names will be used to generate
documentation for the function. The number of
arguments specified here determines the function
arity.
out_type : DataType
Output type of the function.
func_registry : FunctionRegistry
Optional function registry to use instead of the default global one.

Examples
--------
>>> import numpy as np
>>> import pyarrow as pa
>>> import pyarrow.compute as pc
>>>
>>> func_doc = {}
>>> func_doc["summary"] = "simple median udf"
>>> func_doc["description"] = "compute median"
>>>
>>> def compute_median(ctx, array):
... return pa.scalar(np.median(array))
>>>
>>> func_name = "py_compute_median"
>>> in_types = {"array": pa.int64()}
>>> out_type = pa.float64()
>>> pc.register_aggregate_function(compute_median, func_name, func_doc,
... in_types, out_type)
>>>
>>> func = pc.get_function(func_name)
>>> func.name
'py_compute_median'
>>> answer = pc.call_function(func_name, [pa.array([20, 40])])
>>> answer
<pyarrow.DoubleScalar: 30.0>
"""
return _register_user_defined_function(get_register_aggregate_function(),
func, function_name, function_doc, in_types,
out_type, func_registry)


def register_tabular_function(func, function_name, function_doc, in_types, out_type,
func_registry=None):
"""
Register a user-defined tabular function.

This API is EXPERIMENTAL.

A tabular function is one accepting a context argument of type
ScalarUdfContext and returning a generator of struct arrays.
UdfContext and returning a generator of struct arrays.
The in_types argument must be empty and the out_type argument
specifies a schema. Each struct array must have field types
correspoding to the schema.
Expand All @@ -2759,11 +2847,12 @@ def register_tabular_function(func, function_name, function_doc, in_types, out_t
func : callable
A callable implementing the user-defined function.
The only argument is the context argument of type
ScalarUdfContext. It must return a callable that
UdfContext. It must return a callable that
returns on each invocation a StructArray matching
the out_type, where an empty array indicates end.
function_name : str
Name of the function. This name must be globally unique.
Name of the function. There should only be one function
registered with this name in the function registry.
function_doc : dict
A dictionary object with keys "summary" (str),
and "description" (str).
@@ -2783,46 +2872,34 @@ def register_tabular_function(func, function_name, function_doc, in_types, out_t
with nogil:
c_type = <shared_ptr[CDataType]>make_shared[CStructType](deref(c_schema).fields())
out_type = pyarrow_wrap_data_type(c_type)
return _register_scalar_like_function(get_register_tabular_function(),
func, function_name, function_doc, in_types,
out_type, func_registry)
return _register_user_defined_function(get_register_tabular_function(),
func, function_name, function_doc, in_types,
out_type, func_registry)


def _register_scalar_like_function(register_func, func, function_name, function_doc, in_types,
out_type, func_registry=None):
def _register_user_defined_function(register_func, func, function_name, function_doc, in_types,
out_type, func_registry=None):
"""
Register a user-defined scalar-like function.
Register a user-defined function.

A scalar-like function is a callable accepting a first
context argument of type ScalarUdfContext as well as
possibly additional Arrow arguments, and returning a
an Arrow result appropriate for the kind of function.
A scalar function and a tabular function are examples
for scalar-like functions.
This function is normally not called directly but via
register_scalar_function or register_tabular_function.
This method itself doesn't care about the type of the UDF
(i.e., scalar vs tabular vs aggregate)

Parameters
----------
register_func: object
An object holding a CRegisterUdf in a "register_func" attribute. Use
get_register_scalar_function() for a scalar function and
get_register_tabular_function() for a tabular function.
An object holding a CRegisterUdf in a "register_func" attribute.
func : callable
A callable implementing the user-defined function.
See register_scalar_function and
register_tabular_function for details.

function_name : str
Name of the function. This name must be globally unique.
Name of the function. There should only be one function
registered with this name in the function registry.
function_doc : dict
A dictionary object with keys "summary" (str),
and "description" (str).
in_types : Dict[str, DataType]
A dictionary mapping function argument names to
their respective DataType.
See register_scalar_function and
register_tabular_function for details.
out_type : DataType
Output type of the function.
func_registry : FunctionRegistry
3 changes: 2 additions & 1 deletion python/pyarrow/compute.py
@@ -84,7 +84,8 @@
call_tabular_function,
register_scalar_function,
register_tabular_function,
ScalarUdfContext,
register_aggregate_function,
UdfContext,
# Expressions
Expression,
)
56 changes: 56 additions & 0 deletions python/pyarrow/conftest.py
@@ -278,3 +278,59 @@ def unary_function(ctx, x):
{"array": pa.int64()},
pa.int64())
return unary_function, func_name


@pytest.fixture(scope="session")
def unary_agg_func_fixture():
Contributor Author: This is used in both test_udf.py and test_substrait.py.

"""
Register a unary aggregate function
"""
from pyarrow import compute as pc
import numpy as np

def func(ctx, x):
return pa.scalar(np.nanmean(x))

func_name = "y=avg(x)"
func_doc = {"summary": "y=avg(x)",
"description": "find mean of x"}

pc.register_aggregate_function(func,
func_name,
func_doc,
{
"x": pa.float64(),
},
pa.float64()
)
return func, func_name


@pytest.fixture(scope="session")
def varargs_agg_func_fixture():
"""
Register a varargs aggregate function
"""
from pyarrow import compute as pc
import numpy as np

def func(ctx, *args):
    total = 0.0  # avoid shadowing the builtin sum
    for arg in args:
        total += np.nanmean(arg)
    return pa.scalar(total)

func_name = "y=sum_mean(x...)"
func_doc = {"summary": "Varargs aggregate",
"description": "Varargs aggregate"}

pc.register_aggregate_function(func,
func_name,
func_doc,
{
"x": pa.int64(),
"y": pa.float64()
Member (on lines +331 to +332): What are x and y if this is a varargs function?

Contributor Author (@icexelloss, Jun 1, 2023): Admittedly this is weird/confusing, but here is why:

This is not a truly "varargs" function, as this function must take two arguments x and y with the specified types.
This test case matches how we would use this internally. The end user would define something like

def foo(x: pd.Series, y: pd.Series):
    return np.nanmean(x) + np.nanmean(y)

summarize(table, agg=foo, columns=['x', 'y'], by='time')

We would then wrap foo into a function that Acero is expecting (a varargs UDF)

def get_acero_func(func):
    # This wrapper turns func into what Acero is expecting
    def acero_func(ctx, *args):
        return pa.scalar(func(*[arg.to_pandas() for arg in args]))

    return acero_func

And also register it in Acero on the fly when the user executes the expression.

In other words, this is a function with a known number of inputs (two in this case) but with a function signature that takes *args, where the args are always x and y.

Member: Ok, so the test case is verifying that the Python function can take in *args if needed (even though it still lists the args when registering)?

Contributor Author (@icexelloss, Jun 8, 2023): That is correct. It is testing a UDF that takes a fixed number of inputs but has a *args function signature.

},
pa.float64()
)
return func, func_name