From e2e8e770f78ef87a6a95f456926f85c88d7d80dc Mon Sep 17 00:00:00 2001 From: Jin Shang Date: Thu, 8 Jun 2023 04:32:16 +0800 Subject: [PATCH] GH-32190: [C++] Implement cumulative product, max, and min compute functions --- cpp/src/arrow/compute/api_vector.cc | 39 +- cpp/src/arrow/compute/api_vector.h | 61 +- cpp/src/arrow/compute/function_internal.h | 56 +- .../kernels/base_arithmetic_internal.h | 84 +++ .../arrow/compute/kernels/codegen_internal.h | 3 + .../compute/kernels/vector_cumulative_ops.cc | 95 ++- .../kernels/vector_cumulative_ops_test.cc | 659 ++++++++++++++++-- cpp/src/arrow/scalar.h | 3 + docs/source/cpp/compute.rst | 49 +- docs/source/python/api/compute.rst | 12 +- python/pyarrow/_compute.pyx | 24 +- python/pyarrow/compute.py | 2 +- python/pyarrow/includes/libarrow.pxd | 9 +- python/pyarrow/tests/test_compute.py | 174 ++++- 14 files changed, 1085 insertions(+), 185 deletions(-) diff --git a/cpp/src/arrow/compute/api_vector.cc b/cpp/src/arrow/compute/api_vector.cc index 5044d4f25690a..b33e3feb72993 100644 --- a/cpp/src/arrow/compute/api_vector.cc +++ b/cpp/src/arrow/compute/api_vector.cc @@ -26,6 +26,7 @@ #include "arrow/array/array_nested.h" #include "arrow/array/builder_primitive.h" #include "arrow/compute/exec.h" +#include "arrow/compute/function.h" #include "arrow/compute/function_internal.h" #include "arrow/compute/kernels/vector_sort_internal.h" #include "arrow/compute/registry.h" @@ -142,9 +143,9 @@ static auto kPartitionNthOptionsType = GetFunctionOptionsType( DataMember("k", &SelectKOptions::k), DataMember("sort_keys", &SelectKOptions::sort_keys)); -static auto kCumulativeSumOptionsType = GetFunctionOptionsType( - DataMember("start", &CumulativeSumOptions::start), - DataMember("skip_nulls", &CumulativeSumOptions::skip_nulls)); +static auto kCumulativeOptionsType = GetFunctionOptionsType( + DataMember("start", &CumulativeOptions::start), + DataMember("skip_nulls", &CumulativeOptions::skip_nulls)); static auto kRankOptionsType = 
GetFunctionOptionsType( DataMember("sort_keys", &RankOptions::sort_keys), DataMember("null_placement", &RankOptions::null_placement), @@ -198,13 +199,15 @@ SelectKOptions::SelectKOptions(int64_t k, std::vector sort_keys) sort_keys(std::move(sort_keys)) {} constexpr char SelectKOptions::kTypeName[]; -CumulativeSumOptions::CumulativeSumOptions(double start, bool skip_nulls) - : CumulativeSumOptions(std::make_shared(start), skip_nulls) {} -CumulativeSumOptions::CumulativeSumOptions(std::shared_ptr start, bool skip_nulls) - : FunctionOptions(internal::kCumulativeSumOptionsType), +CumulativeOptions::CumulativeOptions(bool skip_nulls) + : FunctionOptions(internal::kCumulativeOptionsType), skip_nulls(skip_nulls) {} +CumulativeOptions::CumulativeOptions(double start, bool skip_nulls) + : CumulativeOptions(std::make_shared(start), skip_nulls) {} +CumulativeOptions::CumulativeOptions(std::shared_ptr start, bool skip_nulls) + : FunctionOptions(internal::kCumulativeOptionsType), start(std::move(start)), skip_nulls(skip_nulls) {} -constexpr char CumulativeSumOptions::kTypeName[]; +constexpr char CumulativeOptions::kTypeName[]; RankOptions::RankOptions(std::vector sort_keys, NullPlacement null_placement, RankOptions::Tiebreaker tiebreaker) @@ -224,7 +227,7 @@ void RegisterVectorOptions(FunctionRegistry* registry) { DCHECK_OK(registry->AddFunctionOptionsType(kSortOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kPartitionNthOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kSelectKOptionsType)); - DCHECK_OK(registry->AddFunctionOptionsType(kCumulativeSumOptionsType)); + DCHECK_OK(registry->AddFunctionOptionsType(kCumulativeOptionsType)); DCHECK_OK(registry->AddFunctionOptionsType(kRankOptionsType)); } } // namespace internal @@ -375,12 +378,28 @@ Result> DropNull(const Array& values, ExecContext* ctx) { // ---------------------------------------------------------------------- // Cumulative functions -Result CumulativeSum(const Datum& values, const 
CumulativeSumOptions& options, +Result CumulativeSum(const Datum& values, const CumulativeOptions& options, bool check_overflow, ExecContext* ctx) { auto func_name = check_overflow ? "cumulative_sum_checked" : "cumulative_sum"; return CallFunction(func_name, {Datum(values)}, &options, ctx); } +Result CumulativeProd(const Datum& values, const CumulativeOptions& options, + bool check_overflow, ExecContext* ctx) { + auto func_name = check_overflow ? "cumulative_prod_checked" : "cumulative_prod"; + return CallFunction(func_name, {Datum(values)}, &options, ctx); +} + +Result CumulativeMax(const Datum& values, const CumulativeOptions& options, + ExecContext* ctx) { + return CallFunction("cumulative_max", {Datum(values)}, &options, ctx); +} + +Result CumulativeMin(const Datum& values, const CumulativeOptions& options, + ExecContext* ctx) { + return CallFunction("cumulative_min", {Datum(values)}, &options, ctx); +} + // ---------------------------------------------------------------------- // Deprecated functions diff --git a/cpp/src/arrow/compute/api_vector.h b/cpp/src/arrow/compute/api_vector.h index d02c505f3e59a..35899e51f8eeb 100644 --- a/cpp/src/arrow/compute/api_vector.h +++ b/cpp/src/arrow/compute/api_vector.h @@ -210,21 +210,29 @@ class ARROW_EXPORT PartitionNthOptions : public FunctionOptions { NullPlacement null_placement; }; -/// \brief Options for cumulative sum function -class ARROW_EXPORT CumulativeSumOptions : public FunctionOptions { +/// \brief Options for cumulative functions +/// \note Also aliased as CumulativeSumOptions for backward compatibility +class ARROW_EXPORT CumulativeOptions : public FunctionOptions { public: - explicit CumulativeSumOptions(double start = 0, bool skip_nulls = false); - explicit CumulativeSumOptions(std::shared_ptr start, bool skip_nulls = false); - static constexpr char const kTypeName[] = "CumulativeSumOptions"; - static CumulativeSumOptions Defaults() { return CumulativeSumOptions(); } - - /// Optional starting value for 
cumulative operation computation - std::shared_ptr start; + explicit CumulativeOptions(bool skip_nulls = false); + explicit CumulativeOptions(double start, bool skip_nulls = false); + explicit CumulativeOptions(std::shared_ptr start, bool skip_nulls = false); + static constexpr char const kTypeName[] = "CumulativeOptions"; + static CumulativeOptions Defaults() { return CumulativeOptions(); } + + /// Optional starting value for cumulative operation computation, default depends on the + /// operation and input type. + /// - sum: 0 + /// - prod: 1 + /// - min: maximum of the input type + /// - max: minimum of the input type + std::optional> start; /// If true, nulls in the input are ignored and produce a corresponding null output. /// When false, the first null encountered is propagated through the remaining output. bool skip_nulls = false; }; +using CumulativeSumOptions = CumulativeOptions; // For backward compatibility /// @} @@ -601,10 +609,41 @@ Result RunEndDecode(const Datum& value, ExecContext* ctx = NULLPTR); /// \param[in] ctx the function execution context, optional ARROW_EXPORT Result CumulativeSum( - const Datum& values, - const CumulativeSumOptions& options = CumulativeSumOptions::Defaults(), + const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), bool check_overflow = false, ExecContext* ctx = NULLPTR); +/// \brief Compute the cumulative product of an array-like object +/// +/// \param[in] values array-like input +/// \param[in] options configures cumulative prod behavior +/// \param[in] check_overflow whether to check for overflow, if true, return Invalid +/// status on overflow, otherwise wrap around on overflow +/// \param[in] ctx the function execution context, optional +ARROW_EXPORT +Result CumulativeProd( + const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), + bool check_overflow = false, ExecContext* ctx = NULLPTR); + +/// \brief Compute the cumulative max of an array-like object 
+/// +/// \param[in] values array-like input +/// \param[in] options configures cumulative max behavior +/// \param[in] ctx the function execution context, optional +ARROW_EXPORT +Result CumulativeMax( + const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), + ExecContext* ctx = NULLPTR); + +/// \brief Compute the cumulative min of an array-like object +/// +/// \param[in] values array-like input +/// \param[in] options configures cumulative min behavior +/// \param[in] ctx the function execution context, optional +ARROW_EXPORT +Result CumulativeMin( + const Datum& values, const CumulativeOptions& options = CumulativeOptions::Defaults(), + ExecContext* ctx = NULLPTR); + // ---------------------------------------------------------------------- // Deprecated functions diff --git a/cpp/src/arrow/compute/function_internal.h b/cpp/src/arrow/compute/function_internal.h index c0dbaac1004ce..d5152e74365eb 100644 --- a/cpp/src/arrow/compute/function_internal.h +++ b/cpp/src/arrow/compute/function_internal.h @@ -28,6 +28,7 @@ #include "arrow/compute/function.h" #include "arrow/compute/type_fwd.h" #include "arrow/result.h" +#include "arrow/scalar.h" #include "arrow/status.h" #include "arrow/util/checked_cast.h" #include "arrow/util/key_value_metadata.h" @@ -283,12 +284,6 @@ static inline Result()))> GenericToScalar( return MakeScalar(value); } -template -static inline Result()))> GenericToScalar( - const std::optional& value) { - return value.has_value() ? MakeScalar(value.value()) : MakeScalar(""); -} - // For Clang/libc++: when iterating through vector, we can't // pass it by reference so the overload above doesn't apply static inline Result> GenericToScalar(bool value) { @@ -382,6 +377,12 @@ static inline Result> GenericToScalar(const Datum& value } } +template +static inline Result()))> GenericToScalar( + const std::optional& value) { + return value.has_value() ? 
MakeScalar(value.value()) : std::make_shared(); +} + template static inline enable_if_primitive_ctype::ArrowType, Result> GenericFromScalar(const std::shared_ptr& value) { @@ -404,26 +405,6 @@ GenericFromScalar(const std::shared_ptr& value) { return ValidateEnumValue(raw_val); } -template -constexpr bool is_optional_impl = false; -template -constexpr bool is_optional_impl> = true; - -template -using is_optional = - std::integral_constant> || - std::is_same::value>; - -template -using enable_if_optional = enable_if_t::value, Result>; - -template -static inline enable_if_optional GenericFromScalar( - const std::shared_ptr& value) { - using value_type = typename T::value_type; - return GenericFromScalar(value); -} - template using enable_if_same_result = enable_if_same>; @@ -510,6 +491,29 @@ static inline enable_if_same_result GenericFromScalar( return Status::Invalid("Cannot deserialize Datum from ", value->ToString()); } +template +constexpr bool is_optional_impl = false; +template +constexpr bool is_optional_impl> = true; + +template +using is_optional = + std::integral_constant> || + std::is_same::value>; + +template +using enable_if_optional = enable_if_t::value, Result>; + +template +static inline enable_if_optional GenericFromScalar( + const std::shared_ptr& value) { + using value_type = typename T::value_type; + if (value->type->id() == Type::NA) { + return std::nullopt; + } + return GenericFromScalar(value); +} + template static enable_if_same::ArrowType, ListType, Result> GenericFromScalar(const std::shared_ptr& value) { diff --git a/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h b/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h index 65329e10db603..a066ffd18076d 100644 --- a/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h +++ b/cpp/src/arrow/compute/kernels/base_arithmetic_internal.h @@ -60,6 +60,11 @@ struct Add { static enable_if_decimal_value Call(KernelContext*, Arg0 left, Arg1 right, Status*) { return left + right; } + + 
template + static constexpr T Identity() { + return static_cast(0); + } }; struct AddChecked { @@ -85,6 +90,11 @@ struct AddChecked { static enable_if_decimal_value Call(KernelContext*, Arg0 left, Arg1 right, Status*) { return left + right; } + + template + static constexpr T Identity() { + return static_cast(0); + } }; template @@ -331,6 +341,11 @@ struct Multiply { static enable_if_decimal_value Call(KernelContext*, Arg0 left, Arg1 right, Status*) { return left * right; } + + template + static constexpr T Identity() { + return static_cast(1); + } }; struct MultiplyChecked { @@ -356,6 +371,11 @@ struct MultiplyChecked { static enable_if_decimal_value Call(KernelContext*, Arg0 left, Arg1 right, Status*) { return left * right; } + + template + static constexpr T Identity() { + return static_cast(1); + } }; struct Divide { @@ -605,6 +625,70 @@ struct Sign { } }; +struct Max { + template + static constexpr enable_if_not_floating_value Call(KernelContext*, Arg0 arg0, + Arg1 arg1, Status*) { + static_assert(std::is_same::value && std::is_same::value); + return std::max(arg0, arg1); + } + + template + static constexpr enable_if_floating_value Call(KernelContext*, Arg0 left, Arg1 right, + Status*) { + static_assert(std::is_same::value && std::is_same::value); + if (std::isnan(left)) { + return right; + } else if (std::isnan(right)) { + return left; + } else { + return std::max(left, right); + } + } + + template + static constexpr enable_if_decimal_value Identity() { + return T::GetMinSentinel(); + } + + template + static constexpr T Identity() { + return std::numeric_limits::lowest(); + } +}; + +struct Min { + template + static constexpr enable_if_not_floating_value Call(KernelContext*, Arg0 arg0, + Arg1 arg1, Status*) { + static_assert(std::is_same::value && std::is_same::value); + return std::min(arg0, arg1); + } + + template + static constexpr enable_if_floating_value Call(KernelContext*, Arg0 left, Arg1 right, + Status*) { + static_assert(std::is_same::value && 
std::is_same::value); + if (std::isnan(left)) { + return right; + } else if (std::isnan(right)) { + return left; + } else { + return std::min(left, right); + } + } + + template + static constexpr enable_if_decimal_value Identity() { + return T::GetMaxSentinel(); + } + + template + static constexpr T Identity() { + return std::numeric_limits::max(); + } +}; + } // namespace internal } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/compute/kernels/codegen_internal.h b/cpp/src/arrow/compute/kernels/codegen_internal.h index 3dd1f2b81128e..6224a9fc2af99 100644 --- a/cpp/src/arrow/compute/kernels/codegen_internal.h +++ b/cpp/src/arrow/compute/kernels/codegen_internal.h @@ -213,6 +213,9 @@ using enable_if_integer_value = template using enable_if_floating_value = enable_if_t::value, R>; +template +using enable_if_not_floating_value = enable_if_t::value, R>; + template using enable_if_decimal_value = enable_if_t::value || std::is_same::value, diff --git a/cpp/src/arrow/compute/kernels/vector_cumulative_ops.cc b/cpp/src/arrow/compute/kernels/vector_cumulative_ops.cc index 91d78f690b54a..2b68f55af22fa 100644 --- a/cpp/src/arrow/compute/kernels/vector_cumulative_ops.cc +++ b/cpp/src/arrow/compute/kernels/vector_cumulative_ops.cc @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. 
+#include #include "arrow/array/array_base.h" #include "arrow/array/builder_primitive.h" #include "arrow/compute/api_scalar.h" @@ -49,15 +50,12 @@ struct CumulativeOptionsWrapper : public OptionsWrapper { } const auto& start = options->start; - if (!start || !start->is_valid) { - return Status::Invalid("Cumulative `start` option must be non-null and valid"); - } - // Ensure `start` option matches input type - if (!start->type->Equals(*args.inputs[0])) { - ARROW_ASSIGN_OR_RAISE( - auto casted_start, - Cast(Datum(start), args.inputs[0], CastOptions::Safe(), ctx->exec_context())); + // Ensure `start` option, if given, matches input type, + if (start.has_value() && !start.value()->type->Equals(*args.inputs[0])) { + ARROW_ASSIGN_OR_RAISE(auto casted_start, + Cast(Datum(start.value()), args.inputs[0], + CastOptions::Safe(), ctx->exec_context())); auto new_options = OptionsType(casted_start.scalar(), options->skip_nulls); return std::make_unique(new_options); } @@ -66,10 +64,11 @@ struct CumulativeOptionsWrapper : public OptionsWrapper { }; // The driver kernel for all cumulative compute functions. Op is a compute kernel -// representing any binary associative operation (add, product, min, max, etc.) and -// OptionsType the options type corresponding to Op. ArgType and OutType are the input -// and output types, which will normally be the same (e.g. the cumulative sum of an array -// of Int64Type will result in an array of Int64Type). +// representing any binary associative operation with an identity element (add, product, +// min, max, etc.), i.e. ones that form a monoid, and OptionsType the options type +// corresponding to Op. ArgType and OutType are the input and output types, which will +// normally be the same (e.g. the cumulative sum of an array of Int64Type will result in +// an array of Int64Type). 
template struct Accumulator { using OutValue = typename GetOutputType::T; @@ -118,10 +117,15 @@ struct Accumulator { template struct CumulativeKernel { + using OutValue = typename GetOutputType::T; static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const auto& options = CumulativeOptionsWrapper::Get(ctx); Accumulator accumulator(ctx); - accumulator.current_value = UnboxScalar::Unbox(*(options.start)); + if (options.start.has_value()) { + accumulator.current_value = UnboxScalar::Unbox(*(options.start.value())); + } else { + accumulator.current_value = Op::template Identity(); + } accumulator.skip_nulls = options.skip_nulls; RETURN_NOT_OK(accumulator.builder.Reserve(batch.length)); @@ -136,10 +140,15 @@ struct CumulativeKernel { template struct CumulativeKernelChunked { + using OutValue = typename GetOutputType::T; static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) { const auto& options = CumulativeOptionsWrapper::Get(ctx); Accumulator accumulator(ctx); - accumulator.current_value = UnboxScalar::Unbox(*(options.start)); + if (options.start.has_value()) { + accumulator.current_value = UnboxScalar::Unbox(*(options.start.value())); + } else { + accumulator.current_value = Op::template Identity(); + } accumulator.skip_nulls = options.skip_nulls; const ChunkedArray& chunked_input = *batch[0].chunked_array(); @@ -160,18 +169,52 @@ const FunctionDoc cumulative_sum_doc{ ("`values` must be numeric. Return an array/chunked array which is the\n" "cumulative sum computed over `values`. Results will wrap around on\n" "integer overflow. Use function \"cumulative_sum_checked\" if you want\n" - "overflow to return an error."), + "overflow to return an error. The default start is 0."), {"values"}, - "CumulativeSumOptions"}; + "CumulativeOptions"}; const FunctionDoc cumulative_sum_checked_doc{ "Compute the cumulative sum over a numeric input", ("`values` must be numeric. 
Return an array/chunked array which is the\n" "cumulative sum computed over `values`. This function returns an error\n" "on overflow. For a variant that doesn't fail on overflow, use\n" - "function \"cumulative_sum\"."), + "function \"cumulative_sum\". The default start is 0."), + {"values"}, + "CumulativeOptions"}; + +const FunctionDoc cumulative_prod_doc{ + "Compute the cumulative product over a numeric input", + ("`values` must be numeric. Return an array/chunked array which is the\n" + "cumulative product computed over `values`. Results will wrap around on\n" + "integer overflow. Use function \"cumulative_prod_checked\" if you want\n" + "overflow to return an error. The default start is 1."), + {"values"}, + "CumulativeOptions"}; + +const FunctionDoc cumulative_prod_checked_doc{ + "Compute the cumulative product over a numeric input", + ("`values` must be numeric. Return an array/chunked array which is the\n" + "cumulative product computed over `values`. This function returns an error\n" + "on overflow. For a variant that doesn't fail on overflow, use\n" + "function \"cumulative_prod\". The default start is 1."), + {"values"}, + "CumulativeOptions"}; + +const FunctionDoc cumulative_max_doc{ + "Compute the cumulative max over a numeric input", + ("`values` must be numeric. Return an array/chunked array which is the\n" + "cumulative max computed over `values`. The default start is the minimum\n" + "value of input type."), {"values"}, - "CumulativeSumOptions"}; + "CumulativeOptions"}; + +const FunctionDoc cumulative_min_doc{ + "Compute the cumulative min over a numeric input", + ("`values` must be numeric. Return an array/chunked array which is the\n" + "cumulative min computed over `values`. 
The default start is the maximum\n" + "value of input type."), + {"values"}, + "CumulativeOptions"}; } // namespace template @@ -203,10 +246,20 @@ void MakeVectorCumulativeFunction(FunctionRegistry* registry, const std::string } void RegisterVectorCumulativeSum(FunctionRegistry* registry) { - MakeVectorCumulativeFunction(registry, "cumulative_sum", - cumulative_sum_doc); - MakeVectorCumulativeFunction( + MakeVectorCumulativeFunction(registry, "cumulative_sum", + cumulative_sum_doc); + MakeVectorCumulativeFunction( registry, "cumulative_sum_checked", cumulative_sum_checked_doc); + + MakeVectorCumulativeFunction(registry, "cumulative_prod", + cumulative_prod_doc); + MakeVectorCumulativeFunction( + registry, "cumulative_prod_checked", cumulative_prod_checked_doc); + + MakeVectorCumulativeFunction(registry, "cumulative_min", + cumulative_min_doc); + MakeVectorCumulativeFunction(registry, "cumulative_max", + cumulative_max_doc); } } // namespace internal diff --git a/cpp/src/arrow/compute/kernels/vector_cumulative_ops_test.cc b/cpp/src/arrow/compute/kernels/vector_cumulative_ops_test.cc index 3c6bb3c1d10d9..4ff46eb4acdfd 100644 --- a/cpp/src/arrow/compute/kernels/vector_cumulative_ops_test.cc +++ b/cpp/src/arrow/compute/kernels/vector_cumulative_ops_test.cc @@ -24,6 +24,7 @@ #include "arrow/array.h" #include "arrow/chunked_array.h" #include "arrow/compute/api_vector.h" +#include "arrow/scalar.h" #include "arrow/testing/gtest_util.h" #include "arrow/testing/util.h" #include "arrow/type.h" @@ -36,43 +37,44 @@ namespace arrow { namespace compute { -TEST(TestCumulativeSum, Empty) { - CumulativeSumOptions options; - for (auto ty : NumericTypes()) { - auto empty_arr = ArrayFromJSON(ty, "[]"); - auto empty_chunked = ChunkedArrayFromJSON(ty, {"[]"}); - CheckVectorUnary("cumulative_sum", empty_arr, empty_arr, &options); - CheckVectorUnary("cumulative_sum_checked", empty_arr, empty_arr, &options); +constexpr static std::array kCumulativeFunctionNames{ + "cumulative_sum", 
"cumulative_sum_checked", "cumulative_prod", + "cumulative_prod_checked", "cumulative_min", "cumulative_max"}; + +TEST(TestCumulative, Empty) { + for (auto function : kCumulativeFunctionNames) { + CumulativeOptions options; + for (auto ty : NumericTypes()) { + auto empty_arr = ArrayFromJSON(ty, "[]"); + auto empty_chunked = ChunkedArrayFromJSON(ty, {"[]"}); + CheckVectorUnary(function, empty_arr, empty_arr, &options); - CheckVectorUnary("cumulative_sum", empty_chunked, empty_chunked, &options); - CheckVectorUnary("cumulative_sum_checked", empty_chunked, empty_chunked, &options); + CheckVectorUnary(function, empty_chunked, empty_chunked, &options); + } } } -TEST(TestCumulativeSum, AllNulls) { - CumulativeSumOptions options; - for (auto ty : NumericTypes()) { - auto nulls_arr = ArrayFromJSON(ty, "[null, null, null]"); - auto nulls_one_chunk = ChunkedArrayFromJSON(ty, {"[null, null, null]"}); - auto nulls_three_chunks = ChunkedArrayFromJSON(ty, {"[null]", "[null]", "[null]"}); - CheckVectorUnary("cumulative_sum", nulls_arr, nulls_arr, &options); - CheckVectorUnary("cumulative_sum_checked", nulls_arr, nulls_arr, &options); - - CheckVectorUnary("cumulative_sum", nulls_one_chunk, nulls_one_chunk, &options); - CheckVectorUnary("cumulative_sum_checked", nulls_one_chunk, nulls_one_chunk, - &options); +TEST(TestCumulative, AllNulls) { + for (auto function : kCumulativeFunctionNames) { + CumulativeOptions options; + for (auto ty : NumericTypes()) { + auto nulls_arr = ArrayFromJSON(ty, "[null, null, null]"); + auto nulls_one_chunk = ChunkedArrayFromJSON(ty, {"[null, null, null]"}); + auto nulls_three_chunks = ChunkedArrayFromJSON(ty, {"[null]", "[null]", "[null]"}); + CheckVectorUnary(function, nulls_arr, nulls_arr, &options); - CheckVectorUnary("cumulative_sum", nulls_three_chunks, nulls_one_chunk, &options); - CheckVectorUnary("cumulative_sum_checked", nulls_three_chunks, nulls_one_chunk, - &options); + CheckVectorUnary(function, nulls_one_chunk, nulls_one_chunk, &options); 
+ + CheckVectorUnary(function, nulls_three_chunks, nulls_one_chunk, &options); + } } } TEST(TestCumulativeSum, ScalarInput) { - CumulativeSumOptions no_start_no_skip; - CumulativeSumOptions no_start_do_skip(0, true); - CumulativeSumOptions has_start_no_skip(10); - CumulativeSumOptions has_start_do_skip(10, true); + CumulativeOptions no_start_no_skip; + CumulativeOptions no_start_do_skip(0, true); + CumulativeOptions has_start_no_skip(10.0); + CumulativeOptions has_start_do_skip(10, true); for (auto ty : NumericTypes()) { CheckVectorUnary("cumulative_sum", ScalarFromJSON(ty, "10"), @@ -105,6 +107,43 @@ TEST(TestCumulativeSum, ScalarInput) { } } +TEST(TestCumulativeProd, ScalarInput) { + CumulativeOptions no_start_no_skip; + CumulativeOptions no_start_do_skip(1, true); + CumulativeOptions has_start_no_skip(10.0); + CumulativeOptions has_start_do_skip(10, true); + + for (auto ty : NumericTypes()) { + CheckVectorUnary("cumulative_prod", ScalarFromJSON(ty, "10"), + ArrayFromJSON(ty, "[10]"), &no_start_no_skip); + CheckVectorUnary("cumulative_prod_checked", ScalarFromJSON(ty, "10"), + ArrayFromJSON(ty, "[10]"), &no_start_no_skip); + + CheckVectorUnary("cumulative_prod", ScalarFromJSON(ty, "10"), + ArrayFromJSON(ty, "[100]"), &has_start_no_skip); + CheckVectorUnary("cumulative_prod_checked", ScalarFromJSON(ty, "10"), + ArrayFromJSON(ty, "[100]"), &has_start_no_skip); + + CheckVectorUnary("cumulative_prod", ScalarFromJSON(ty, "null"), + ArrayFromJSON(ty, "[null]"), &no_start_no_skip); + CheckVectorUnary("cumulative_prod_checked", ScalarFromJSON(ty, "null"), + ArrayFromJSON(ty, "[null]"), &no_start_no_skip); + CheckVectorUnary("cumulative_prod", ScalarFromJSON(ty, "null"), + ArrayFromJSON(ty, "[null]"), &has_start_no_skip); + CheckVectorUnary("cumulative_prod_checked", ScalarFromJSON(ty, "null"), + ArrayFromJSON(ty, "[null]"), &has_start_no_skip); + + CheckVectorUnary("cumulative_prod", ScalarFromJSON(ty, "null"), + ArrayFromJSON(ty, "[null]"), &no_start_do_skip); + 
CheckVectorUnary("cumulative_prod_checked", ScalarFromJSON(ty, "null"), + ArrayFromJSON(ty, "[null]"), &no_start_do_skip); + CheckVectorUnary("cumulative_prod", ScalarFromJSON(ty, "null"), + ArrayFromJSON(ty, "[null]"), &has_start_do_skip); + CheckVectorUnary("cumulative_prod_checked", ScalarFromJSON(ty, "null"), + ArrayFromJSON(ty, "[null]"), &has_start_do_skip); + } +} + using testing::HasSubstr; template @@ -112,7 +151,7 @@ void CheckCumulativeSumUnsignedOverflow() { using CType = typename TypeTraits::CType; using BuilderType = typename TypeTraits::BuilderType; - CumulativeSumOptions pos_overflow(1); + CumulativeOptions pos_overflow(1.0); auto max = std::numeric_limits::max(); auto min = std::numeric_limits::lowest(); @@ -138,7 +177,7 @@ void CheckCumulativeSumSignedOverflow() { CheckCumulativeSumUnsignedOverflow(); - CumulativeSumOptions neg_overflow(-1); + CumulativeOptions neg_overflow(-1.0); auto max = std::numeric_limits::max(); auto min = std::numeric_limits::lowest(); @@ -167,8 +206,64 @@ TEST(TestCumulativeSum, IntegerOverflow) { CheckCumulativeSumSignedOverflow(); } +template +void CheckCumulativeProdUnsignedOverflow() { + using CType = typename TypeTraits::CType; + using BuilderType = typename TypeTraits::BuilderType; + + CumulativeOptions pos_overflow(2.0); + auto max = std::numeric_limits::max(); + auto min = std::numeric_limits::lowest(); + + BuilderType builder; + std::shared_ptr half_max_arr; + std::shared_ptr min_arr; + ASSERT_OK(builder.Append(max / 2 + 1)); // 2 * (max / 2 + 1) overflows to min + ASSERT_OK(builder.Finish(&half_max_arr)); + builder.Reset(); + ASSERT_OK(builder.Append(min)); + ASSERT_OK(builder.Finish(&min_arr)); + + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, HasSubstr("overflow"), + CallFunction("cumulative_prod_checked", {half_max_arr}, &pos_overflow)); + CheckVectorUnary("cumulative_prod", half_max_arr, min_arr, &pos_overflow); +} + +template +void CheckCumulativeProdSignedOverflow() { + using CType = typename 
TypeTraits::CType; + using BuilderType = typename TypeTraits::BuilderType; + + CheckCumulativeProdUnsignedOverflow(); + + CumulativeOptions neg_overflow(-1.0); // min * -1 overflows to min + auto min = std::numeric_limits::lowest(); + + BuilderType builder; + std::shared_ptr min_arr; + builder.Reset(); + ASSERT_OK(builder.Append(min)); + ASSERT_OK(builder.Finish(&min_arr)); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, HasSubstr("overflow"), + CallFunction("cumulative_prod_checked", {min_arr}, &neg_overflow)); + CheckVectorUnary("cumulative_prod", min_arr, min_arr, &neg_overflow); +} + +TEST(TestCumulativeProd, IntegerOverflow) { + CheckCumulativeProdUnsignedOverflow(); + CheckCumulativeProdUnsignedOverflow(); + CheckCumulativeProdUnsignedOverflow(); + CheckCumulativeProdUnsignedOverflow(); + CheckCumulativeProdSignedOverflow(); + CheckCumulativeProdSignedOverflow(); + CheckCumulativeProdSignedOverflow(); + CheckCumulativeProdSignedOverflow(); +} + TEST(TestCumulativeSum, NoStartNoSkip) { - CumulativeSumOptions options; + CumulativeOptions options; for (auto ty : NumericTypes()) { CheckVectorUnary("cumulative_sum", ArrayFromJSON(ty, "[1, 2, 3, 4, 5, 6]"), ArrayFromJSON(ty, "[1, 3, 6, 10, 15, 21]"), &options); @@ -212,52 +307,53 @@ TEST(TestCumulativeSum, NoStartNoSkip) { } } -TEST(TestCumulativeSum, NoStartDoSkip) { - CumulativeSumOptions options(0, true); +TEST(TestCumulativeSum, HasStartNoSkip) { + CumulativeOptions options(10.0); for (auto ty : NumericTypes()) { CheckVectorUnary("cumulative_sum", ArrayFromJSON(ty, "[1, 2, 3, 4, 5, 6]"), - ArrayFromJSON(ty, "[1, 3, 6, 10, 15, 21]"), &options); + ArrayFromJSON(ty, "[11, 13, 16, 20, 25, 31]"), &options); CheckVectorUnary("cumulative_sum_checked", ArrayFromJSON(ty, "[1, 2, 3, 4, 5, 6]"), - ArrayFromJSON(ty, "[1, 3, 6, 10, 15, 21]"), &options); + ArrayFromJSON(ty, "[11, 13, 16, 20, 25, 31]"), &options); CheckVectorUnary("cumulative_sum", ArrayFromJSON(ty, "[1, 2, null, 4, null, 6]"), - ArrayFromJSON(ty, "[1, 3, null, 
7, null, 13]"), &options); + ArrayFromJSON(ty, "[11, 13, null, null, null, null]"), &options); CheckVectorUnary("cumulative_sum_checked", ArrayFromJSON(ty, "[1, 2, null, 4, null, 6]"), - ArrayFromJSON(ty, "[1, 3, null, 7, null, 13]"), &options); + ArrayFromJSON(ty, "[11, 13, null, null, null, null]"), &options); CheckVectorUnary("cumulative_sum", ArrayFromJSON(ty, "[null, 2, null, 4, null, 6]"), - ArrayFromJSON(ty, "[null, 2, null, 6, null, 12]"), &options); + ArrayFromJSON(ty, "[null, null, null, null, null, null]"), &options); CheckVectorUnary("cumulative_sum_checked", ArrayFromJSON(ty, "[null, 2, null, 4, null, 6]"), - ArrayFromJSON(ty, "[null, 2, null, 6, null, 12]"), &options); + ArrayFromJSON(ty, "[null, null, null, null, null, null]"), &options); CheckVectorUnary("cumulative_sum", ChunkedArrayFromJSON(ty, {"[1, 2, 3]", "[4, 5, 6]"}), - ChunkedArrayFromJSON(ty, {"[1, 3, 6, 10, 15, 21]"}), &options); + ChunkedArrayFromJSON(ty, {"[11, 13, 16, 20, 25, 31]"}), &options); CheckVectorUnary("cumulative_sum_checked", ChunkedArrayFromJSON(ty, {"[1, 2, 3]", "[4, 5, 6]"}), - ChunkedArrayFromJSON(ty, {"[1, 3, 6, 10, 15, 21]"}), &options); + ChunkedArrayFromJSON(ty, {"[11, 13, 16, 20, 25, 31]"}), &options); - CheckVectorUnary("cumulative_sum", - ChunkedArrayFromJSON(ty, {"[1, 2, null]", "[4, null, 6]"}), - ChunkedArrayFromJSON(ty, {"[1, 3, null, 7, null, 13]"}), &options); + CheckVectorUnary( + "cumulative_sum", ChunkedArrayFromJSON(ty, {"[1, 2, null]", "[4, null, 6]"}), + ChunkedArrayFromJSON(ty, {"[11, 13, null, null, null, null]"}), &options); CheckVectorUnary("cumulative_sum_checked", ChunkedArrayFromJSON(ty, {"[1, 2, null]", "[4, null, 6]"}), - ChunkedArrayFromJSON(ty, {"[1, 3, null, 7, null, 13]"}), &options); + ChunkedArrayFromJSON(ty, {"[11, 13, null, null, null, null]"}), + &options); CheckVectorUnary( "cumulative_sum", ChunkedArrayFromJSON(ty, {"[null, 2, null]", "[4, null, 6]"}), - ChunkedArrayFromJSON(ty, {"[null, 2, null, 6, null, 12]"}), &options); + 
ChunkedArrayFromJSON(ty, {"[null, null, null, null, null, null]"}), &options); CheckVectorUnary("cumulative_sum_checked", ChunkedArrayFromJSON(ty, {"[null, 2, null]", "[4, null, 6]"}), - ChunkedArrayFromJSON(ty, {"[null, 2, null, 6, null, 12]"}), + ChunkedArrayFromJSON(ty, {"[null, null, null, null, null, null]"}), &options); } } -TEST(TestCumulativeSum, HasStartNoSkip) { - CumulativeSumOptions options(10); +TEST(TestCumulativeSum, HasStartDoSkip) { + CumulativeOptions options(10, true); for (auto ty : NumericTypes()) { CheckVectorUnary("cumulative_sum", ArrayFromJSON(ty, "[1, 2, 3, 4, 5, 6]"), ArrayFromJSON(ty, "[11, 13, 16, 20, 25, 31]"), &options); @@ -265,16 +361,16 @@ TEST(TestCumulativeSum, HasStartNoSkip) { ArrayFromJSON(ty, "[11, 13, 16, 20, 25, 31]"), &options); CheckVectorUnary("cumulative_sum", ArrayFromJSON(ty, "[1, 2, null, 4, null, 6]"), - ArrayFromJSON(ty, "[11, 13, null, null, null, null]"), &options); + ArrayFromJSON(ty, "[11, 13, null, 17, null, 23]"), &options); CheckVectorUnary("cumulative_sum_checked", ArrayFromJSON(ty, "[1, 2, null, 4, null, 6]"), - ArrayFromJSON(ty, "[11, 13, null, null, null, null]"), &options); + ArrayFromJSON(ty, "[11, 13, null, 17, null, 23]"), &options); CheckVectorUnary("cumulative_sum", ArrayFromJSON(ty, "[null, 2, null, 4, null, 6]"), - ArrayFromJSON(ty, "[null, null, null, null, null, null]"), &options); + ArrayFromJSON(ty, "[null, 12, null, 16, null, 22]"), &options); CheckVectorUnary("cumulative_sum_checked", ArrayFromJSON(ty, "[null, 2, null, 4, null, 6]"), - ArrayFromJSON(ty, "[null, null, null, null, null, null]"), &options); + ArrayFromJSON(ty, "[null, 12, null, 16, null, 22]"), &options); CheckVectorUnary("cumulative_sum", ChunkedArrayFromJSON(ty, {"[1, 2, 3]", "[4, 5, 6]"}), @@ -285,76 +381,487 @@ TEST(TestCumulativeSum, HasStartNoSkip) { CheckVectorUnary( "cumulative_sum", ChunkedArrayFromJSON(ty, {"[1, 2, null]", "[4, null, 6]"}), - ChunkedArrayFromJSON(ty, {"[11, 13, null, null, null, null]"}), &options); 
+ ChunkedArrayFromJSON(ty, {"[11, 13, null, 17, null, 23]"}), &options); CheckVectorUnary("cumulative_sum_checked", ChunkedArrayFromJSON(ty, {"[1, 2, null]", "[4, null, 6]"}), - ChunkedArrayFromJSON(ty, {"[11, 13, null, null, null, null]"}), + ChunkedArrayFromJSON(ty, {"[11, 13, null, 17, null, 23]"}), &options); CheckVectorUnary( "cumulative_sum", ChunkedArrayFromJSON(ty, {"[null, 2, null]", "[4, null, 6]"}), - ChunkedArrayFromJSON(ty, {"[null, null, null, null, null, null]"}), &options); + ChunkedArrayFromJSON(ty, {"[null, 12, null, 16, null, 22]"}), &options); CheckVectorUnary("cumulative_sum_checked", ChunkedArrayFromJSON(ty, {"[null, 2, null]", "[4, null, 6]"}), - ChunkedArrayFromJSON(ty, {"[null, null, null, null, null, null]"}), + ChunkedArrayFromJSON(ty, {"[null, 12, null, 16, null, 22]"}), &options); } } -TEST(TestCumulativeSum, HasStartDoSkip) { - CumulativeSumOptions options(10, true); +TEST(TestCumulativeSum, NoStartDoSkip) { + CumulativeOptions options(0, true); for (auto ty : NumericTypes()) { CheckVectorUnary("cumulative_sum", ArrayFromJSON(ty, "[1, 2, 3, 4, 5, 6]"), - ArrayFromJSON(ty, "[11, 13, 16, 20, 25, 31]"), &options); + ArrayFromJSON(ty, "[1, 3, 6, 10, 15, 21]"), &options); CheckVectorUnary("cumulative_sum_checked", ArrayFromJSON(ty, "[1, 2, 3, 4, 5, 6]"), - ArrayFromJSON(ty, "[11, 13, 16, 20, 25, 31]"), &options); + ArrayFromJSON(ty, "[1, 3, 6, 10, 15, 21]"), &options); CheckVectorUnary("cumulative_sum", ArrayFromJSON(ty, "[1, 2, null, 4, null, 6]"), - ArrayFromJSON(ty, "[11, 13, null, 17, null, 23]"), &options); + ArrayFromJSON(ty, "[1, 3, null, 7, null, 13]"), &options); CheckVectorUnary("cumulative_sum_checked", ArrayFromJSON(ty, "[1, 2, null, 4, null, 6]"), - ArrayFromJSON(ty, "[11, 13, null, 17, null, 23]"), &options); + ArrayFromJSON(ty, "[1, 3, null, 7, null, 13]"), &options); CheckVectorUnary("cumulative_sum", ArrayFromJSON(ty, "[null, 2, null, 4, null, 6]"), - ArrayFromJSON(ty, "[null, 12, null, 16, null, 22]"), &options); + 
ArrayFromJSON(ty, "[null, 2, null, 6, null, 12]"), &options); CheckVectorUnary("cumulative_sum_checked", ArrayFromJSON(ty, "[null, 2, null, 4, null, 6]"), - ArrayFromJSON(ty, "[null, 12, null, 16, null, 22]"), &options); + ArrayFromJSON(ty, "[null, 2, null, 6, null, 12]"), &options); CheckVectorUnary("cumulative_sum", ChunkedArrayFromJSON(ty, {"[1, 2, 3]", "[4, 5, 6]"}), - ChunkedArrayFromJSON(ty, {"[11, 13, 16, 20, 25, 31]"}), &options); + ChunkedArrayFromJSON(ty, {"[1, 3, 6, 10, 15, 21]"}), &options); CheckVectorUnary("cumulative_sum_checked", ChunkedArrayFromJSON(ty, {"[1, 2, 3]", "[4, 5, 6]"}), - ChunkedArrayFromJSON(ty, {"[11, 13, 16, 20, 25, 31]"}), &options); + ChunkedArrayFromJSON(ty, {"[1, 3, 6, 10, 15, 21]"}), &options); - CheckVectorUnary( - "cumulative_sum", ChunkedArrayFromJSON(ty, {"[1, 2, null]", "[4, null, 6]"}), - ChunkedArrayFromJSON(ty, {"[11, 13, null, 17, null, 23]"}), &options); + CheckVectorUnary("cumulative_sum", + ChunkedArrayFromJSON(ty, {"[1, 2, null]", "[4, null, 6]"}), + ChunkedArrayFromJSON(ty, {"[1, 3, null, 7, null, 13]"}), &options); CheckVectorUnary("cumulative_sum_checked", ChunkedArrayFromJSON(ty, {"[1, 2, null]", "[4, null, 6]"}), - ChunkedArrayFromJSON(ty, {"[11, 13, null, 17, null, 23]"}), - &options); + ChunkedArrayFromJSON(ty, {"[1, 3, null, 7, null, 13]"}), &options); CheckVectorUnary( "cumulative_sum", ChunkedArrayFromJSON(ty, {"[null, 2, null]", "[4, null, 6]"}), - ChunkedArrayFromJSON(ty, {"[null, 12, null, 16, null, 22]"}), &options); + ChunkedArrayFromJSON(ty, {"[null, 2, null, 6, null, 12]"}), &options); CheckVectorUnary("cumulative_sum_checked", ChunkedArrayFromJSON(ty, {"[null, 2, null]", "[4, null, 6]"}), - ChunkedArrayFromJSON(ty, {"[null, 12, null, 16, null, 22]"}), + ChunkedArrayFromJSON(ty, {"[null, 2, null, 6, null, 12]"}), + &options); + } +} + +TEST(TestCumulativeProd, NoStartNoSkip) { + CumulativeOptions options; + for (auto ty : NumericTypes()) { + CheckVectorUnary("cumulative_prod", ArrayFromJSON(ty, "[1, 
2, 3, 4, 5]"), + ArrayFromJSON(ty, "[1, 2, 6, 24, 120]"), &options); + CheckVectorUnary("cumulative_prod_checked", ArrayFromJSON(ty, "[1, 2, 3, 4, 5]"), + ArrayFromJSON(ty, "[1, 2, 6, 24, 120]"), &options); + + CheckVectorUnary("cumulative_prod", ArrayFromJSON(ty, "[1, 2, null, 4, null, 6]"), + ArrayFromJSON(ty, "[1, 2, null, null, null, null]"), &options); + CheckVectorUnary("cumulative_prod_checked", + ArrayFromJSON(ty, "[1, 2, null, 4, null, 6]"), + ArrayFromJSON(ty, "[1, 2, null, null, null, null]"), &options); + + CheckVectorUnary("cumulative_prod", ArrayFromJSON(ty, "[null, 2, null, 4, null, 6]"), + ArrayFromJSON(ty, "[null, null, null, null, null, null]"), &options); + CheckVectorUnary("cumulative_prod_checked", + ArrayFromJSON(ty, "[null, 2, null, 4, null, 6]"), + ArrayFromJSON(ty, "[null, null, null, null, null, null]"), &options); + + CheckVectorUnary("cumulative_prod", ChunkedArrayFromJSON(ty, {"[1, 2, 3]", "[4, 5]"}), + ChunkedArrayFromJSON(ty, {"[1, 2, 6, 24, 120]"}), &options); + CheckVectorUnary("cumulative_prod_checked", + ChunkedArrayFromJSON(ty, {"[1, 2, 3]", "[4, 5]"}), + ChunkedArrayFromJSON(ty, {"[1, 2, 6, 24, 120]"}), &options); + + CheckVectorUnary( + "cumulative_prod", ChunkedArrayFromJSON(ty, {"[1, 2, null]", "[4, null, 6]"}), + ChunkedArrayFromJSON(ty, {"[1, 2, null, null, null, null]"}), &options); + CheckVectorUnary("cumulative_prod_checked", + ChunkedArrayFromJSON(ty, {"[1, 2, null]", "[4, null, 6]"}), + ChunkedArrayFromJSON(ty, {"[1, 2, null, null, null, null]"}), &options); + + CheckVectorUnary( + "cumulative_prod", ChunkedArrayFromJSON(ty, {"[null, 2, null]", "[4, null, 6]"}), + ChunkedArrayFromJSON(ty, {"[null, null, null, null, null, null]"}), &options); + CheckVectorUnary("cumulative_prod_checked", + ChunkedArrayFromJSON(ty, {"[null, 2, null]", "[4, null, 6]"}), + ChunkedArrayFromJSON(ty, {"[null, null, null, null, null, null]"}), + &options); + } +} + +TEST(TestCumulativeProd, HasStartNoSkip) { + CumulativeOptions options(2.0); + 
for (auto ty : NumericTypes()) { + CheckVectorUnary("cumulative_prod", ArrayFromJSON(ty, "[1, 2, 3, 4]"), + ArrayFromJSON(ty, "[2, 4, 12, 48]"), &options); + CheckVectorUnary("cumulative_prod_checked", ArrayFromJSON(ty, "[1, 2, 3, 4]"), + ArrayFromJSON(ty, "[2, 4, 12, 48]"), &options); + + CheckVectorUnary("cumulative_prod", ArrayFromJSON(ty, "[1, 2, null, 4]"), + ArrayFromJSON(ty, "[2, 4, null, null]"), &options); + CheckVectorUnary("cumulative_prod_checked", ArrayFromJSON(ty, "[1, 2, null, 4]"), + ArrayFromJSON(ty, "[2, 4, null, null]"), &options); + + CheckVectorUnary("cumulative_prod", ArrayFromJSON(ty, "[null, 2, null, 4]"), + ArrayFromJSON(ty, "[null, null, null, null]"), &options); + CheckVectorUnary("cumulative_prod_checked", ArrayFromJSON(ty, "[null, 2, null, 4]"), + ArrayFromJSON(ty, "[null, null, null, null]"), &options); + + CheckVectorUnary("cumulative_prod", ChunkedArrayFromJSON(ty, {"[1, 2]", "[3, 4]"}), + ChunkedArrayFromJSON(ty, {"[2, 4, 12, 48]"}), &options); + CheckVectorUnary("cumulative_prod_checked", + ChunkedArrayFromJSON(ty, {"[1, 2]", "[3, 4]"}), + ChunkedArrayFromJSON(ty, {"[2, 4, 12, 48]"}), &options); + + CheckVectorUnary("cumulative_prod", ChunkedArrayFromJSON(ty, {"[1, 2]", "[null, 4]"}), + ChunkedArrayFromJSON(ty, {"[2, 4, null, null]"}), &options); + CheckVectorUnary("cumulative_prod_checked", + ChunkedArrayFromJSON(ty, {"[1, 2]", "[null, 4]"}), + ChunkedArrayFromJSON(ty, {"[2, 4, null, null]"}), &options); + + CheckVectorUnary("cumulative_prod", + ChunkedArrayFromJSON(ty, {"[null, 2]", "[null, 4]"}), + ChunkedArrayFromJSON(ty, {"[null, null, null, null]"}), &options); + CheckVectorUnary("cumulative_prod_checked", + ChunkedArrayFromJSON(ty, {"[null, 2]", "[null, 4]"}), + ChunkedArrayFromJSON(ty, {"[null, null, null, null]"}), &options); + } +} + +TEST(TestCumulativeProd, HasStartDoSkip) { + CumulativeOptions options(2.0, true); + for (auto ty : NumericTypes()) { + CheckVectorUnary("cumulative_prod", ArrayFromJSON(ty, "[1, 2, 3, 4]"), 
+ ArrayFromJSON(ty, "[2, 4, 12, 48]"), &options); + CheckVectorUnary("cumulative_prod_checked", ArrayFromJSON(ty, "[1, 2, 3, 4]"), + ArrayFromJSON(ty, "[2, 4, 12, 48]"), &options); + + CheckVectorUnary("cumulative_prod", ArrayFromJSON(ty, "[1, 2, null, 4]"), + ArrayFromJSON(ty, "[2, 4, null, 16]"), &options); + CheckVectorUnary("cumulative_prod_checked", ArrayFromJSON(ty, "[1, 2, null, 4]"), + ArrayFromJSON(ty, "[2, 4, null, 16]"), &options); + + CheckVectorUnary("cumulative_prod", ArrayFromJSON(ty, "[null, 2, null, 4]"), + ArrayFromJSON(ty, "[null, 4, null, 16]"), &options); + CheckVectorUnary("cumulative_prod_checked", ArrayFromJSON(ty, "[null, 2, null, 4]"), + ArrayFromJSON(ty, "[null, 4, null, 16]"), &options); + + CheckVectorUnary("cumulative_prod", ChunkedArrayFromJSON(ty, {"[1, 2]", "[3, 4]"}), + ChunkedArrayFromJSON(ty, {"[2, 4, 12, 48]"}), &options); + CheckVectorUnary("cumulative_prod_checked", + ChunkedArrayFromJSON(ty, {"[1, 2]", "[3, 4]"}), + ChunkedArrayFromJSON(ty, {"[2, 4, 12, 48]"}), &options); + + CheckVectorUnary("cumulative_prod", ChunkedArrayFromJSON(ty, {"[1, 2]", "[null, 4]"}), + ChunkedArrayFromJSON(ty, {"[2, 4, null, 16]"}), &options); + CheckVectorUnary("cumulative_prod_checked", + ChunkedArrayFromJSON(ty, {"[1, 2]", "[null, 4]"}), + ChunkedArrayFromJSON(ty, {"[2, 4, null, 16]"}), &options); + + CheckVectorUnary("cumulative_prod", + ChunkedArrayFromJSON(ty, {"[null, 2]", "[null, 4]"}), + ChunkedArrayFromJSON(ty, {"[null, 4, null, 16]"}), &options); + CheckVectorUnary("cumulative_prod_checked", + ChunkedArrayFromJSON(ty, {"[null, 2]", "[null, 4]"}), + ChunkedArrayFromJSON(ty, {"[null, 4, null, 16]"}), &options); + } +} + +TEST(TestCumulativeProd, NoStartDoSkip) { + CumulativeOptions options(true); + for (auto ty : NumericTypes()) { + CheckVectorUnary("cumulative_prod", ArrayFromJSON(ty, "[1, 2, 3, 4]"), + ArrayFromJSON(ty, "[1, 2, 6, 24]"), &options); + CheckVectorUnary("cumulative_prod_checked", ArrayFromJSON(ty, "[1, 2, 3, 4]"), + 
ArrayFromJSON(ty, "[1, 2, 6, 24]"), &options); + + CheckVectorUnary("cumulative_prod", ArrayFromJSON(ty, "[1, 2, null, 4]"), + ArrayFromJSON(ty, "[1, 2, null, 8]"), &options); + CheckVectorUnary("cumulative_prod_checked", ArrayFromJSON(ty, "[1, 2, null, 4]"), + ArrayFromJSON(ty, "[1, 2, null, 8]"), &options); + + CheckVectorUnary("cumulative_prod", ArrayFromJSON(ty, "[null, 2, null, 4]"), + ArrayFromJSON(ty, "[null, 2, null, 8]"), &options); + CheckVectorUnary("cumulative_prod_checked", ArrayFromJSON(ty, "[null, 2, null, 4]"), + ArrayFromJSON(ty, "[null, 2, null, 8]"), &options); + + CheckVectorUnary("cumulative_prod", ChunkedArrayFromJSON(ty, {"[1, 2]", "[3, 4]"}), + ChunkedArrayFromJSON(ty, {"[1, 2, 6, 24]"}), &options); + CheckVectorUnary("cumulative_prod_checked", + ChunkedArrayFromJSON(ty, {"[1, 2]", "[3, 4]"}), + ChunkedArrayFromJSON(ty, {"[1, 2, 6, 24]"}), &options); + + CheckVectorUnary("cumulative_prod", ChunkedArrayFromJSON(ty, {"[1, 2]", "[null, 4]"}), + ChunkedArrayFromJSON(ty, {"[1, 2, null, 8]"}), &options); + CheckVectorUnary("cumulative_prod_checked", + ChunkedArrayFromJSON(ty, {"[1, 2]", "[null, 4]"}), + ChunkedArrayFromJSON(ty, {"[1, 2, null, 8]"}), &options); + + CheckVectorUnary("cumulative_prod", + ChunkedArrayFromJSON(ty, {"[null, 2]", "[null, 4]"}), + ChunkedArrayFromJSON(ty, {"[null, 2, null, 8]"}), &options); + CheckVectorUnary("cumulative_prod_checked", + ChunkedArrayFromJSON(ty, {"[null, 2]", "[null, 4]"}), + ChunkedArrayFromJSON(ty, {"[null, 2, null, 8]"}), &options); + } +} + +TEST(TestCumulativeMax, NoStartNoSkip) { + CumulativeOptions options; + for (auto ty : NumericTypes()) { + CheckVectorUnary("cumulative_max", ArrayFromJSON(ty, "[2, 1, 3, 5, 4, 6]"), + ArrayFromJSON(ty, "[2, 2, 3, 5, 5, 6]"), &options); + + CheckVectorUnary("cumulative_max", ArrayFromJSON(ty, "[2, 1, null, 5, null, 6]"), + ArrayFromJSON(ty, "[2, 2, null, null, null, null]"), &options); + + CheckVectorUnary("cumulative_max", ArrayFromJSON(ty, "[null, 1, null, 5, 
null, 6]"), + ArrayFromJSON(ty, "[null, null, null, null, null, null]"), &options); + + CheckVectorUnary("cumulative_max", + ChunkedArrayFromJSON(ty, {"[2, 1, 3]", "[5, 4, 6]"}), + ChunkedArrayFromJSON(ty, {"[2, 2, 3, 5, 5, 6]"}), &options); + + CheckVectorUnary( + "cumulative_max", ChunkedArrayFromJSON(ty, {"[2, 1, null]", "[5, null, 6]"}), + ChunkedArrayFromJSON(ty, {"[2, 2, null, null, null, null]"}), &options); + + CheckVectorUnary( + "cumulative_max", ChunkedArrayFromJSON(ty, {"[null, 1, null]", "[5, null, 6]"}), + ChunkedArrayFromJSON(ty, {"[null, null, null, null, null, null]"}), &options); + } +} + +TEST(TestCumulativeMax, HasStartNoSkip) { + CumulativeOptions options(3.0); + for (auto ty : NumericTypes()) { + CheckVectorUnary("cumulative_max", ArrayFromJSON(ty, "[2, 1, 3, 5, 4, 6]"), + ArrayFromJSON(ty, "[3, 3, 3, 5, 5, 6]"), &options); + + CheckVectorUnary("cumulative_max", ArrayFromJSON(ty, "[2, 1, null, 5, null, 6]"), + ArrayFromJSON(ty, "[3, 3, null, null, null, null]"), &options); + + CheckVectorUnary("cumulative_max", ArrayFromJSON(ty, "[null, 1, null, 5, null, 6]"), + ArrayFromJSON(ty, "[null, null, null, null, null, null]"), &options); + + CheckVectorUnary("cumulative_max", + ChunkedArrayFromJSON(ty, {"[2, 1, 3]", "[5, 4, 6]"}), + ChunkedArrayFromJSON(ty, {"[3, 3, 3, 5, 5, 6]"}), &options); + + CheckVectorUnary( + "cumulative_max", ChunkedArrayFromJSON(ty, {"[2, 1, null]", "[5, null, 6]"}), + ChunkedArrayFromJSON(ty, {"[3, 3, null, null, null, null]"}), &options); + + CheckVectorUnary( + "cumulative_max", ChunkedArrayFromJSON(ty, {"[null, 1, null]", "[5, null, 6]"}), + ChunkedArrayFromJSON(ty, {"[null, null, null, null, null, null]"}), &options); + } +} + +TEST(TestCumulativeMax, HasStartDoSkip) { + CumulativeOptions options(3.0, true); + for (auto ty : NumericTypes()) { + CheckVectorUnary("cumulative_max", ArrayFromJSON(ty, "[2, 1, 3, 5, 4, 6]"), + ArrayFromJSON(ty, "[3, 3, 3, 5, 5, 6]"), &options); + + CheckVectorUnary("cumulative_max", 
ArrayFromJSON(ty, "[2, 1, null, 5, null, 6]"), + ArrayFromJSON(ty, "[3, 3, null, 5, null, 6]"), &options); + + CheckVectorUnary("cumulative_max", ArrayFromJSON(ty, "[null, 1, null, 5, null, 6]"), + ArrayFromJSON(ty, "[null, 3, null, 5, null, 6]"), &options); + + CheckVectorUnary("cumulative_max", + ChunkedArrayFromJSON(ty, {"[2, 1, 3]", "[5, 4, 6]"}), + ChunkedArrayFromJSON(ty, {"[3, 3, 3, 5, 5, 6]"}), &options); + + CheckVectorUnary("cumulative_max", + ChunkedArrayFromJSON(ty, {"[2, 1, null]", "[5, null, 6]"}), + ChunkedArrayFromJSON(ty, {"[3, 3, null, 5, null, 6]"}), &options); + + CheckVectorUnary("cumulative_max", + ChunkedArrayFromJSON(ty, {"[null, 1, null]", "[5, null, 6]"}), + ChunkedArrayFromJSON(ty, {"[null, 3, null, 5, null, 6]"}), &options); + } +} + +TEST(TestCumulativeMax, NoStartDoSkip) { + CumulativeOptions options(true); + for (auto ty : NumericTypes()) { + CheckVectorUnary("cumulative_max", ArrayFromJSON(ty, "[2, 1, 3, 5, 4, 6]"), + ArrayFromJSON(ty, "[2, 2, 3, 5, 5, 6]"), &options); + + CheckVectorUnary("cumulative_max", ArrayFromJSON(ty, "[2, 1, null, 5, null, 6]"), + ArrayFromJSON(ty, "[2, 2, null, 5, null, 6]"), &options); + + CheckVectorUnary("cumulative_max", ArrayFromJSON(ty, "[null, 1, null, 5, null, 6]"), + ArrayFromJSON(ty, "[null, 1, null, 5, null, 6]"), &options); + + CheckVectorUnary("cumulative_max", + ChunkedArrayFromJSON(ty, {"[2, 1, 3]", "[5, 4, 6]"}), + ChunkedArrayFromJSON(ty, {"[2, 2, 3, 5, 5, 6]"}), &options); + + CheckVectorUnary("cumulative_max", + ChunkedArrayFromJSON(ty, {"[2, 1, null]", "[5, null, 6]"}), + ChunkedArrayFromJSON(ty, {"[2, 2, null, 5, null, 6]"}), &options); + + CheckVectorUnary("cumulative_max", + ChunkedArrayFromJSON(ty, {"[null, 1, null]", "[5, null, 6]"}), + ChunkedArrayFromJSON(ty, {"[null, 1, null, 5, null, 6]"}), &options); + } +} + +TEST(TestCumulativeMin, NoStartNoSkip) { + CumulativeOptions options; + for (auto ty : NumericTypes()) { + CheckVectorUnary("cumulative_min", ArrayFromJSON(ty, "[5, 6, 4, 
2, 3, 1]"), + ArrayFromJSON(ty, "[5, 5, 4, 2, 2, 1]"), &options); + + CheckVectorUnary("cumulative_min", ArrayFromJSON(ty, "[5, 6, null, 2, null, 1]"), + ArrayFromJSON(ty, "[5, 5, null, null, null, null]"), &options); + + CheckVectorUnary("cumulative_min", ArrayFromJSON(ty, "[null, 6, null, 2, null, 1]"), + ArrayFromJSON(ty, "[null, null, null, null, null, null]"), &options); + + CheckVectorUnary("cumulative_min", + ChunkedArrayFromJSON(ty, {"[5, 6, 4]", "[2, 3, 1]"}), + ChunkedArrayFromJSON(ty, {"[5, 5, 4, 2, 2, 1]"}), &options); + + CheckVectorUnary( + "cumulative_min", ChunkedArrayFromJSON(ty, {"[5, 6, null]", "[2, null, 1]"}), + ChunkedArrayFromJSON(ty, {"[5, 5, null, null, null, null]"}), &options); + + CheckVectorUnary( + "cumulative_min", ChunkedArrayFromJSON(ty, {"[null, 6, null]", "[2, null, 1]"}), + ChunkedArrayFromJSON(ty, {"[null, null, null, null, null, null]"}), &options); + } +} + +TEST(TestCumulativeMin, HasStartNoSkip) { + CumulativeOptions options(3.0); + for (auto ty : NumericTypes()) { + CheckVectorUnary("cumulative_min", ArrayFromJSON(ty, "[5, 6, 4, 2, 3, 1]"), + ArrayFromJSON(ty, "[3, 3, 3, 2, 2, 1]"), &options); + + CheckVectorUnary("cumulative_min", ArrayFromJSON(ty, "[5, 6, null, 2, null, 1]"), + ArrayFromJSON(ty, "[3, 3, null, null, null, null]"), &options); + + CheckVectorUnary("cumulative_min", ArrayFromJSON(ty, "[null, 6, null, 2, null, 1]"), + ArrayFromJSON(ty, "[null, null, null, null, null, null]"), &options); + + CheckVectorUnary("cumulative_min", + ChunkedArrayFromJSON(ty, {"[5, 6, 4]", "[2, 3, 1]"}), + ChunkedArrayFromJSON(ty, {"[3, 3, 3, 2, 2, 1]"}), &options); + + CheckVectorUnary( + "cumulative_min", ChunkedArrayFromJSON(ty, {"[5, 6, null]", "[2, null, 1]"}), + ChunkedArrayFromJSON(ty, {"[3, 3, null, null, null, null]"}), &options); + + CheckVectorUnary( + "cumulative_min", ChunkedArrayFromJSON(ty, {"[null, 6, null]", "[2, null, 1]"}), + ChunkedArrayFromJSON(ty, {"[null, null, null, null, null, null]"}), &options); + } +} + 
+TEST(TestCumulativeMin, HasStartDoSkip) { + CumulativeOptions options(3.0, true); + for (auto ty : NumericTypes()) { + CheckVectorUnary("cumulative_min", ArrayFromJSON(ty, "[5, 6, 4, 2, 3, 1]"), + ArrayFromJSON(ty, "[3, 3, 3, 2, 2, 1]"), &options); + + CheckVectorUnary("cumulative_min", ArrayFromJSON(ty, "[5, 6, null, 2, null, 1]"), + ArrayFromJSON(ty, "[3, 3, null, 2, null, 1]"), &options); + + CheckVectorUnary("cumulative_min", ArrayFromJSON(ty, "[null, 6, null, 2, null, 1]"), + ArrayFromJSON(ty, "[null, 3, null, 2, null, 1]"), &options); + + CheckVectorUnary("cumulative_min", + ChunkedArrayFromJSON(ty, {"[5, 6, 4]", "[2, 3, 1]"}), + ChunkedArrayFromJSON(ty, {"[3, 3, 3, 2, 2, 1]"}), &options); + + CheckVectorUnary("cumulative_min", + ChunkedArrayFromJSON(ty, {"[5, 6, null]", "[2, null, 1]"}), + ChunkedArrayFromJSON(ty, {"[3, 3, null, 2, null, 1]"}), &options); + + CheckVectorUnary("cumulative_min", + ChunkedArrayFromJSON(ty, {"[null, 6, null]", "[2, null, 1]"}), + ChunkedArrayFromJSON(ty, {"[null, 3, null, 2, null, 1]"}), &options); + } +} + +TEST(TestCumulativeMin, NoStartDoSkip) { + CumulativeOptions options(true); + for (auto ty : NumericTypes()) { + CheckVectorUnary("cumulative_min", ArrayFromJSON(ty, "[5, 6, 4, 2, 3, 1]"), + ArrayFromJSON(ty, "[5, 5, 4, 2, 2, 1]"), &options); + + CheckVectorUnary("cumulative_min", ArrayFromJSON(ty, "[5, 6, null, 2, null, 1]"), + ArrayFromJSON(ty, "[5, 5, null, 2, null, 1]"), &options); + + CheckVectorUnary("cumulative_min", ArrayFromJSON(ty, "[null, 6, null, 2, null, 1]"), + ArrayFromJSON(ty, "[null, 6, null, 2, null, 1]"), &options); + + CheckVectorUnary("cumulative_min", + ChunkedArrayFromJSON(ty, {"[5, 6, 4]", "[2, 3, 1]"}), + ChunkedArrayFromJSON(ty, {"[5, 5, 4, 2, 2, 1]"}), &options); + + CheckVectorUnary("cumulative_min", + ChunkedArrayFromJSON(ty, {"[5, 6, null]", "[2, null, 1]"}), + ChunkedArrayFromJSON(ty, {"[5, 5, null, 2, null, 1]"}), &options); + + CheckVectorUnary("cumulative_min", + ChunkedArrayFromJSON(ty, 
{"[null, 6, null]", "[2, null, 1]"}), + ChunkedArrayFromJSON(ty, {"[null, 6, null, 2, null, 1]"}), &options); } } TEST(TestCumulativeSum, ConvenienceFunctionCheckOverflow) { ASSERT_ARRAYS_EQUAL(*CumulativeSum(ArrayFromJSON(int8(), "[127, 1]"), - CumulativeSumOptions::Defaults(), false) + CumulativeOptions::Defaults(), false) ->make_array(), *ArrayFromJSON(int8(), "[127, -128]")); EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("overflow"), CumulativeSum(ArrayFromJSON(int8(), "[127, 1]"), - CumulativeSumOptions::Defaults(), true)); + CumulativeOptions::Defaults(), true)); +} + +TEST(TestCumulativeProd, ConvenienceFunctionCheckOverflow) { + ASSERT_ARRAYS_EQUAL(*CumulativeProd(ArrayFromJSON(int8(), "[-128, -1]"), + CumulativeOptions::Defaults(), false) + ->make_array(), + *ArrayFromJSON(int8(), "[-128, -128]")); + + EXPECT_RAISES_WITH_MESSAGE_THAT(Invalid, HasSubstr("overflow"), + CumulativeSum(ArrayFromJSON(int8(), "[-128, -1]"), + CumulativeOptions::Defaults(), true)); +} + +TEST(TestCumulativeMax, ConvenienceFunction) { + ASSERT_ARRAYS_EQUAL( + *CumulativeMax(ArrayFromJSON(int8(), "[1, 2, 3]"), CumulativeOptions::Defaults()) + ->make_array(), + *ArrayFromJSON(int8(), "[1, 2, 3]")); +} + +TEST(TestCumulativeMin, ConvenienceFunction) { + ASSERT_ARRAYS_EQUAL( + *CumulativeMin(ArrayFromJSON(int8(), "[-1, -2, -3]"), CumulativeOptions::Defaults()) + ->make_array(), + *ArrayFromJSON(int8(), "[-1, -2, -3]")); +} + +TEST(TestCumulative, NaN) { + // addition with NaN is always NaN + CheckVectorUnary("cumulative_sum", ArrayFromJSON(float64(), "[1, 2, NaN, 4, 5]"), + ArrayFromJSON(float64(), "[1, 3, NaN, NaN, NaN]")); + + // multiply with Nan is always NaN + CheckVectorUnary("cumulative_prod", ArrayFromJSON(float64(), "[1, 2, NaN, 4, 5]"), + ArrayFromJSON(float64(), "[1, 2, NaN, NaN, NaN]")); + + // max with NaN is always ignored because Nan > a always returns false + CheckVectorUnary("cumulative_max", ArrayFromJSON(float64(), "[1, 2, NaN, 4, 5]"), + 
ArrayFromJSON(float64(), "[1, 2, 2, 4, 5]")); + + // min with NaN is always ignored because Nan < a always returns false + CheckVectorUnary("cumulative_min", ArrayFromJSON(float64(), "[5, 4, NaN, 2, 1]"), + ArrayFromJSON(float64(), "[5, 4, 4, 2, 1]")); } } // namespace compute } // namespace arrow diff --git a/cpp/src/arrow/scalar.h b/cpp/src/arrow/scalar.h index d23b33e28f75c..0797306a67413 100644 --- a/cpp/src/arrow/scalar.h +++ b/cpp/src/arrow/scalar.h @@ -693,6 +693,9 @@ inline std::shared_ptr MakeScalar(std::string value) { return std::make_shared(std::move(value)); } +inline std::shared_ptr MakeScalar(const std::shared_ptr& scalar) { + return scalar; +} /// @} template diff --git a/docs/source/cpp/compute.rst b/docs/source/cpp/compute.rst index 7a8aa67a0d7f1..70c17ae2b96ea 100644 --- a/docs/source/cpp/compute.rst +++ b/docs/source/cpp/compute.rst @@ -1613,28 +1613,39 @@ Array-wise ("vector") functions Cumulative Functions ~~~~~~~~~~~~~~~~~~~~ -Cumulative functions are vector functions that perform a running total on their -input using a given binary associative operation and output an array containing -the corresponding intermediate running values. The input is expected to be of -numeric type. By default these functions do not detect overflow. They are also -available in an overflow-checking variant, suffixed ``_checked``, which returns -an ``Invalid`` :class:`Status` when overflow is detected. +Cumulative functions are vector functions that perform a running accumulation on +their input using a given binary associative operation with an identity element +(a monoid) and output an array containing the corresponding intermediate running +values. The input is expected to be of numeric type. By default these functions +do not detect overflow. They are also available in an overflow-checking variant, +suffixed ``_checked``, which returns an ``Invalid`` :class:`Status` when +overflow is detected.
+------------------------+-------+-------------+-------------+--------------------------------+-------+ -| Function name | Arity | Input types | Output type | Options class | Notes | -+========================+=======+=============+=============+================================+=======+ -| cumulative_sum | Unary | Numeric | Numeric | :struct:`CumulativeSumOptions` | \(1) | -+------------------------+-------+-------------+-------------+--------------------------------+-------+ -| cumulative_sum_checked | Unary | Numeric | Numeric | :struct:`CumulativeSumOptions` | \(1) | -+------------------------+-------+-------------+-------------+--------------------------------+-------+ - -* \(1) CumulativeSumOptions has two optional parameters. The first parameter - :member:`CumulativeSumOptions::start` is a starting value for the running - sum. It has a default value of 0. Specified values of ``start`` must have the - same type as the input. The second parameter - :member:`CumulativeSumOptions::skip_nulls` is a boolean. 
When set to +| Function name | Arity | Input types | Output type | Options class | Notes | ++=========================+=======+=============+=============+================================+=======+ +| cumulative_sum | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | ++-------------------------+-------+-------------+-------------+--------------------------------+-------+ +| cumulative_sum_checked | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | ++-------------------------+-------+-------------+-------------+--------------------------------+-------+ +| cumulative_prod | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | ++-------------------------+-------+-------------+-------------+--------------------------------+-------+ +| cumulative_prod_checked | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | ++-------------------------+-------+-------------+-------------+--------------------------------+-------+ +| cumulative_max | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | ++-------------------------+-------+-------------+-------------+--------------------------------+-------+ +| cumulative_min | Unary | Numeric | Numeric | :struct:`CumulativeOptions` | \(1) | ++-------------------------+-------+-------------+-------------+--------------------------------+-------+ + +* \(1) CumulativeOptions has two optional parameters. The first parameter + :member:`CumulativeOptions::start` is a starting value for the running + accumulation. It has a default value of 0 for `sum`, 1 for `prod`, min of + input type for `max`, and max of input type for `min`. Specified values of + ``start`` must be castable to the input type. The second parameter + :member:`CumulativeOptions::skip_nulls` is a boolean. When set to false (the default), the first encountered null is propagated. When set to - true, each null in the input produces a corresponding null in the output. 
+ true, each null in the input produces a corresponding null in the output and + doesn't affect the accumulation forward. Associative transforms ~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/python/api/compute.rst b/docs/source/python/api/compute.rst index c04652e79cdab..43deedd653425 100644 --- a/docs/source/python/api/compute.rst +++ b/docs/source/python/api/compute.rst @@ -52,9 +52,11 @@ Aggregations Cumulative Functions -------------------- -Cumulative functions are vector functions that perform a running total on their -input and output an array containing the corresponding intermediate running values. -By default these functions do not detect overflow. They are also +Cumulative functions are vector functions that perform a running accumulation on +their input using a given binary associative operation with an identity element +(a monoid) and output an array containing the corresponding intermediate running +values. The input is expected to be of numeric type. By default these functions +do not detect overflow. They are also available in an overflow-checking variant, suffixed ``_checked``, which throws an ``ArrowInvalid`` exception when overflow is detected. @@ -63,6 +65,10 @@ throws an ``ArrowInvalid`` exception when overflow is detected.
cumulative_sum cumulative_sum_checked + cumulative_prod + cumulative_prod_checked + cumulative_max + cumulative_min Arithmetic Functions -------------------- diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index a5db5be551456..0faeac89ed156 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -1928,31 +1928,37 @@ class PartitionNthOptions(_PartitionNthOptions): self._set_options(pivot, null_placement) -cdef class _CumulativeSumOptions(FunctionOptions): +cdef class _CumulativeOptions(FunctionOptions): def _set_options(self, start, skip_nulls): - if not isinstance(start, Scalar): + if start is None: + self.wrapped.reset(new CCumulativeOptions(skip_nulls)) + elif isinstance(start, Scalar): + self.wrapped.reset(new CCumulativeOptions( + pyarrow_unwrap_scalar(start), skip_nulls)) + else: try: start = lib.scalar(start) + self.wrapped.reset(new CCumulativeOptions( + pyarrow_unwrap_scalar(start), skip_nulls)) except Exception: _raise_invalid_function_option( start, "`start` type for CumulativeSumOptions", TypeError) - self.wrapped.reset(new CCumulativeSumOptions(( start).unwrap(), skip_nulls)) - -class CumulativeSumOptions(_CumulativeSumOptions): +class CumulativeOptions(_CumulativeOptions): """ - Options for `cumulative_sum` function. + Options for `cumulative` functions. Parameters ---------- - start : Scalar, default 0.0 - Starting value for sum computation + start : Scalar, default None + Starting value for the cumulative operation. If none is given, + a default value depending on the operation and input type is used. skip_nulls : bool, default False When false, the first encountered null is propagated. 
""" - def __init__(self, start=0.0, *, skip_nulls=False): + def __init__(self, start=None, *, skip_nulls=False): self._set_options(start, skip_nulls) diff --git a/python/pyarrow/compute.py b/python/pyarrow/compute.py index e299d44c04e16..65073725fe109 100644 --- a/python/pyarrow/compute.py +++ b/python/pyarrow/compute.py @@ -33,7 +33,7 @@ AssumeTimezoneOptions, CastOptions, CountOptions, - CumulativeSumOptions, + CumulativeOptions, DayOfWeekOptions, DictionaryEncodeOptions, RunEndEncodeOptions, diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 3190877ea0997..80df0106912d7 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -2400,10 +2400,11 @@ cdef extern from "arrow/compute/api.h" namespace "arrow::compute" nogil: int64_t pivot CNullPlacement null_placement - cdef cppclass CCumulativeSumOptions \ - "arrow::compute::CumulativeSumOptions"(CFunctionOptions): - CCumulativeSumOptions(shared_ptr[CScalar] start, c_bool skip_nulls) - shared_ptr[CScalar] start + cdef cppclass CCumulativeOptions \ + "arrow::compute::CumulativeOptions"(CFunctionOptions): + CCumulativeOptions(c_bool skip_nulls) + CCumulativeOptions(shared_ptr[CScalar] start, c_bool skip_nulls) + optional[shared_ptr[CScalar]] start c_bool skip_nulls cdef cppclass CArraySortOptions \ diff --git a/python/pyarrow/tests/test_compute.py b/python/pyarrow/tests/test_compute.py index 875d0e613b6ca..79acd4b140c12 100644 --- a/python/pyarrow/tests/test_compute.py +++ b/python/pyarrow/tests/test_compute.py @@ -151,7 +151,7 @@ def test_option_class_equality(): pc.NullOptions(), pc.PadOptions(5), pc.PartitionNthOptions(1, null_placement="at_start"), - pc.CumulativeSumOptions(start=0, skip_nulls=False), + pc.CumulativeOptions(start=None, skip_nulls=False), pc.QuantileOptions(), pc.RandomOptions(), pc.RankOptions(sort_keys="ascending", @@ -2660,7 +2660,7 @@ def test_min_max_element_wise(): def test_cumulative_sum(start, skip_nulls): # 
Exact tests (e.g., integral types) start_int = int(start) - starts = [start_int, pa.scalar(start_int, type=pa.int8()), + starts = [None, start_int, pa.scalar(start_int, type=pa.int8()), pa.scalar(start_int, type=pa.int64())] for strt in starts: arrays = [ @@ -2678,10 +2678,10 @@ def test_cumulative_sum(start, skip_nulls): for i, arr in enumerate(arrays): result = pc.cumulative_sum(arr, start=strt, skip_nulls=skip_nulls) # Add `start` offset to expected array before comparing - expected = pc.add(expected_arrays[i], strt) + expected = pc.add(expected_arrays[i], strt if strt is not None else 0) assert result.equals(expected) - starts = [start, pa.scalar(start, type=pa.float32()), + starts = [None, start, pa.scalar(start, type=pa.float32()), pa.scalar(start, type=pa.float64())] for strt in starts: arrays = [ @@ -2698,7 +2698,7 @@ def test_cumulative_sum(start, skip_nulls): for i, arr in enumerate(arrays): result = pc.cumulative_sum(arr, start=strt, skip_nulls=skip_nulls) # Add `start` offset to expected array before comparing - expected = pc.add(expected_arrays[i], strt) + expected = pc.add(expected_arrays[i], strt if strt is not None else 0) np.testing.assert_array_almost_equal(result.to_numpy( zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) @@ -2707,6 +2707,170 @@ def test_cumulative_sum(start, skip_nulls): pc.cumulative_sum([1, 2, 3], start=strt) +@pytest.mark.parametrize('start', (1.25, 10.5, -10.5)) +@pytest.mark.parametrize('skip_nulls', (True, False)) +def test_cumulative_prod(start, skip_nulls): + # Exact tests (e.g., integral types) + start_int = int(start) + starts = [None, start_int, pa.scalar(start_int, type=pa.int8()), + pa.scalar(start_int, type=pa.int64())] + for strt in starts: + arrays = [ + pa.array([1, 2, 3]), + pa.array([1, None, 20, 5]), + pa.chunked_array([[1, None], [20, 5]]) + ] + expected_arrays = [ + pa.array([1, 2, 6]), + pa.array([1, None, 20, 100]) + if skip_nulls else pa.array([1, None, None, None]), + pa.chunked_array([[1, 
None, 20, 100]]) + if skip_nulls else pa.chunked_array([[1, None, None, None]]) + ] + for i, arr in enumerate(arrays): + result = pc.cumulative_prod(arr, start=strt, skip_nulls=skip_nulls) + # Multiply `start` offset to expected array before comparing + expected = pc.multiply(expected_arrays[i], strt if strt is not None else 1) + assert result.equals(expected) + + starts = [None, start, pa.scalar(start, type=pa.float32()), + pa.scalar(start, type=pa.float64())] + for strt in starts: + arrays = [ + pa.array([1.5, 2.5, 3.5]), + pa.array([1, np.nan, 2, -3, 4, 5]), + pa.array([1, np.nan, None, 3, None, 5]) + ] + expected_arrays = [ + np.array([1.5, 3.75, 13.125]), + np.array([1, np.nan, np.nan, np.nan, np.nan, np.nan]), + np.array([1, np.nan, None, np.nan, None, np.nan]) + if skip_nulls else np.array([1, np.nan, None, None, None, None]) + ] + for i, arr in enumerate(arrays): + result = pc.cumulative_prod(arr, start=strt, skip_nulls=skip_nulls) + # Multiply `start` offset to expected array before comparing + expected = pc.multiply(expected_arrays[i], strt if strt is not None else 1) + np.testing.assert_array_almost_equal(result.to_numpy( + zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) + + for strt in ['a', pa.scalar('arrow'), 1.1]: + with pytest.raises(pa.ArrowInvalid): + pc.cumulative_prod([1, 2, 3], start=strt) + + +@pytest.mark.parametrize('start', (0.5, 3.5, 6.5)) +@pytest.mark.parametrize('skip_nulls', (True, False)) +def test_cumulative_max(start, skip_nulls): + # Exact tests (e.g., integral types) + start_int = int(start) + starts = [None, start_int, pa.scalar(start_int, type=pa.int8()), + pa.scalar(start_int, type=pa.int64())] + for strt in starts: + arrays = [ + pa.array([2, 1, 3, 5, 4, 6]), + pa.array([2, 1, None, 5, 4, None]), + pa.chunked_array([[2, 1, None], [5, 4, None]]) + ] + expected_arrays = [ + pa.array([2, 2, 3, 5, 5, 6]), + pa.array([2, 2, None, 5, 5, None]) + if skip_nulls else pa.array([2, 2, None, None, None, None]), + 
pa.chunked_array([[2, 2, None, 5, 5, None]]) + if skip_nulls else + pa.chunked_array([[2, 2, None, None, None, None]]) + ] + for i, arr in enumerate(arrays): + result = pc.cumulative_max(arr, start=strt, skip_nulls=skip_nulls) + # Max `start` offset with expected array before comparing + expected = pc.max_element_wise( + expected_arrays[i], strt if strt is not None else int(-1e9), + skip_nulls=False) + assert result.equals(expected) + + starts = [None, start, pa.scalar(start, type=pa.float32()), + pa.scalar(start, type=pa.float64())] + for strt in starts: + arrays = [ + pa.array([2.5, 1.3, 3.7, 5.1, 4.9, 6.2]), + pa.array([2.5, 1.3, 3.7, np.nan, 4.9, 6.2]), + pa.array([2.5, 1.3, None, np.nan, 4.9, None]) + ] + expected_arrays = [ + np.array([2.5, 2.5, 3.7, 5.1, 5.1, 6.2]), + np.array([2.5, 2.5, 3.7, 3.7, 4.9, 6.2]), + np.array([2.5, 2.5, None, 2.5, 4.9, None]) + if skip_nulls else np.array([2.5, 2.5, None, None, None, None]) + ] + for i, arr in enumerate(arrays): + result = pc.cumulative_max(arr, start=strt, skip_nulls=skip_nulls) + # Max `start` offset with expected array before comparing + expected = pc.max_element_wise( + expected_arrays[i], strt if strt is not None else -1e9, skip_nulls=False) + np.testing.assert_array_almost_equal(result.to_numpy( + zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) + + for strt in ['a', pa.scalar('arrow'), 1.1]: + with pytest.raises(pa.ArrowInvalid): + pc.cumulative_max([1, 2, 3], start=strt) + + +@pytest.mark.parametrize('start', (0.5, 3.5, 6.5)) +@pytest.mark.parametrize('skip_nulls', (True, False)) +def test_cumulative_min(start, skip_nulls): + # Exact tests (e.g., integral types) + start_int = int(start) + starts = [None, start_int, pa.scalar(start_int, type=pa.int8()), + pa.scalar(start_int, type=pa.int64())] + for strt in starts: + arrays = [ + pa.array([5, 6, 4, 2, 3, 1]), + pa.array([5, 6, None, 2, 3, None]), + pa.chunked_array([[5, 6, None], [2, 3, None]]) + ] + expected_arrays = [ + pa.array([5, 5, 4, 
2, 2, 1]), + pa.array([5, 5, None, 2, 2, None]) + if skip_nulls else pa.array([5, 5, None, None, None, None]), + pa.chunked_array([[5, 5, None, 2, 2, None]]) + if skip_nulls else + pa.chunked_array([[5, 5, None, None, None, None]]) + ] + for i, arr in enumerate(arrays): + result = pc.cumulative_min(arr, start=strt, skip_nulls=skip_nulls) + # Min `start` offset with expected array before comparing + expected = pc.min_element_wise( + expected_arrays[i], strt if strt is not None else int(1e9), + skip_nulls=False) + assert result.equals(expected) + + starts = [None, start, pa.scalar(start, type=pa.float32()), + pa.scalar(start, type=pa.float64())] + for strt in starts: + arrays = [ + pa.array([5.5, 6.3, 4.7, 2.1, 3.9, 1.2]), + pa.array([5.5, 6.3, 4.7, np.nan, 3.9, 1.2]), + pa.array([5.5, 6.3, None, np.nan, 3.9, None]) + ] + expected_arrays = [ + np.array([5.5, 5.5, 4.7, 2.1, 2.1, 1.2]), + np.array([5.5, 5.5, 4.7, 4.7, 3.9, 1.2]), + np.array([5.5, 5.5, None, 5.5, 3.9, None]) + if skip_nulls else np.array([5.5, 5.5, None, None, None, None]) + ] + for i, arr in enumerate(arrays): + result = pc.cumulative_min(arr, start=strt, skip_nulls=skip_nulls) + # Min `start` offset with expected array before comparing + expected = pc.min_element_wise( + expected_arrays[i], strt if strt is not None else 1e9, skip_nulls=False) + np.testing.assert_array_almost_equal(result.to_numpy( + zero_copy_only=False), expected.to_numpy(zero_copy_only=False)) + + for strt in ['a', pa.scalar('arrow'), 1.1]: + with pytest.raises(pa.ArrowInvalid): + pc.cumulative_min([1, 2, 3], start=strt) + + def test_make_struct(): assert pc.make_struct(1, 'a').as_py() == {'0': 1, '1': 'a'}