From 772a01c080ad57eb11e9323f5347472b769d45de Mon Sep 17 00:00:00 2001 From: Junming Chen Date: Sat, 23 Sep 2023 00:40:31 +0800 Subject: [PATCH] GH-36420: [C++] Add An Enum Option For SetLookup Options (#36739) ### Rationale for this change As #36420 says, we want to add an SQL-compatible `is_in` variant, which handles nulls with different logic. After a discussion with @ ianmcook and @ bkietz, we decided to support an enum option `null_matching_behavior` for SetLookup, which actually adds two null-handling semantics for `is_in` and doesn't add any new behavior for `index_in`. The enum option `null_matching_behavior` will replace `skip_nulls` in the future. ### What changes are included in this PR? Add an enum parameter `null_matching_behavior` for SetLookupOptions. ### Are these changes tested? Two kinds of tests are implemented - Replace default parameter with `null_matching_behavior` instead of `skip_nulls` for `is_in` and `index_in` tests - Add tests for `NullMatchingBehavior::EMIT_NULL` and `NullMatchingBehavior::INCONCLUSIVE` for `is_in` Besides, since `skip_nulls` is not deprecated yet, I still preserve the old tests with `skip_nulls`. Once `skip_nulls` is fully deprecated, we can replace the test parameter `skip_nulls=false` with `null_matching_behavior=MATCH` and `skip_nulls=true` with `null_matching_behavior=SKIP` for these old tests. ### Are there any user-facing changes? No. Currently we support backward compatibility. In the future, we plan to replace `skip_nulls` with `null_matching_behavior` completely. 
* Closes: #36420 Lead-authored-by: Junming Chen Co-authored-by: Sutou Kouhei Co-authored-by: Benjamin Kietzman Signed-off-by: Benjamin Kietzman --- c_glib/arrow-glib/compute.cpp | 17 +- cpp/src/arrow/compute/api_scalar.cc | 52 +- cpp/src/arrow/compute/api_scalar.h | 34 +- cpp/src/arrow/compute/expression_test.cc | 5 +- .../compute/kernels/scalar_set_lookup.cc | 104 ++- .../compute/kernels/scalar_set_lookup_test.cc | 756 +++++++++++++++++- cpp/src/arrow/util/reflection_internal.h | 24 + python/pyarrow/_compute.pyx | 2 +- 8 files changed, 942 insertions(+), 52 deletions(-) diff --git a/c_glib/arrow-glib/compute.cpp b/c_glib/arrow-glib/compute.cpp index 7fe005f94a5bb..9692f277d183f 100644 --- a/c_glib/arrow-glib/compute.cpp +++ b/c_glib/arrow-glib/compute.cpp @@ -3346,7 +3346,7 @@ garrow_set_lookup_options_get_property(GObject *object, g_value_set_object(value, priv->value_set); break; case PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS: - g_value_set_boolean(value, options->skip_nulls); + g_value_set_boolean(value, options->skip_nulls.has_value() && options->skip_nulls.value()); break; default: G_OBJECT_WARN_INVALID_PROPERTY_ID(object, prop_id, pspec); @@ -3398,13 +3398,11 @@ garrow_set_lookup_options_class_init(GArrowSetLookupOptionsClass *klass) * * Since: 6.0.0 */ - spec = g_param_spec_boolean("skip-nulls", - "Skip NULLs", - "Whether NULLs are skipped or not", - options.skip_nulls, - static_cast(G_PARAM_READWRITE)); - g_object_class_install_property(gobject_class, - PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS, + auto skip_nulls = (options.skip_nulls.has_value() && options.skip_nulls.value()); + spec = + g_param_spec_boolean("skip-nulls", "Skip NULLs", "Whether NULLs are skipped or not", + skip_nulls, static_cast(G_PARAM_READWRITE)); + g_object_class_install_property(gobject_class, PROP_SET_LOOKUP_OPTIONS_SKIP_NULLS, spec); } @@ -6458,9 +6456,10 @@ garrow_set_lookup_options_new_raw( arrow_copied_options.get()); auto value_set = 
garrow_datum_new_raw(&(arrow_copied_set_lookup_options->value_set)); + auto skip_nulls = (arrow_options->skip_nulls.has_value() && arrow_options->skip_nulls.value()); auto options = g_object_new(GARROW_TYPE_SET_LOOKUP_OPTIONS, "value-set", value_set, - "skip-nulls", arrow_options->skip_nulls, + "skip-nulls", skip_nulls, NULL); return GARROW_SET_LOOKUP_OPTIONS(options); } diff --git a/cpp/src/arrow/compute/api_scalar.cc b/cpp/src/arrow/compute/api_scalar.cc index d7a61d0a55985..eaec940556361 100644 --- a/cpp/src/arrow/compute/api_scalar.cc +++ b/cpp/src/arrow/compute/api_scalar.cc @@ -275,6 +275,29 @@ struct EnumTraits } }; +template <> +struct EnumTraits + : BasicEnumTraits { + static std::string name() { return "SetLookupOptions::NullMatchingBehavior"; } + static std::string value_name(compute::SetLookupOptions::NullMatchingBehavior value) { + switch (value) { + case compute::SetLookupOptions::NullMatchingBehavior::MATCH: + return "MATCH"; + case compute::SetLookupOptions::NullMatchingBehavior::SKIP: + return "SKIP"; + case compute::SetLookupOptions::NullMatchingBehavior::EMIT_NULL: + return "EMIT_NULL"; + case compute::SetLookupOptions::NullMatchingBehavior::INCONCLUSIVE: + return "INCONCLUSIVE"; + } + return ""; + } +}; + } // namespace internal namespace compute { @@ -286,6 +309,7 @@ using ::arrow::internal::checked_cast; namespace internal { namespace { +using ::arrow::internal::CoercedDataMember; using ::arrow::internal::DataMember; static auto kArithmeticOptionsType = GetFunctionOptionsType( DataMember("check_overflow", &ArithmeticOptions::check_overflow)); @@ -344,7 +368,8 @@ static auto kRoundToMultipleOptionsType = GetFunctionOptionsType( DataMember("value_set", &SetLookupOptions::value_set), - DataMember("skip_nulls", &SetLookupOptions::skip_nulls)); + CoercedDataMember("null_matching_behavior", &SetLookupOptions::null_matching_behavior, + &SetLookupOptions::GetNullMatchingBehavior)); static auto kSliceOptionsType = GetFunctionOptionsType( 
DataMember("start", &SliceOptions::start), DataMember("stop", &SliceOptions::stop), DataMember("step", &SliceOptions::step)); @@ -540,8 +565,29 @@ constexpr char RoundToMultipleOptions::kTypeName[]; SetLookupOptions::SetLookupOptions(Datum value_set, bool skip_nulls) : FunctionOptions(internal::kSetLookupOptionsType), value_set(std::move(value_set)), - skip_nulls(skip_nulls) {} -SetLookupOptions::SetLookupOptions() : SetLookupOptions({}, false) {} + skip_nulls(skip_nulls) { + if (skip_nulls) { + this->null_matching_behavior = SetLookupOptions::SKIP; + } else { + this->null_matching_behavior = SetLookupOptions::MATCH; + } +} +SetLookupOptions::SetLookupOptions( + Datum value_set, SetLookupOptions::NullMatchingBehavior null_matching_behavior) + : FunctionOptions(internal::kSetLookupOptionsType), + value_set(std::move(value_set)), + null_matching_behavior(std::move(null_matching_behavior)) {} +SetLookupOptions::SetLookupOptions() + : SetLookupOptions({}, SetLookupOptions::NullMatchingBehavior::MATCH) {} +SetLookupOptions::NullMatchingBehavior SetLookupOptions::GetNullMatchingBehavior() const { + if (!this->skip_nulls.has_value()) { + return this->null_matching_behavior; + } else if (this->skip_nulls.value()) { + return SetLookupOptions::SKIP; + } else { + return SetLookupOptions::MATCH; + } +} constexpr char SetLookupOptions::kTypeName[]; SliceOptions::SliceOptions(int64_t start, int64_t stop, int64_t step) diff --git a/cpp/src/arrow/compute/api_scalar.h b/cpp/src/arrow/compute/api_scalar.h index 0a06a2829f0da..9f12471ddca14 100644 --- a/cpp/src/arrow/compute/api_scalar.h +++ b/cpp/src/arrow/compute/api_scalar.h @@ -268,19 +268,49 @@ class ARROW_EXPORT ExtractRegexOptions : public FunctionOptions { /// Options for IsIn and IndexIn functions class ARROW_EXPORT SetLookupOptions : public FunctionOptions { public: - explicit SetLookupOptions(Datum value_set, bool skip_nulls = false); + /// How to handle null values. 
+ enum NullMatchingBehavior { + /// MATCH, any null in `value_set` is successfully matched in + /// the input. + MATCH, + /// SKIP, any null in `value_set` is ignored and nulls in the input + /// produce null (IndexIn) or false (IsIn) values in the output. + SKIP, + /// EMIT_NULL, any null in `value_set` is ignored and nulls in the + /// input produce null (IndexIn and IsIn) values in the output. + EMIT_NULL, + /// INCONCLUSIVE, null values are regarded as unknown values, which is + /// sql-compatible. nulls in the input produce null (IndexIn and IsIn) + /// values in the output. Besides, if `value_set` contains a null, + /// non-null unmatched values in the input also produce null values + /// (IndexIn and IsIn) in the output. + INCONCLUSIVE + }; + + explicit SetLookupOptions(Datum value_set, NullMatchingBehavior = MATCH); SetLookupOptions(); + + // DEPRECATED(will be removed after removing of skip_nulls) + explicit SetLookupOptions(Datum value_set, bool skip_nulls); + static constexpr char const kTypeName[] = "SetLookupOptions"; /// The set of values to look up input values into. Datum value_set; + + NullMatchingBehavior null_matching_behavior; + + // DEPRECATED(will be removed after removing of skip_nulls) + NullMatchingBehavior GetNullMatchingBehavior() const; + + // DEPRECATED(use null_matching_behavior instead) /// Whether nulls in `value_set` count for lookup. /// /// If true, any null in `value_set` is ignored and nulls in the input /// produce null (IndexIn) or false (IsIn) values in the output. /// If false, any null in `value_set` is successfully matched in /// the input. 
- bool skip_nulls; + std::optional skip_nulls; }; /// Options for struct_field function diff --git a/cpp/src/arrow/compute/expression_test.cc b/cpp/src/arrow/compute/expression_test.cc index b852f6f6b0cdb..44159e76600fb 100644 --- a/cpp/src/arrow/compute/expression_test.cc +++ b/cpp/src/arrow/compute/expression_test.cc @@ -263,8 +263,9 @@ TEST(Expression, ToString) { auto in_12 = call("index_in", {field_ref("beta")}, compute::SetLookupOptions{ArrayFromJSON(int32(), "[1,2]")}); - EXPECT_EQ(in_12.ToString(), - "index_in(beta, {value_set=int32:[\n 1,\n 2\n], skip_nulls=false})"); + EXPECT_EQ( + in_12.ToString(), + "index_in(beta, {value_set=int32:[\n 1,\n 2\n], null_matching_behavior=MATCH})"); EXPECT_EQ(and_(field_ref("a"), field_ref("b")).ToString(), "(a and b)"); EXPECT_EQ(or_(field_ref("a"), field_ref("b")).ToString(), "(a or b)"); diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc index 00d391653d240..e2d5583e36e6b 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup.cc @@ -44,6 +44,7 @@ struct SetLookupState : public SetLookupStateBase { explicit SetLookupState(MemoryPool* pool) : memory_pool(pool) {} Status Init(const SetLookupOptions& options) { + this->null_matching_behavior = options.GetNullMatchingBehavior(); if (options.value_set.is_array()) { const ArrayData& value_set = *options.value_set.array(); memo_index_to_value_index.reserve(value_set.length); @@ -66,7 +67,8 @@ struct SetLookupState : public SetLookupStateBase { } else { return Status::Invalid("value_set should be an array or chunked array"); } - if (!options.skip_nulls && lookup_table->GetNull() >= 0) { + if (this->null_matching_behavior != SetLookupOptions::SKIP && + lookup_table->GetNull() >= 0) { null_index = memo_index_to_value_index[lookup_table->GetNull()]; } value_set_type = options.value_set.type(); @@ -117,19 +119,23 @@ struct SetLookupState : public 
SetLookupStateBase { // be mapped back to indices in the value_set. std::vector memo_index_to_value_index; int32_t null_index = -1; + SetLookupOptions::NullMatchingBehavior null_matching_behavior; }; template <> struct SetLookupState : public SetLookupStateBase { explicit SetLookupState(MemoryPool*) {} - Status Init(const SetLookupOptions& options) { - value_set_has_null = (options.value_set.length() > 0) && !options.skip_nulls; + Status Init(SetLookupOptions& options) { + null_matching_behavior = options.GetNullMatchingBehavior(); + value_set_has_null = (options.value_set.length() > 0) && + this->null_matching_behavior != SetLookupOptions::SKIP; value_set_type = null(); return Status::OK(); } bool value_set_has_null; + SetLookupOptions::NullMatchingBehavior null_matching_behavior; }; // TODO: Put this concept somewhere reusable @@ -270,14 +276,20 @@ struct IndexInVisitor { : ctx(ctx), data(data), out(out), out_bitmap(out->buffers[0].data) {} Status Visit(const DataType& type) { - DCHECK_EQ(type.id(), Type::NA); + DCHECK(false) << "IndexIn " << type; + return Status::NotImplemented("IndexIn has no implementation with value type ", type); + } + + Status Visit(const NullType&) { const auto& state = checked_cast&>(*ctx->state()); if (data.length != 0) { - // skip_nulls is honored for consistency with other types - bit_util::SetBitsTo(out_bitmap, out->offset, out->length, state.value_set_has_null); + bit_util::SetBitsTo(out_bitmap, out->offset, out->length, + state.null_matching_behavior == SetLookupOptions::MATCH && + state.value_set_has_null); // Set all values to 0, which will be unmasked only if null is in the value_set + // and null_matching_behavior is equal to MATCH std::memset(out->GetValues(1), 0x00, out->length * sizeof(int32_t)); } return Status::OK(); @@ -305,7 +317,8 @@ struct IndexInVisitor { bitmap_writer.Next(); }, [&]() { - if (state.null_index != -1) { + if (state.null_index != -1 && + state.null_matching_behavior == SetLookupOptions::MATCH) { 
bitmap_writer.Set(); // value_set included null @@ -379,49 +392,86 @@ Status ExecIndexIn(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { return IndexInVisitor(ctx, batch[0].array, out->array_span_mutable()).Execute(); } -// ---------------------------------------------------------------------- - // IsIn writes the results into a preallocated boolean data bitmap struct IsInVisitor { KernelContext* ctx; const ArraySpan& data; ArraySpan* out; + uint8_t* out_boolean_bitmap; + uint8_t* out_null_bitmap; IsInVisitor(KernelContext* ctx, const ArraySpan& data, ArraySpan* out) - : ctx(ctx), data(data), out(out) {} + : ctx(ctx), + data(data), + out(out), + out_boolean_bitmap(out->buffers[1].data), + out_null_bitmap(out->buffers[0].data) {} Status Visit(const DataType& type) { - DCHECK_EQ(type.id(), Type::NA); + DCHECK(false) << "IndexIn " << type; + return Status::NotImplemented("IsIn has no implementation with value type ", type); + } + + Status Visit(const NullType&) { const auto& state = checked_cast&>(*ctx->state()); - // skip_nulls is honored for consistency with other types - bit_util::SetBitsTo(out->buffers[1].data, out->offset, out->length, - state.value_set_has_null); + + if (state.null_matching_behavior == SetLookupOptions::MATCH && + state.value_set_has_null) { + bit_util::SetBitsTo(out_boolean_bitmap, out->offset, out->length, true); + bit_util::SetBitsTo(out_null_bitmap, out->offset, out->length, true); + } else if (state.null_matching_behavior == SetLookupOptions::SKIP || + (!state.value_set_has_null && + state.null_matching_behavior == SetLookupOptions::MATCH)) { + bit_util::SetBitsTo(out_boolean_bitmap, out->offset, out->length, false); + bit_util::SetBitsTo(out_null_bitmap, out->offset, out->length, true); + } else { + bit_util::SetBitsTo(out_null_bitmap, out->offset, out->length, false); + } return Status::OK(); } template Status ProcessIsIn(const SetLookupState& state, const ArraySpan& input) { using T = typename GetViewType::T; - 
FirstTimeBitmapWriter writer(out->buffers[1].data, out->offset, out->length); + FirstTimeBitmapWriter writer_boolean(out_boolean_bitmap, out->offset, out->length); + FirstTimeBitmapWriter writer_null(out_null_bitmap, out->offset, out->length); + bool value_set_has_null = state.null_index != -1; VisitArraySpanInline( input, [&](T v) { - if (state.lookup_table->Get(v) != -1) { - writer.Set(); - } else { - writer.Clear(); + if (state.lookup_table->Get(v) != -1) { // true + writer_boolean.Set(); + writer_null.Set(); + } else if (state.null_matching_behavior == SetLookupOptions::INCONCLUSIVE && + value_set_has_null) { // null + writer_boolean.Clear(); + writer_null.Clear(); + } else { // false + writer_boolean.Clear(); + writer_null.Set(); } - writer.Next(); + writer_boolean.Next(); + writer_null.Next(); }, [&]() { - if (state.null_index != -1) { - writer.Set(); - } else { - writer.Clear(); + if (state.null_matching_behavior == SetLookupOptions::MATCH && + value_set_has_null) { // true + writer_boolean.Set(); + writer_null.Set(); + } else if (state.null_matching_behavior == SetLookupOptions::SKIP || + (!value_set_has_null && state.null_matching_behavior == + SetLookupOptions::MATCH)) { // false + writer_boolean.Clear(); + writer_null.Set(); + } else { // null + writer_boolean.Clear(); + writer_null.Clear(); } - writer.Next(); + writer_boolean.Next(); + writer_null.Next(); }); - writer.Finish(); + writer_boolean.Finish(); + writer_null.Finish(); return Status::OK(); } @@ -598,7 +648,7 @@ void RegisterScalarSetLookup(FunctionRegistry* registry) { ScalarKernel isin_base; isin_base.init = InitSetLookup; isin_base.exec = ExecIsIn; - isin_base.null_handling = NullHandling::OUTPUT_NOT_NULL; + isin_base.null_handling = NullHandling::COMPUTED_PREALLOCATE; auto is_in = std::make_shared("is_in", Arity::Unary(), is_in_doc); AddBasicSetLookupKernels(isin_base, /*output_type=*/boolean(), is_in.get()); diff --git a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc 
b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc index d1645eb8d9a49..89e10d1b54103 100644 --- a/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc +++ b/cpp/src/arrow/compute/kernels/scalar_set_lookup_test.cc @@ -50,7 +50,67 @@ namespace compute { void CheckIsIn(const std::shared_ptr input, const std::shared_ptr& value_set, const std::string& expected_json, - bool skip_nulls = false) { + SetLookupOptions::NullMatchingBehavior null_matching_behavior = + SetLookupOptions::MATCH) { + auto expected = ArrayFromJSON(boolean(), expected_json); + + ASSERT_OK_AND_ASSIGN(Datum actual_datum, + IsIn(input, SetLookupOptions(value_set, null_matching_behavior))); + std::shared_ptr actual = actual_datum.make_array(); + ValidateOutput(actual_datum); + AssertArraysEqual(*expected, *actual, /*verbose=*/true); +} + +void CheckIsIn(const std::shared_ptr& type, const std::string& input_json, + const std::string& value_set_json, const std::string& expected_json, + SetLookupOptions::NullMatchingBehavior null_matching_behavior = + SetLookupOptions::MATCH) { + auto input = ArrayFromJSON(type, input_json); + auto value_set = ArrayFromJSON(type, value_set_json); + CheckIsIn(input, value_set, expected_json, null_matching_behavior); +} + +void CheckIsInChunked(const std::shared_ptr& input, + const std::shared_ptr& value_set, + const std::shared_ptr& expected, + SetLookupOptions::NullMatchingBehavior null_matching_behavior = + SetLookupOptions::MATCH) { + ASSERT_OK_AND_ASSIGN(Datum actual_datum, + IsIn(input, SetLookupOptions(value_set, null_matching_behavior))); + auto actual = actual_datum.chunked_array(); + ValidateOutput(actual_datum); + + // Output contiguous in a single chunk + ASSERT_EQ(1, actual->num_chunks()); + ASSERT_TRUE(actual->Equals(*expected)); +} + +void CheckIsInDictionary(const std::shared_ptr& type, + const std::shared_ptr& index_type, + const std::string& input_dictionary_json, + const std::string& input_index_json, + const std::string& value_set_json, + const 
std::string& expected_json, + SetLookupOptions::NullMatchingBehavior null_matching_behavior = + SetLookupOptions::MATCH) { + auto dict_type = dictionary(index_type, type); + auto indices = ArrayFromJSON(index_type, input_index_json); + auto dict = ArrayFromJSON(type, input_dictionary_json); + + ASSERT_OK_AND_ASSIGN(auto input, DictionaryArray::FromArrays(dict_type, indices, dict)); + auto value_set = ArrayFromJSON(type, value_set_json); + auto expected = ArrayFromJSON(boolean(), expected_json); + + ASSERT_OK_AND_ASSIGN(Datum actual_datum, + IsIn(input, SetLookupOptions(value_set, null_matching_behavior))); + std::shared_ptr actual = actual_datum.make_array(); + ValidateOutput(actual_datum); + AssertArraysEqual(*expected, *actual, /*verbose=*/true); +} + +void CheckIsIn(const std::shared_ptr input, + const std::shared_ptr& value_set, const std::string& expected_json, + bool skip_nulls) { auto expected = ArrayFromJSON(boolean(), expected_json); ASSERT_OK_AND_ASSIGN(Datum actual_datum, @@ -62,7 +122,7 @@ void CheckIsIn(const std::shared_ptr input, void CheckIsIn(const std::shared_ptr& type, const std::string& input_json, const std::string& value_set_json, const std::string& expected_json, - bool skip_nulls = false) { + bool skip_nulls) { auto input = ArrayFromJSON(type, input_json); auto value_set = ArrayFromJSON(type, value_set_json); CheckIsIn(input, value_set, expected_json, skip_nulls); @@ -70,8 +130,7 @@ void CheckIsIn(const std::shared_ptr& type, const std::string& input_j void CheckIsInChunked(const std::shared_ptr& input, const std::shared_ptr& value_set, - const std::shared_ptr& expected, - bool skip_nulls = false) { + const std::shared_ptr& expected, bool skip_nulls) { ASSERT_OK_AND_ASSIGN(Datum actual_datum, IsIn(input, SetLookupOptions(value_set, skip_nulls))); auto actual = actual_datum.chunked_array(); @@ -87,7 +146,7 @@ void CheckIsInDictionary(const std::shared_ptr& type, const std::string& input_dictionary_json, const std::string& input_index_json, 
const std::string& value_set_json, - const std::string& expected_json, bool skip_nulls = false) { + const std::string& expected_json, bool skip_nulls) { auto dict_type = dictionary(index_type, type); auto indices = ArrayFromJSON(index_type, input_index_json); auto dict = ArrayFromJSON(type, input_dictionary_json); @@ -185,18 +244,43 @@ TYPED_TEST(TestIsInKernelPrimitive, IsIn) { /*skip_nulls=*/false); CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, 1]", "[false, true, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, 1]", "[false, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, 1]", "[false, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, 1]", "[null, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, 1]", "[null, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Nulls in right array CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]", /*skip_nulls=*/false); CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[0, 1, 2, 3, 2]", "[2, null, 1]", "[null, true, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Nulls in both the arrays CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", "[true, 
true, true, false, true]", /*skip_nulls=*/false); CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", "[false, true, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", "[true, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", + "[false, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", "[null, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[2, null, 1]", "[null, true, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Duplicates in right array CheckIsIn(type, "[null, 1, 2, 3, 2]", "[null, 2, 2, null, 1, 1]", @@ -204,6 +288,18 @@ TYPED_TEST(TestIsInKernelPrimitive, IsIn) { /*skip_nulls=*/false); CheckIsIn(type, "[null, 1, 2, 3, 2]", "[null, 2, 2, null, 1, 1]", "[false, true, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[null, 2, 2, null, 1, 1]", + "[true, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[null, 2, 2, null, 1, 1]", + "[false, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[null, 2, 2, null, 1, 1]", + "[null, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[null, 1, 2, 3, 2]", "[null, 2, 2, null, 1, 1]", + "[null, true, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Empty Arrays CheckIsIn(type, "[]", "[]", "[]"); @@ -217,11 +313,30 @@ TEST_F(TestIsInKernel, NullType) { CheckIsIn(type, "[]", "[]", "[]"); CheckIsIn(type, "[null, null]", "[null]", "[false, false]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, null]", "[null]", "[false, 
false]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, "[null, null]", "[null]", "[null, null]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[null, null]", "[null]", "[null, null]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); + CheckIsIn(type, "[null, null]", "[]", "[false, false]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, null]", "[]", "[false, false]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, "[null, null]", "[]", "[null, null]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[null, null]", "[]", "[null, null]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Duplicates in right array CheckIsIn(type, "[null, null, null]", "[null, null]", "[true, true, true]"); CheckIsIn(type, "[null, null]", "[null, null]", "[false, false]", /*skip_nulls=*/true); + CheckIsIn(type, "[null, null]", "[null, null]", "[false, false]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, "[null, null]", "[null, null]", "[null, null]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[null, null]", "[null, null]", "[null, null]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); } TEST_F(TestIsInKernel, TimeTimestamp) { @@ -232,12 +347,36 @@ TEST_F(TestIsInKernel, TimeTimestamp) { "[true, true, false, true, true]", /*skip_nulls=*/false); CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", "[true, false, false, true, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, true, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, false, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, null, false, true, true]", + 
/*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, null, null, true, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Duplicates in right array CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", "[true, true, false, true, true]", /*skip_nulls=*/false); CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", "[true, false, false, true, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", + "[true, true, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", + "[true, false, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", + "[true, null, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", + "[true, null, null, true, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); } // Disallow mixing timezone-aware and timezone-naive values @@ -260,12 +399,36 @@ TEST_F(TestIsInKernel, TimeDuration) { "[true, true, false, true, true]", /*skip_nulls=*/false); CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", "[true, false, false, true, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, true, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, false, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, null, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, null]", + "[true, null, null, true, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // 
Duplicates in right array CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", "[true, true, false, true, true]", /*skip_nulls=*/false); CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", "[true, false, false, true, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", + "[true, true, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", + "[true, false, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", + "[true, null, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[1, null, 5, 1, 2]", "[2, 1, 1, null, 2]", + "[true, null, null, true, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); } // Different units, cast value_set to values will fail, then cast values to value_set @@ -285,17 +448,53 @@ TEST_F(TestIsInKernel, Boolean) { "[false, true, false, false, true]", /*skip_nulls=*/false); CheckIsIn(type, "[true, false, null, true, false]", "[false]", "[false, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[true, false, null, true, false]", "[false]", + "[false, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[true, false, null, true, false]", "[false]", + "[false, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, "[true, false, null, true, false]", "[false]", + "[false, true, null, false, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[true, false, null, true, false]", "[false]", + "[false, true, null, false, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); CheckIsIn(type, "[true, false, null, true, false]", "[false, null]", "[false, true, true, false, true]", /*skip_nulls=*/false); 
CheckIsIn(type, "[true, false, null, true, false]", "[false, null]", "[false, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[true, false, null, true, false]", "[false, null]", + "[false, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[true, false, null, true, false]", "[false, null]", + "[false, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, "[true, false, null, true, false]", "[false, null]", + "[false, true, null, false, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[true, false, null, true, false]", "[false, null]", + "[null, true, null, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Duplicates in right array CheckIsIn(type, "[true, false, null, true, false]", "[null, false, false, null]", "[false, true, true, false, true]", /*skip_nulls=*/false); CheckIsIn(type, "[true, false, null, true, false]", "[null, false, false, null]", "[false, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, "[true, false, null, true, false]", "[null, false, false, null]", + "[false, true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, "[true, false, null, true, false]", "[null, false, false, null]", + "[false, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, "[true, false, null, true, false]", "[null, false, false, null]", + "[false, true, null, false, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, "[true, false, null, true, false]", "[null, false, false, null]", + "[null, true, null, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); } TYPED_TEST_SUITE(TestIsInKernelBinary, BaseBinaryArrowTypes); @@ -309,6 +508,18 @@ TYPED_TEST(TestIsInKernelBinary, Binary) { CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", 
R"(["aaa", ""])", "[true, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", ""])", + "[true, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", ""])", + "[true, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", ""])", + "[true, true, false, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", ""])", + "[true, true, false, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])", "[true, true, false, true, true]", @@ -316,6 +527,18 @@ TYPED_TEST(TestIsInKernelBinary, Binary) { CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])", "[true, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])", + "[true, true, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])", + "[true, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])", + "[true, true, false, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"(["aaa", "", null])", + "[true, true, null, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Duplicates in right array CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", @@ -324,6 +547,18 @@ TYPED_TEST(TestIsInKernelBinary, Binary) { CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", R"([null, "aaa", "aaa", "", "", null])", "[true, true, false, false, true]", 
/*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", + R"([null, "aaa", "aaa", "", "", null])", "[true, true, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", + R"([null, "aaa", "aaa", "", "", null])", "[true, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", + R"([null, "aaa", "aaa", "", "", null])", "[true, true, false, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["aaa", "", "cc", null, ""])", + R"([null, "aaa", "aaa", "", "", null])", "[true, true, null, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); } TEST_F(TestIsInKernel, FixedSizeBinary) { @@ -335,6 +570,18 @@ TEST_F(TestIsInKernel, FixedSizeBinary) { CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb"])", "[true, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb"])", + "[true, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb"])", + "[true, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb"])", + "[true, true, false, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb"])", + "[true, true, false, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])", "[true, true, false, true, true]", @@ -342,6 +589,18 @@ TEST_F(TestIsInKernel, FixedSizeBinary) { CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])", "[true, true, 
false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])", + "[true, true, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])", + "[true, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])", + "[true, true, false, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", R"(["aaa", "bbb", null])", + "[true, true, null, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Duplicates in right array CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", @@ -352,6 +611,22 @@ TEST_F(TestIsInKernel, FixedSizeBinary) { R"(["aaa", null, "aaa", "bbb", "bbb", null])", "[true, true, false, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", + R"(["aaa", null, "aaa", "bbb", "bbb", null])", + "[true, true, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", + R"(["aaa", null, "aaa", "bbb", "bbb", null])", + "[true, true, false, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", + R"(["aaa", null, "aaa", "bbb", "bbb", null])", + "[true, true, false, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["aaa", "bbb", "ccc", null, "bbb"])", + R"(["aaa", null, "aaa", "bbb", "bbb", null])", + "[true, true, null, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); ASSERT_RAISES(Invalid, IsIn(ArrayFromJSON(fixed_size_binary(3), R"(["abc"])"), @@ -366,6 +641,18 @@ TEST_F(TestIsInKernel, Decimal) { CheckIsIn(type, R"(["12.3", "45.6", 
"78.9", null, "12.3"])", R"(["12.3", "78.9"])", "[true, false, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9"])", + "[true, false, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9"])", + "[true, false, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9"])", + "[true, false, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9"])", + "[true, false, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9", null])", "[true, false, true, true, true]", @@ -373,6 +660,18 @@ TEST_F(TestIsInKernel, Decimal) { CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", R"(["12.3", "78.9", null])", "[true, false, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"(["12.3", "78.9", null])", "[true, false, true, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"(["12.3", "78.9", null])", "[true, false, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"(["12.3", "78.9", null])", "[true, false, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"(["12.3", "78.9", null])", "[true, null, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Duplicates in right array CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", @@ 
-383,6 +682,22 @@ TEST_F(TestIsInKernel, Decimal) { R"([null, "12.3", "12.3", "78.9", "78.9", null])", "[true, false, true, false, true]", /*skip_nulls=*/true); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"([null, "12.3", "12.3", "78.9", "78.9", null])", + "[true, false, true, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"([null, "12.3", "12.3", "78.9", "78.9", null])", + "[true, false, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"([null, "12.3", "12.3", "78.9", "78.9", null])", + "[true, false, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsIn(type, R"(["12.3", "45.6", "78.9", null, "12.3"])", + R"([null, "12.3", "12.3", "78.9", "78.9", null])", + "[true, null, true, null, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); CheckIsIn(ArrayFromJSON(decimal128(4, 2), R"(["12.30", "45.60", "78.90"])"), ArrayFromJSON(type, R"(["12.3", "78.9"])"), "[true, false, true]"); @@ -405,6 +720,20 @@ TEST_F(TestIsInKernel, DictionaryArray) { /*value_set_json=*/"[4.1, 42, -1.0]", /*expected_json=*/"[true, true, false, true]", /*skip_nulls=*/false); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 2, null, 0]", + /*value_set_json=*/R"(["A", "B", "C"])", + /*expected_json=*/"[true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsInDictionary(/*type=*/float32(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/"[4.1, -1.0, 42, 9.8]", + /*input_index_json=*/"[1, 2, null, 0]", + /*value_set_json=*/"[4.1, 42, -1.0]", + /*expected_json=*/"[true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); // With nulls and skip_nulls=false 
CheckIsInDictionary(/*type=*/utf8(), @@ -428,6 +757,27 @@ TEST_F(TestIsInKernel, DictionaryArray) { /*value_set_json=*/R"(["C", "B", "A"])", /*expected_json=*/"[false, false, false, true, false]", /*skip_nulls=*/false); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[true, false, true, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[true, false, true, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A"])", + /*expected_json=*/"[false, false, false, true, false]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); // With nulls and skip_nulls=true CheckIsInDictionary(/*type=*/utf8(), @@ -451,6 +801,73 @@ TEST_F(TestIsInKernel, DictionaryArray) { /*value_set_json=*/R"(["C", "B", "A"])", /*expected_json=*/"[false, false, false, true, false]", /*skip_nulls=*/true); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[true, false, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + 
/*expected_json=*/"[false, false, false, true, false]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A"])", + /*expected_json=*/"[false, false, false, true, false]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + + // With nulls and null_matching_behavior=EMIT_NULL + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[true, false, null, true, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[null, false, null, true, null]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A"])", + /*expected_json=*/"[null, false, null, true, null]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + + // With nulls and null_matching_behavior=INCONCLUSIVE + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[true, null, null, true, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + 
/*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[null, null, null, true, null]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A"])", + /*expected_json=*/"[null, false, null, true, null]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // With duplicates in value_set CheckIsInDictionary(/*type=*/utf8(), @@ -474,6 +891,41 @@ TEST_F(TestIsInKernel, DictionaryArray) { /*value_set_json=*/R"(["C", "C", "B", "A", null, null, "B"])", /*expected_json=*/"[true, false, false, true, true]", /*skip_nulls=*/true); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 2, null, 0]", + /*value_set_json=*/R"(["A", "A", "B", "A", "B", "C"])", + /*expected_json=*/"[true, true, false, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "C", "B", "A", null, null, "B"])", + /*expected_json=*/"[true, false, true, true, true]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "C", "B", "A", null, null, "B"])", + /*expected_json=*/"[true, false, false, true, true]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + 
/*value_set_json=*/R"(["C", "C", "B", "A", null, null, "B"])", + /*expected_json=*/"[true, false, null, true, true]", + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + CheckIsInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "C", "B", "A", null, null, "B"])", + /*expected_json=*/"[true, null, null, true, true]", + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); } } @@ -487,14 +939,38 @@ TEST_F(TestIsInKernel, ChunkedArrayInvoke) { CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/false); CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/true); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::SKIP); + expected = ChunkedArrayFromJSON( + boolean(), {"[true, true, true, true, false]", "[true, null, true, false]"}); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + expected = ChunkedArrayFromJSON( + boolean(), {"[true, true, true, true, false]", "[true, null, true, false]"}); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); value_set = ChunkedArrayFromJSON(utf8(), {R"(["", "def"])", R"([null])"}); expected = ChunkedArrayFromJSON( boolean(), {"[false, true, true, false, false]", "[true, true, false, false]"}); CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/false); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::MATCH); expected = ChunkedArrayFromJSON( boolean(), {"[false, true, true, false, false]", "[true, false, false, false]"}); CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/true); + CheckIsInChunked(input, value_set, expected, + 
/*null_matching_behavior=*/SetLookupOptions::SKIP); + expected = ChunkedArrayFromJSON( + boolean(), {"[false, true, true, false, false]", "[true, null, false, false]"}); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + expected = ChunkedArrayFromJSON( + boolean(), {"[null, true, true, null, null]", "[true, null, null, null]"}); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); // Duplicates in value_set value_set = @@ -502,9 +978,21 @@ TEST_F(TestIsInKernel, ChunkedArrayInvoke) { expected = ChunkedArrayFromJSON( boolean(), {"[false, true, true, false, false]", "[true, true, false, false]"}); CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/false); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::MATCH); expected = ChunkedArrayFromJSON( boolean(), {"[false, true, true, false, false]", "[true, false, false, false]"}); CheckIsInChunked(input, value_set, expected, /*skip_nulls=*/true); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::SKIP); + expected = ChunkedArrayFromJSON( + boolean(), {"[false, true, true, false, false]", "[true, null, false, false]"}); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::EMIT_NULL); + expected = ChunkedArrayFromJSON( + boolean(), {"[null, true, true, null, null]", "[true, null, null, null]"}); + CheckIsInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::INCONCLUSIVE); } // ---------------------------------------------------------------------- @@ -514,7 +1002,70 @@ class TestIndexInKernel : public ::testing::Test { public: void CheckIndexIn(const std::shared_ptr& input, const std::shared_ptr& value_set, - const std::string& expected_json, bool skip_nulls = false) { + const std::string& expected_json, + SetLookupOptions::NullMatchingBehavior 
null_matching_behavior = + SetLookupOptions::MATCH) { + std::shared_ptr expected = ArrayFromJSON(int32(), expected_json); + + SetLookupOptions options(value_set, null_matching_behavior); + ASSERT_OK_AND_ASSIGN(Datum actual_datum, IndexIn(input, options)); + std::shared_ptr actual = actual_datum.make_array(); + ValidateOutput(actual_datum); + AssertArraysEqual(*expected, *actual, /*verbose=*/true); + } + + void CheckIndexIn(const std::shared_ptr& type, const std::string& input_json, + const std::string& value_set_json, const std::string& expected_json, + SetLookupOptions::NullMatchingBehavior null_matching_behavior = + SetLookupOptions::MATCH) { + std::shared_ptr input = ArrayFromJSON(type, input_json); + std::shared_ptr value_set = ArrayFromJSON(type, value_set_json); + return CheckIndexIn(input, value_set, expected_json, null_matching_behavior); + } + + void CheckIndexInChunked(const std::shared_ptr& input, + const std::shared_ptr& value_set, + const std::shared_ptr& expected, + SetLookupOptions::NullMatchingBehavior null_matching_behavior = + SetLookupOptions::MATCH) { + ASSERT_OK_AND_ASSIGN( + Datum actual, + IndexIn(input, SetLookupOptions(value_set, null_matching_behavior))); + ASSERT_EQ(Datum::CHUNKED_ARRAY, actual.kind()); + ValidateOutput(actual); + + auto actual_chunked = actual.chunked_array(); + + // Output contiguous in a single chunk + ASSERT_EQ(1, actual_chunked->num_chunks()); + ASSERT_TRUE(actual_chunked->Equals(*expected)); + } + + void CheckIndexInDictionary( + const std::shared_ptr& type, const std::shared_ptr& index_type, + const std::string& input_dictionary_json, const std::string& input_index_json, + const std::string& value_set_json, const std::string& expected_json, + SetLookupOptions::NullMatchingBehavior null_matching_behavior = + SetLookupOptions::MATCH) { + auto dict_type = dictionary(index_type, type); + auto indices = ArrayFromJSON(index_type, input_index_json); + auto dict = ArrayFromJSON(type, input_dictionary_json); + + 
ASSERT_OK_AND_ASSIGN(auto input, + DictionaryArray::FromArrays(dict_type, indices, dict)); + auto value_set = ArrayFromJSON(type, value_set_json); + auto expected = ArrayFromJSON(int32(), expected_json); + + SetLookupOptions options(value_set, null_matching_behavior); + ASSERT_OK_AND_ASSIGN(Datum actual_datum, IndexIn(input, options)); + std::shared_ptr actual = actual_datum.make_array(); + ValidateOutput(actual_datum); + AssertArraysEqual(*expected, *actual, /*verbose=*/true); + } + + void CheckIndexIn(const std::shared_ptr& input, + const std::shared_ptr& value_set, + const std::string& expected_json, bool skip_nulls) { std::shared_ptr expected = ArrayFromJSON(int32(), expected_json); SetLookupOptions options(value_set, skip_nulls); @@ -526,7 +1077,7 @@ class TestIndexInKernel : public ::testing::Test { void CheckIndexIn(const std::shared_ptr& type, const std::string& input_json, const std::string& value_set_json, const std::string& expected_json, - bool skip_nulls = false) { + bool skip_nulls) { std::shared_ptr input = ArrayFromJSON(type, input_json); std::shared_ptr value_set = ArrayFromJSON(type, value_set_json); return CheckIndexIn(input, value_set, expected_json, skip_nulls); @@ -553,7 +1104,7 @@ class TestIndexInKernel : public ::testing::Test { const std::string& input_dictionary_json, const std::string& input_index_json, const std::string& value_set_json, - const std::string& expected_json, bool skip_nulls = false) { + const std::string& expected_json, bool skip_nulls) { auto dict_type = dictionary(index_type, type); auto indices = ArrayFromJSON(index_type, input_index_json); auto dict = ArrayFromJSON(type, input_dictionary_json); @@ -656,6 +1207,16 @@ TYPED_TEST(TestIndexInKernelPrimitive, SkipNulls) { /*value_set=*/"[1, 3]", /*expected=*/"[null, 0, null, 1, null]", /*skip_nulls=*/true); + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, 3]", + /*expected=*/"[null, 0, null, 1, null]", + 
/*null_matching_behavior=*/SetLookupOptions::MATCH); + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, 3]", + /*expected=*/"[null, 0, null, 1, null]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); // Same with duplicates in value_set this->CheckIndexIn(type, /*input=*/"[0, 1, 2, 3, null]", @@ -667,6 +1228,16 @@ TYPED_TEST(TestIndexInKernelPrimitive, SkipNulls) { /*value_set=*/"[1, 1, 3, 3]", /*expected=*/"[null, 0, null, 2, null]", /*skip_nulls=*/true); + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, 1, 3, 3]", + /*expected=*/"[null, 0, null, 2, null]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, 1, 3, 3]", + /*expected=*/"[null, 0, null, 2, null]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); // Nulls in value_set this->CheckIndexIn(type, @@ -679,12 +1250,27 @@ TYPED_TEST(TestIndexInKernelPrimitive, SkipNulls) { /*value_set=*/"[1, 1, null, null, 3, 3]", /*expected=*/"[null, 0, null, 4, null]", /*skip_nulls=*/true); + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, null, 3]", + /*expected=*/"[null, 0, null, 2, 1]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, 1, null, null, 3, 3]", + /*expected=*/"[null, 0, null, 4, null]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); // Same with duplicates in value_set this->CheckIndexIn(type, /*input=*/"[0, 1, 2, 3, null]", /*value_set=*/"[1, 1, null, null, 3, 3]", /*expected=*/"[null, 0, null, 4, 2]", /*skip_nulls=*/false); + this->CheckIndexIn(type, + /*input=*/"[0, 1, 2, 3, null]", + /*value_set=*/"[1, 1, null, null, 3, 3]", + /*expected=*/"[null, 0, null, 4, 2]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); } TEST_F(TestIndexInKernel, NullType) { @@ -695,6 +1281,10 @@ TEST_F(TestIndexInKernel, NullType) { 
CheckIndexIn(null(), "[null, null]", "[null]", "[null, null]", /*skip_nulls=*/true); CheckIndexIn(null(), "[null, null]", "[]", "[null, null]", /*skip_nulls=*/true); + CheckIndexIn(null(), "[null, null]", "[null]", "[null, null]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIndexIn(null(), "[null, null]", "[]", "[null, null]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); } TEST_F(TestIndexInKernel, TimeTimestamp) { @@ -979,6 +1569,11 @@ TEST_F(TestIndexInKernel, FixedSizeBinary) { /*value_set=*/R"(["aaa", null, "bbb", "ccc"])", /*expected=*/R"([2, null, null, 0, 3, 0])", /*skip_nulls=*/true); + CheckIndexIn(fixed_size_binary(3), + /*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])", + /*value_set=*/R"(["aaa", null, "bbb", "ccc"])", + /*expected=*/R"([2, null, null, 0, 3, 0])", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIndexIn(fixed_size_binary(3), /*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])", @@ -989,6 +1584,11 @@ TEST_F(TestIndexInKernel, FixedSizeBinary) { /*value_set=*/R"(["aaa", "bbb", "ccc"])", /*expected=*/R"([1, null, null, 0, 2, 0])", /*skip_nulls=*/true); + CheckIndexIn(fixed_size_binary(3), + /*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])", + /*value_set=*/R"(["aaa", "bbb", "ccc"])", + /*expected=*/R"([1, null, null, 0, 2, 0])", + /*null_matching_behavior=*/SetLookupOptions::SKIP); // Duplicates in value_set CheckIndexIn(fixed_size_binary(3), @@ -1000,6 +1600,11 @@ TEST_F(TestIndexInKernel, FixedSizeBinary) { /*value_set=*/R"(["aaa", "aaa", null, null, "bbb", "bbb", "ccc"])", /*expected=*/R"([4, null, null, 0, 6, 0])", /*skip_nulls=*/true); + CheckIndexIn(fixed_size_binary(3), + /*input=*/R"(["bbb", null, "ddd", "aaa", "ccc", "aaa"])", + /*value_set=*/R"(["aaa", "aaa", null, null, "bbb", "bbb", "ccc"])", + /*expected=*/R"([4, null, null, 0, 6, 0])", + /*null_matching_behavior=*/SetLookupOptions::SKIP); // Empty input array CheckIndexIn(fixed_size_binary(5), R"([])", R"(["bbbbb", null, 
"aaaaa", "ccccc"])", @@ -1026,6 +1631,11 @@ TEST_F(TestIndexInKernel, MonthDayNanoInterval) { /*value_set=*/R"([null, [4, 5, 6], [5, -1, 5]])", /*expected=*/R"([2, 0, 1, 2, null])", /*skip_nulls=*/false); + CheckIndexIn(type, + /*input=*/R"([[5, -1, 5], null, [4, 5, 6], [5, -1, 5], [1, 2, 3]])", + /*value_set=*/R"([null, [4, 5, 6], [5, -1, 5]])", + /*expected=*/R"([2, 0, 1, 2, null])", + /*null_matching_behavior=*/SetLookupOptions::MATCH); // Duplicates in value_set CheckIndexIn( @@ -1034,6 +1644,12 @@ TEST_F(TestIndexInKernel, MonthDayNanoInterval) { /*value_set=*/R"([null, null, [0, 0, 0], [0, 0, 0], [7, 8, 0], [7, 8, 0]])", /*expected=*/R"([4, 0, 2, 4, null])", /*skip_nulls=*/false); + CheckIndexIn( + type, + /*input=*/R"([[7, 8, 0], null, [0, 0, 0], [7, 8, 0], [0, 0, 1]])", + /*value_set=*/R"([null, null, [0, 0, 0], [0, 0, 0], [7, 8, 0], [7, 8, 0]])", + /*expected=*/R"([4, 0, 2, 4, null])", + /*null_matching_behavior=*/SetLookupOptions::MATCH); } TEST_F(TestIndexInKernel, Decimal) { @@ -1048,6 +1664,16 @@ TEST_F(TestIndexInKernel, Decimal) { /*value_set=*/R"([null, "11", "12"])", /*expected=*/R"([2, null, 1, 2, null])", /*skip_nulls=*/true); + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"([null, "11", "12"])", + /*expected=*/R"([2, 0, 1, 2, null])", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"([null, "11", "12"])", + /*expected=*/R"([2, null, 1, 2, null])", + /*null_matching_behavior=*/SetLookupOptions::SKIP); CheckIndexIn(type, /*input=*/R"(["12", null, "11", "12", "13"])", @@ -1059,6 +1685,16 @@ TEST_F(TestIndexInKernel, Decimal) { /*value_set=*/R"(["11", "12"])", /*expected=*/R"([1, null, 0, 1, null])", /*skip_nulls=*/true); + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"(["11", "12"])", + /*expected=*/R"([1, null, 0, 1, null])", + 
/*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"(["11", "12"])", + /*expected=*/R"([1, null, 0, 1, null])", + /*null_matching_behavior=*/SetLookupOptions::SKIP); // Duplicates in value_set CheckIndexIn(type, @@ -1076,6 +1712,21 @@ TEST_F(TestIndexInKernel, Decimal) { /*value_set=*/R"([null, "11", "12"])", /*expected=*/R"([2, 0, 1, 2, null])", /*skip_nulls=*/false); + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"([null, null, "11", "11", "12", "12"])", + /*expected=*/R"([4, 0, 2, 4, null])", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"([null, null, "11", "11", "12", "12"])", + /*expected=*/R"([4, null, 2, 4, null])", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIndexIn(type, + /*input=*/R"(["12", null, "11", "12", "13"])", + /*value_set=*/R"([null, "11", "12"])", + /*expected=*/R"([2, 0, 1, 2, null])", + /*null_matching_behavior=*/SetLookupOptions::MATCH); CheckIndexIn( ArrayFromJSON(decimal256(3, 1), R"(["12.0", null, "11.0", "12.0", "13.0"])"), @@ -1099,6 +1750,20 @@ TEST_F(TestIndexInKernel, DictionaryArray) { /*value_set_json=*/"[4.1, 42, -1.0]", /*expected_json=*/"[2, 1, null, 0]", /*skip_nulls=*/false); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 2, null, 0]", + /*value_set_json=*/R"(["A", "B", "C"])", + /*expected_json=*/"[1, 2, null, 0]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexInDictionary(/*type=*/float32(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/"[4.1, -1.0, 42, 9.8]", + /*input_index_json=*/"[1, 2, null, 0]", + /*value_set_json=*/"[4.1, 42, -1.0]", + /*expected_json=*/"[2, 1, null, 0]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); // With nulls and 
skip_nulls=false CheckIndexInDictionary(/*type=*/utf8(), @@ -1122,6 +1787,27 @@ TEST_F(TestIndexInKernel, DictionaryArray) { /*value_set_json=*/R"(["C", "B", "A"])", /*expected_json=*/"[null, null, null, 2, null]", /*skip_nulls=*/false); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[1, null, 3, 2, 1]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[3, null, 3, 2, 3]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A"])", + /*expected_json=*/"[null, null, null, 2, null]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); // With nulls and skip_nulls=true CheckIndexInDictionary(/*type=*/utf8(), @@ -1145,6 +1831,27 @@ TEST_F(TestIndexInKernel, DictionaryArray) { /*value_set_json=*/R"(["C", "B", "A"])", /*expected_json=*/"[null, null, null, 2, null]", /*skip_nulls=*/true); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + /*expected_json=*/"[1, null, null, 2, 1]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A", null])", + 
/*expected_json=*/"[null, null, null, 2, null]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "B", "A"])", + /*expected_json=*/"[null, null, null, 2, null]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); // With duplicates in value_set CheckIndexInDictionary(/*type=*/utf8(), @@ -1168,6 +1875,27 @@ TEST_F(TestIndexInKernel, DictionaryArray) { /*value_set_json=*/R"(["C", "C", "B", "B", "A", "A", null])", /*expected_json=*/"[null, null, null, 4, null]", /*skip_nulls=*/true); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", "B", "C", "D"])", + /*input_index_json=*/"[1, 2, null, 0]", + /*value_set_json=*/R"(["A", "A", "B", "B", "C", "C"])", + /*expected_json=*/"[2, 4, null, 0]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "C", "B", "B", "A", "A", null])", + /*expected_json=*/"[6, null, 6, 4, 6]", + /*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexInDictionary(/*type=*/utf8(), + /*index_type=*/index_ty, + /*input_dictionary_json=*/R"(["A", null, "C", "D"])", + /*input_index_json=*/"[1, 3, null, 0, 1]", + /*value_set_json=*/R"(["C", "C", "B", "B", "A", "A", null])", + /*expected_json=*/"[null, null, null, 4, null]", + /*null_matching_behavior=*/SetLookupOptions::SKIP); } } @@ -1181,21 +1909,33 @@ TEST_F(TestIndexInKernel, ChunkedArrayInvoke) { CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/false); CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/true); + CheckIndexInChunked(input, value_set, expected, + 
/*null_matching_behavior=*/SetLookupOptions::MATCH); + CheckIndexInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::SKIP); // Null in value_set value_set = ChunkedArrayFromJSON(utf8(), {R"(["ghi", "def"])", R"([null, "abc"])"}); expected = ChunkedArrayFromJSON(int32(), {"[3, 1, 0, 3, null]", "[1, 2, 3, null]"}); CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/false); + CheckIndexInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::MATCH); expected = ChunkedArrayFromJSON(int32(), {"[3, 1, 0, 3, null]", "[1, null, 3, null]"}); CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/true); + CheckIndexInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::SKIP); // Duplicates in value_set value_set = ChunkedArrayFromJSON( utf8(), {R"(["ghi", "ghi", "def"])", R"(["def", null, null, "abc"])"}); expected = ChunkedArrayFromJSON(int32(), {"[6, 2, 0, 6, null]", "[2, 4, 6, null]"}); CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/false); + CheckIndexInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::MATCH); expected = ChunkedArrayFromJSON(int32(), {"[6, 2, 0, 6, null]", "[2, null, 6, null]"}); CheckIndexInChunked(input, value_set, expected, /*skip_nulls=*/true); + CheckIndexInChunked(input, value_set, expected, + /*null_matching_behavior=*/SetLookupOptions::SKIP); } TEST(TestSetLookup, DispatchBest) { diff --git a/cpp/src/arrow/util/reflection_internal.h b/cpp/src/arrow/util/reflection_internal.h index d7de913bafd88..5d281a265ff71 100644 --- a/cpp/src/arrow/util/reflection_internal.h +++ b/cpp/src/arrow/util/reflection_internal.h @@ -71,6 +71,30 @@ constexpr DataMemberProperty DataMember(std::string_view name, return {name, ptr}; } +template <typename C, typename T> +struct CoercedDataMemberProperty { + using Class = C; + using Type = T; + + constexpr Type get(const Class& obj) const { return (obj.*get_coerced_)(); } + + void 
set(Class* obj, Type value) const { (*obj).*ptr_for_set_ = std::move(value); } + + constexpr std::string_view name() const { return name_; } + + std::string_view name_; + Type Class::*ptr_for_set_; + Type (Class::*get_coerced_)() const; +}; + +template <typename Class, typename Type> +constexpr CoercedDataMemberProperty<Class, Type> CoercedDataMember(std::string_view name, + Type Class::*ptr, + Type (Class::*get)() + const) { + return {name, ptr, get}; +} + template struct PropertyTuple { template diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx index 609307528d2ec..25f77d8160ea8 100644 --- a/python/pyarrow/_compute.pyx +++ b/python/pyarrow/_compute.pyx @@ -2366,7 +2366,7 @@ cdef class Expression(_Weakrefable): 1, 2, 3 - ], skip_nulls=false})> + ], null_matching_behavior=MATCH})> """ def __init__(self):