diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 6dc8358f502f5..9c66a58c54261 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -412,6 +412,7 @@ arrow_add_object_library(ARROW_ARRAY array/concatenate.cc array/data.cc array/diff.cc + array/statistics.cc array/util.cc array/validate.cc) @@ -1168,6 +1169,7 @@ add_arrow_test(array_test array/array_struct_test.cc array/array_union_test.cc array/array_view_test.cc + array/statistics_test.cc PRECOMPILED_HEADERS "$<$<COMPILE_LANGUAGE:CXX>:arrow/testing/pch.h>") diff --git a/cpp/src/arrow/array/statistics.cc b/cpp/src/arrow/array/statistics.cc new file mode 100644 index 0000000000000..b661c9fbaffed --- /dev/null +++ b/cpp/src/arrow/array/statistics.cc @@ -0,0 +1,21 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// This otherwise empty .cc file exists to embed the non-inlined symbols of +// arrow::ArrayStatistics into libarrow. 
+
+#include "arrow/array/statistics.h"
diff --git a/cpp/src/arrow/array/statistics.h b/cpp/src/arrow/array/statistics.h
new file mode 100644
index 0000000000000..7357e27f41f5b
--- /dev/null
+++ b/cpp/src/arrow/array/statistics.h
@@ -0,0 +1,94 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+#include <optional>
+#include <string>
+#include <string_view>
+#include <variant>
+
+#include "arrow/util/float16.h"
+#include "arrow/util/visibility.h"
+
+namespace arrow {
+
+/// \brief Statistics for an Array
+///
+/// The Apache Arrow format itself doesn't carry statistics, but a data
+/// source such as Apache Parquet may. The statistics associated with such
+/// a data source can be read through a unified API via this class.
+///
+/// Every field is optional and starts out unset (std::nullopt).
+struct ARROW_EXPORT ArrayStatistics {
+  /// \brief The type of a minimum or maximum value
+  ///
+  /// The std::string_view alternative doesn't own its characters: the
+  /// buffer it refers to must outlive every use of the statistics.
+  using ValueType =
+      std::variant<bool, int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t,
+                   uint64_t, util::Float16, float, double, std::string, std::string_view>;
+
+  ArrayStatistics() = default;
+  ~ArrayStatistics() = default;
+
+  // Declaring the destructor suppresses the implicit move operations, so
+  // the copy and move operations are defaulted explicitly (Rule of Five).
+  ArrayStatistics(const ArrayStatistics&) = default;
+  ArrayStatistics& operator=(const ArrayStatistics&) = default;
+  ArrayStatistics(ArrayStatistics&&) = default;
+  ArrayStatistics& operator=(ArrayStatistics&&) = default;
+
+  /// \brief The number of null values, may not be set
+  std::optional<int64_t> null_count = std::nullopt;
+
+  /// \brief The number of distinct values, may not be set
+  std::optional<int64_t> distinct_count = std::nullopt;
+
+  /// \brief The minimum value, may not be set
+  std::optional<ValueType> min = std::nullopt;
+
+  /// \brief Whether the minimum value is exact or not, may not be set
+  std::optional<bool> is_min_exact = std::nullopt;
+
+  /// \brief The maximum value, may not be set
+  std::optional<ValueType> max = std::nullopt;
+
+  /// \brief Whether the maximum value is exact or not, may not be set
+  std::optional<bool> is_max_exact = std::nullopt;
+
+  /// \brief Check two statistics for equality
+  ///
+  /// \param[in] other the statistics to compare with
+  /// \return true if all six fields compare equal. Two unset fields are
+  /// equal and a set field never equals an unset one; min/max values
+  /// holding different ValueType alternatives are never equal.
+  bool Equals(const ArrayStatistics& other) const {
+    return null_count == other.null_count && distinct_count == other.distinct_count &&
+           min == other.min && is_min_exact == other.is_min_exact && max == other.max &&
+           is_max_exact == other.is_max_exact;
+  }
+
+  /// \brief Check two statistics for equality
+  bool operator==(const ArrayStatistics& other) const { return Equals(other); }
+
+  /// \brief Check two statistics for inequality
+  bool operator!=(const ArrayStatistics& other) const { return !Equals(other); }
+};
+
+}  // namespace arrow
diff --git a/cpp/src/arrow/array/statistics_test.cc b/cpp/src/arrow/array/statistics_test.cc
new file mode 100644
index 0000000000000..a465ac0bc2e0d
--- /dev/null
+++ b/cpp/src/arrow/array/statistics_test.cc
@@ -0,0 +1,103 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. 
You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include "arrow/array/statistics.h"
+
+namespace arrow {
+
+TEST(ArrayStatisticsTest, TestNullCount) {  // unset until assigned
+  ArrayStatistics statistics;
+  ASSERT_FALSE(statistics.null_count.has_value());
+  statistics.null_count = 29;
+  ASSERT_TRUE(statistics.null_count.has_value());
+  ASSERT_EQ(29, statistics.null_count.value());
+}
+
+TEST(ArrayStatisticsTest, TestDistinctCount) {  // unset until assigned
+  ArrayStatistics statistics;
+  ASSERT_FALSE(statistics.distinct_count.has_value());
+  statistics.distinct_count = 29;
+  ASSERT_TRUE(statistics.distinct_count.has_value());
+  ASSERT_EQ(29, statistics.distinct_count.value());
+}
+
+TEST(ArrayStatisticsTest, TestMin) {  // min holds an int32_t alternative
+  ArrayStatistics statistics;
+  ASSERT_FALSE(statistics.min.has_value());
+  ASSERT_FALSE(statistics.is_min_exact.has_value());
+  statistics.min = static_cast<int32_t>(29);
+  statistics.is_min_exact = true;
+  ASSERT_TRUE(statistics.min.has_value());
+  ASSERT_TRUE(std::holds_alternative<int32_t>(statistics.min.value()));
+  ASSERT_EQ(29, std::get<int32_t>(statistics.min.value()));
+  ASSERT_TRUE(statistics.is_min_exact.has_value());
+  ASSERT_TRUE(statistics.is_min_exact.value());
+}
+
+TEST(ArrayStatisticsTest, TestMax) {  // max holds a std::string alternative
+  ArrayStatistics statistics;
+  ASSERT_FALSE(statistics.max.has_value());
+  ASSERT_FALSE(statistics.is_max_exact.has_value());
+  statistics.max = std::string("hello");
+  statistics.is_max_exact = false;
+  ASSERT_TRUE(statistics.max.has_value());
+  ASSERT_TRUE(std::holds_alternative<std::string>(statistics.max.value()));
+  ASSERT_EQ("hello", std::get<std::string>(statistics.max.value()));
+  ASSERT_TRUE(statistics.is_max_exact.has_value());
+  ASSERT_FALSE(statistics.is_max_exact.value());
+}
+
+TEST(ArrayStatisticsTest, TestEquality) {  // each field in turn affects ==/!=
+  ArrayStatistics statistics1;
+  ArrayStatistics statistics2;
+
+  ASSERT_EQ(statistics1, statistics2);
+
+  statistics1.null_count = 29;
+  ASSERT_NE(statistics1, statistics2);
+  statistics2.null_count = 29;
+  ASSERT_EQ(statistics1, statistics2);
+
+  statistics1.distinct_count = 2929;
+  ASSERT_NE(statistics1, statistics2);
+  statistics2.distinct_count = 2929;
+  ASSERT_EQ(statistics1, statistics2);
+
+  statistics1.min = std::string_view("world");
+  ASSERT_NE(statistics1, statistics2);
+  statistics2.min = std::string_view("world");
+  ASSERT_EQ(statistics1, statistics2);
+
+  statistics1.is_min_exact = false;
+  ASSERT_NE(statistics1, statistics2);
+  statistics2.is_min_exact = false;
+  ASSERT_EQ(statistics1, statistics2);
+
+  statistics1.max = arrow::util::Float16(-29);
+  ASSERT_NE(statistics1, statistics2);
+  statistics2.max = arrow::util::Float16(-29);
+  ASSERT_EQ(statistics1, statistics2);
+
+  statistics1.is_max_exact = true;
+  ASSERT_NE(statistics1, statistics2);
+  statistics2.is_max_exact = true;
+  ASSERT_EQ(statistics1, statistics2);
+}
+
+}  // namespace arrow