Skip to content

Commit

Permalink
GH-41909: [C++] Add arrow::ArrayStatistics (#43273)
Browse files Browse the repository at this point in the history
### Rationale for this change

We're discussing the API on the mailing list https://lists.apache.org/thread/kcpyq9npnh346pw90ljwbg0wxq6hwxxh and in GH-41909.

If we have `arrow::ArrayStatistics`, we can attach statistics read from Apache Parquet to `arrow::Array`s.

This only includes `arrow::ArrayStatistics`. See GH-42133 for how to use `arrow::ArrayStatistics` for Apache Parquet's statistics.

### What changes are included in this PR?

This only adds `arrow::ArrayStatistics` and its tests.

### Are these changes tested?

Yes.

### Are there any user-facing changes?

Yes.
* GitHub Issue: #41909

Authored-by: Sutou Kouhei <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
  • Loading branch information
kou authored Aug 4, 2024
1 parent 1992cc6 commit 39af73f
Show file tree
Hide file tree
Showing 4 changed files with 202 additions and 0 deletions.
2 changes: 2 additions & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -412,6 +412,7 @@ arrow_add_object_library(ARROW_ARRAY
array/concatenate.cc
array/data.cc
array/diff.cc
array/statistics.cc
array/util.cc
array/validate.cc)

Expand Down Expand Up @@ -1168,6 +1169,7 @@ add_arrow_test(array_test
array/array_struct_test.cc
array/array_union_test.cc
array/array_view_test.cc
array/statistics_test.cc
PRECOMPILED_HEADERS
"$<$<COMPILE_LANGUAGE:CXX>:arrow/testing/pch.h>")

Expand Down
21 changes: 21 additions & 0 deletions cpp/src/arrow/array/statistics.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

// This otherwise empty .cc file exists so that the non-inlined symbols of
// arrow::ArrayStatistics are embedded into libarrow.

#include "arrow/array/statistics.h"
76 changes: 76 additions & 0 deletions cpp/src/arrow/array/statistics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <cstdint>
#include <optional>
#include <string>
#include <string_view>
#include <variant>

#include "arrow/util/float16.h"
#include "arrow/util/visibility.h"

namespace arrow {

/// \brief Statistics for an Array
///
/// The Apache Arrow format itself doesn't carry statistics, but a data
/// source such as Apache Parquet may. Statistics associated with a
/// data source can be read through a unified API via this class.
///
/// Every member is optional: std::nullopt means "not set".
struct ARROW_EXPORT ArrayStatistics {
  /// \brief The type used to hold a minimum or a maximum value
  ///
  /// \note The std::string_view alternative does not own its
  /// characters; whoever stores one must keep the referenced buffer
  /// alive for as long as the statistics are used.
  using ValueType =
      std::variant<bool, int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t,
                   uint64_t, util::Float16, float, double, std::string, std::string_view>;

  // No destructor is declared on purpose. A user-declared destructor
  // (even a defaulted one) suppresses the implicitly-declared move
  // constructor and move assignment operator, so "moving" statistics
  // would silently deep-copy any std::string held in min/max.
  ArrayStatistics() = default;

  /// \brief The number of null values, may not be set
  std::optional<int64_t> null_count = std::nullopt;

  /// \brief The number of distinct values, may not be set
  std::optional<int64_t> distinct_count = std::nullopt;

  /// \brief The minimum value, may not be set
  std::optional<ValueType> min = std::nullopt;

  /// \brief Whether the minimum value is exact or not, may not be set
  std::optional<bool> is_min_exact = std::nullopt;

  /// \brief The maximum value, may not be set
  std::optional<ValueType> max = std::nullopt;

  /// \brief Whether the maximum value is exact or not, may not be set
  std::optional<bool> is_max_exact = std::nullopt;

  /// \brief Check two statistics for equality
  ///
  /// All six members are compared with their own operator==. For min
  /// and max this is std::variant equality: both sides must hold the
  /// same alternative, so e.g. int32_t(29) is not equal to int64_t(29)
  /// and a std::string is not equal to a std::string_view with the
  /// same characters.
  ///
  /// \param[in] other the statistics to compare with
  /// \return true if every member is equal, false otherwise
  bool Equals(const ArrayStatistics& other) const {
    return null_count == other.null_count && distinct_count == other.distinct_count &&
           min == other.min && is_min_exact == other.is_min_exact && max == other.max &&
           is_max_exact == other.is_max_exact;
  }

  /// \brief Check two statistics for equality, see Equals()
  bool operator==(const ArrayStatistics& other) const { return Equals(other); }

  /// \brief Check two statistics for inequality, see Equals()
  bool operator!=(const ArrayStatistics& other) const { return !Equals(other); }
};

} // namespace arrow
103 changes: 103 additions & 0 deletions cpp/src/arrow/array/statistics_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <gtest/gtest.h>

#include "arrow/array/statistics.h"

namespace arrow {

// null_count starts out unset and holds the assigned value afterwards.
TEST(ArrayStatisticsTest, TestNullCount) {
  ArrayStatistics stats;
  ASSERT_FALSE(stats.null_count.has_value());

  stats.null_count = 29;
  ASSERT_TRUE(stats.null_count.has_value());
  // Safe to dereference: the assertion above returns if there is no value.
  ASSERT_EQ(29, *stats.null_count);
}

// distinct_count starts out unset and holds the assigned value afterwards.
TEST(ArrayStatisticsTest, TestDistinctCount) {
  ArrayStatistics stats;
  ASSERT_FALSE(stats.distinct_count.has_value());

  stats.distinct_count = 29;
  ASSERT_TRUE(stats.distinct_count.has_value());
  // Safe to dereference: the assertion above returns if there is no value.
  ASSERT_EQ(29, *stats.distinct_count);
}

// min and is_min_exact start out unset; after assignment min holds the
// int32_t alternative with the stored value and is_min_exact is true.
TEST(ArrayStatisticsTest, TestMin) {
  ArrayStatistics stats;
  ASSERT_FALSE(stats.min.has_value());
  ASSERT_FALSE(stats.is_min_exact.has_value());

  stats.min = int32_t{29};
  stats.is_min_exact = true;

  ASSERT_TRUE(stats.min.has_value());
  const ArrayStatistics::ValueType& min_value = *stats.min;
  ASSERT_TRUE(std::holds_alternative<int32_t>(min_value));
  ASSERT_EQ(29, std::get<int32_t>(min_value));

  ASSERT_TRUE(stats.is_min_exact.has_value());
  ASSERT_TRUE(*stats.is_min_exact);
}

// max and is_max_exact start out unset; after assignment max holds the
// std::string alternative with the stored value and is_max_exact is false.
TEST(ArrayStatisticsTest, TestMax) {
  ArrayStatistics stats;
  ASSERT_FALSE(stats.max.has_value());
  ASSERT_FALSE(stats.is_max_exact.has_value());

  stats.max = std::string("hello");
  stats.is_max_exact = false;

  ASSERT_TRUE(stats.max.has_value());
  const ArrayStatistics::ValueType& max_value = *stats.max;
  ASSERT_TRUE(std::holds_alternative<std::string>(max_value));
  ASSERT_EQ("hello", std::get<std::string>(max_value));

  ASSERT_TRUE(stats.is_max_exact.has_value());
  ASSERT_FALSE(*stats.is_max_exact);
}

// Walks through every member in turn: setting it on one side only must
// make the two objects compare unequal, and setting the same value on the
// other side must make them compare equal again.
TEST(ArrayStatisticsTest, TestEquality) {
  ArrayStatistics lhs;
  ArrayStatistics rhs;

  // Two default-constructed statistics are equal.
  ASSERT_EQ(lhs, rhs);

  // null_count
  lhs.null_count = 29;
  ASSERT_NE(lhs, rhs);
  rhs.null_count = 29;
  ASSERT_EQ(lhs, rhs);

  // distinct_count
  lhs.distinct_count = 2929;
  ASSERT_NE(lhs, rhs);
  rhs.distinct_count = 2929;
  ASSERT_EQ(lhs, rhs);

  // min, using the std::string_view alternative
  lhs.min = std::string_view("world");
  ASSERT_NE(lhs, rhs);
  rhs.min = std::string_view("world");
  ASSERT_EQ(lhs, rhs);

  // is_min_exact
  lhs.is_min_exact = false;
  ASSERT_NE(lhs, rhs);
  rhs.is_min_exact = false;
  ASSERT_EQ(lhs, rhs);

  // max, using the Float16 alternative
  lhs.max = util::Float16(-29);
  ASSERT_NE(lhs, rhs);
  rhs.max = util::Float16(-29);
  ASSERT_EQ(lhs, rhs);

  // is_max_exact
  lhs.is_max_exact = true;
  ASSERT_NE(lhs, rhs);
  rhs.is_max_exact = true;
  ASSERT_EQ(lhs, rhs);
}

} // namespace arrow

0 comments on commit 39af73f

Please sign in to comment.