From 3a5dc5832e9841d286429579acb26e5eeb2bbe97 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 22 Nov 2024 15:55:40 +0900 Subject: [PATCH] GH-44815: [C++][Parquet] Add an example to dump statistics read as `arrow::ArrayStatistics` --- cpp/tools/parquet/CMakeLists.txt | 7 ++- .../parquet/parquet_dump_arrow_statistics.cc | 58 +++++++++++++++++++ 2 files changed, 64 insertions(+), 1 deletion(-) create mode 100644 cpp/tools/parquet/parquet_dump_arrow_statistics.cc diff --git a/cpp/tools/parquet/CMakeLists.txt b/cpp/tools/parquet/CMakeLists.txt index 87c3254607589..5aaa456dcae08 100644 --- a/cpp/tools/parquet/CMakeLists.txt +++ b/cpp/tools/parquet/CMakeLists.txt @@ -16,7 +16,12 @@ # under the License. if(PARQUET_BUILD_EXECUTABLES) - set(PARQUET_TOOLS parquet-dump-footer parquet-dump-schema parquet-reader parquet-scan) + set(PARQUET_TOOLS + parquet-dump-arrow-statistics + parquet-dump-footer + parquet-dump-schema + parquet-reader + parquet-scan) foreach(TOOL ${PARQUET_TOOLS}) string(REGEX REPLACE "-" "_" TOOL_SOURCE ${TOOL}) diff --git a/cpp/tools/parquet/parquet_dump_arrow_statistics.cc b/cpp/tools/parquet/parquet_dump_arrow_statistics.cc new file mode 100644 index 0000000000000..8aeced94f6a74 --- /dev/null +++ b/cpp/tools/parquet/parquet_dump_arrow_statistics.cc @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include + +#include +#include + +namespace { +arrow::Status PrintArrowStatistics(const char* path) { + ARROW_ASSIGN_OR_RAISE( + auto input, arrow::io::MemoryMappedFile::Open(path, arrow::io::FileMode::READ)); + ARROW_ASSIGN_OR_RAISE(auto reader, + parquet::arrow::OpenFile(input, arrow::default_memory_pool())); + ARROW_ASSIGN_OR_RAISE(auto record_batch_reader, reader->GetRecordBatchReader()); + while (true) { + ARROW_ASSIGN_OR_RAISE(auto record_batch, record_batch_reader->Next()); + if (!record_batch) { + break; + } + ARROW_ASSIGN_OR_RAISE(auto statistics_array, record_batch->MakeStatisticsArray()); + std::cout << statistics_array->ToString() << std::endl; + } + return arrow::Status::OK(); +} +}; // namespace + +int main(int argc, char** argv) { + if (argc != 2) { + std::cerr << "Usage: " << argv[0] << " PARQUET_PATH" << std::endl; + std::cerr << " e.g.: " << argv[0] << " sample.parquet" << std::endl; + return EXIT_FAILURE; + } + + auto status = PrintArrowStatistics(argv[1]); + if (status.ok()) { + return EXIT_SUCCESS; + } else { + std::cerr << status.ToString() << std::endl; + return EXIT_FAILURE; + } +}