From baee5ad71820aa72b6a2a880afa05a6d26aef57a Mon Sep 17 00:00:00 2001 From: zhixingheyi-tian Date: Tue, 19 Jul 2022 14:16:28 +0800 Subject: [PATCH 1/3] Add parquet scan benchmark --- cpp/CMakeLists.txt | 5 + cpp/src/parquet/CMakeLists.txt | 2 + .../parquet/arrow/parquet_scan_benchmark.cc | 213 +++++++++++++++++ .../arrow/parquet_scan_string_benchmark.cc | 223 ++++++++++++++++++ cpp/src/parquet/arrow/test_utils.h | 132 +++++++++++ cpp/src/parquet/arrow/utils/exception.h | 25 ++ cpp/src/parquet/arrow/utils/macros.h | 104 ++++++++ 7 files changed, 704 insertions(+) create mode 100644 cpp/src/parquet/arrow/parquet_scan_benchmark.cc create mode 100644 cpp/src/parquet/arrow/parquet_scan_string_benchmark.cc create mode 100644 cpp/src/parquet/arrow/test_utils.h create mode 100644 cpp/src/parquet/arrow/utils/exception.h create mode 100644 cpp/src/parquet/arrow/utils/macros.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 234a66c00d875..0f112af489c88 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -48,6 +48,7 @@ if(POLICY CMP0074) endif() set(ARROW_VERSION "4.0.0") +add_compile_options(-g) string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" ARROW_BASE_VERSION "${ARROW_VERSION}") @@ -937,3 +938,7 @@ config_summary_message() if(${ARROW_BUILD_CONFIG_SUMMARY_JSON}) config_summary_json() endif() + + + + diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt index 3f3ca5a529917..0aceae25fc98b 100644 --- a/cpp/src/parquet/CMakeLists.txt +++ b/cpp/src/parquet/CMakeLists.txt @@ -399,6 +399,8 @@ add_parquet_benchmark(column_io_benchmark) add_parquet_benchmark(encoding_benchmark) add_parquet_benchmark(level_conversion_benchmark) add_parquet_benchmark(arrow/reader_writer_benchmark PREFIX "parquet-arrow") +add_parquet_benchmark(arrow/parquet_scan_benchmark PREFIX "parquet-arrow") +add_parquet_benchmark(arrow/parquet_scan_string_benchmark PREFIX "parquet-arrow") if(ARROW_WITH_BROTLI) add_definitions(-DARROW_WITH_BROTLI) diff --git 
a/cpp/src/parquet/arrow/parquet_scan_benchmark.cc b/cpp/src/parquet/arrow/parquet_scan_benchmark.cc new file mode 100644 index 0000000000000..2ab95e1c380d0 --- /dev/null +++ b/cpp/src/parquet/arrow/parquet_scan_benchmark.cc @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/record_batch.h" +#include "parquet/arrow/utils/macros.h" +#include "parquet/arrow/test_utils.h" + + +// namespace parquet { +// namespace benchmark { + +const int batch_buffer_size = 32768; + +class GoogleBenchmarkColumnarToRow { + public: + GoogleBenchmarkColumnarToRow(std::string file_name) { GetRecordBatchReader(file_name); } + + void GetRecordBatchReader(const std::string& input_file) { + std::unique_ptr<::parquet::arrow::FileReader> parquet_reader; + std::shared_ptr record_batch_reader; + + std::shared_ptr fs; + std::string file_name; + ARROW_ASSIGN_OR_THROW(fs, arrow::fs::FileSystemFromUriOrPath(input_file, &file_name)) + + ARROW_ASSIGN_OR_THROW(file, fs->OpenInputFile(file_name)); + + properties.set_batch_size(batch_buffer_size); + properties.set_pre_buffer(false); + properties.set_use_threads(false); + + ASSERT_NOT_OK(::parquet::arrow::FileReader::Make( + arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), + properties, &parquet_reader)); + + ASSERT_NOT_OK(parquet_reader->GetSchema(&schema)); + + auto num_rowgroups = parquet_reader->num_row_groups(); + + for (int i = 0; i < num_rowgroups; ++i) { + row_group_indices.push_back(i); + } + + auto num_columns = schema->num_fields(); + for (int i = 0; i < num_columns; ++i) { + column_indices.push_back(i); + } + } + + virtual void operator()(benchmark::State& state) {} + + protected: + long SetCPU(uint32_t cpuindex) { + cpu_set_t cs; + CPU_ZERO(&cs); + CPU_SET(cpuindex, &cs); + return sched_setaffinity(0, sizeof(cs), &cs); + } + + protected: + std::string file_name; + std::shared_ptr file; + std::vector row_group_indices; + std::vector column_indices; + std::shared_ptr schema; + parquet::ArrowReaderProperties properties; +}; +class GoogleBenchmarkColumnarToRow_CacheScan_Benchmark + : public GoogleBenchmarkColumnarToRow { + public: + 
GoogleBenchmarkColumnarToRow_CacheScan_Benchmark(std::string filename) + : GoogleBenchmarkColumnarToRow(filename) {} + void operator()(benchmark::State& state) { + if (state.range(0) == 0xffffffff) { + SetCPU(state.thread_index()); + } else { + SetCPU(state.range(0)); + } + + arrow::Compression::type compression_type = (arrow::Compression::type)1; + + std::shared_ptr record_batch; + int64_t elapse_read = 0; + int64_t num_batches = 0; + int64_t num_rows = 0; + int64_t init_time = 0; + int64_t write_time = 0; + + + std::vector local_column_indices = column_indices; + + std::shared_ptr local_schema; + local_schema = std::make_shared(*schema.get()); + + if (state.thread_index() == 0) std::cout << local_schema->ToString() << std::endl; + + for (auto _ : state) { + std::unique_ptr<::parquet::arrow::FileReader> parquet_reader; + std::shared_ptr record_batch_reader; + ASSERT_NOT_OK(::parquet::arrow::FileReader::Make( + ::arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), + properties, &parquet_reader)); + + std::vector> batches; + ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader( + row_group_indices, local_column_indices, &record_batch_reader)); + do { + TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch)); + + if (record_batch) { + // batches.push_back(record_batch); + num_batches += 1; + num_rows += record_batch->num_rows(); + } + } while (record_batch); + + std::cout << " parquet parse done elapsed time = " << elapse_read / 1000000 + << " rows = " << num_rows << std::endl; + } + + state.counters["rowgroups"] = + benchmark::Counter(row_group_indices.size(), benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1000); + state.counters["columns"] = + benchmark::Counter(column_indices.size(), benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1000); + state.counters["batches"] = benchmark::Counter( + num_batches, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); + 
state.counters["num_rows"] = benchmark::Counter( + num_rows, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); + state.counters["batch_buffer_size"] = + benchmark::Counter(batch_buffer_size, benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1024); + + state.counters["parquet_parse"] = benchmark::Counter( + elapse_read, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); + state.counters["init_time"] = benchmark::Counter( + init_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); + state.counters["write_time"] = benchmark::Counter( + write_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); + } +}; + +// } // namespace columnartorow +// } // namespace sparkcolumnarplugin + +int main(int argc, char** argv) { + uint32_t iterations = 1; + uint32_t threads = 1; + std::string datafile; + uint32_t cpu = 0xffffffff; + + for (int i = 0; i < argc; i++) { + if (strcmp(argv[i], "--iterations") == 0) { + iterations = atol(argv[i + 1]); + } else if (strcmp(argv[i], "--threads") == 0) { + threads = atol(argv[i + 1]); + } else if (strcmp(argv[i], "--file") == 0) { + datafile = argv[i + 1]; + } else if (strcmp(argv[i], "--cpu") == 0) { + cpu = atol(argv[i + 1]); + } + } + std::cout << "iterations = " << iterations << std::endl; + std::cout << "threads = " << threads << std::endl; + std::cout << "datafile = " << datafile << std::endl; + std::cout << "cpu = " << cpu << std::endl; + + GoogleBenchmarkColumnarToRow_CacheScan_Benchmark + bck(datafile); + + benchmark::RegisterBenchmark("GoogleBenchmarkColumnarToRow::CacheScan", bck) + ->Args({ + cpu, + }) + ->Iterations(iterations) + ->Threads(threads) + ->ReportAggregatesOnly(false) + ->MeasureProcessCPUTime() + ->Unit(benchmark::kSecond); + + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); +} diff --git a/cpp/src/parquet/arrow/parquet_scan_string_benchmark.cc 
b/cpp/src/parquet/arrow/parquet_scan_string_benchmark.cc new file mode 100644 index 0000000000000..58763e2edfee0 --- /dev/null +++ b/cpp/src/parquet/arrow/parquet_scan_string_benchmark.cc @@ -0,0 +1,223 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arrow/record_batch.h" +#include "parquet/arrow/utils/macros.h" +#include "parquet/arrow/test_utils.h" + + +// namespace parquet { +// namespace benchmark { + +const int batch_buffer_size = 32768; + +class GoogleBenchmarkParquetStringScan { + public: + GoogleBenchmarkParquetStringScan(std::string file_name) { GetRecordBatchReader(file_name); } + + void GetRecordBatchReader(const std::string& input_file) { + std::unique_ptr<::parquet::arrow::FileReader> parquet_reader; + std::shared_ptr record_batch_reader; + + std::shared_ptr fs; + std::string file_name; + ARROW_ASSIGN_OR_THROW(fs, arrow::fs::FileSystemFromUriOrPath(input_file, &file_name)) + + ARROW_ASSIGN_OR_THROW(file, fs->OpenInputFile(file_name)); + + properties.set_batch_size(batch_buffer_size); + properties.set_pre_buffer(false); + properties.set_use_threads(false); + + ASSERT_NOT_OK(::parquet::arrow::FileReader::Make( + arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), + properties, &parquet_reader)); + + ASSERT_NOT_OK(parquet_reader->GetSchema(&schema)); + + auto num_rowgroups = parquet_reader->num_row_groups(); + + for (int i = 0; i < num_rowgroups; ++i) { + row_group_indices.push_back(i); + } + + auto num_columns = schema->num_fields(); + std::cout << "Enter Is_binary_like Check: " << std::endl; + for (int i = 0; i < num_columns; ++i) { + auto field = schema->field(i); + auto type = field->type(); + if (arrow::is_binary_like(type->id())) { + std::cout << "Is_binary_like colIndex: " << i << std::endl; + column_indices.push_back(i); + } + } + } + + virtual void operator()(benchmark::State& state) {} + + protected: + long SetCPU(uint32_t cpuindex) { + cpu_set_t cs; + CPU_ZERO(&cs); + CPU_SET(cpuindex, &cs); + return sched_setaffinity(0, sizeof(cs), &cs); + } + + protected: + std::string file_name; + std::shared_ptr file; + std::vector 
row_group_indices; + std::vector column_indices; + std::shared_ptr schema; + parquet::ArrowReaderProperties properties; +}; +class GoogleBenchmarkParquetStringScan_IteratorScan_Benchmark + : public GoogleBenchmarkParquetStringScan { + public: + GoogleBenchmarkParquetStringScan_IteratorScan_Benchmark(std::string filename) + : GoogleBenchmarkParquetStringScan(filename) {} + void operator()(benchmark::State& state) { + if (state.range(0) == 0xffffffff) { + SetCPU(state.thread_index()); + } else { + SetCPU(state.range(0)); + } + + arrow::Compression::type compression_type = (arrow::Compression::type)1; + + std::shared_ptr record_batch; + int64_t elapse_read = 0; + int64_t num_batches = 0; + int64_t num_rows = 0; + int64_t init_time = 0; + int64_t write_time = 0; + + + std::vector local_column_indices = column_indices; + + for (auto val : local_column_indices){ + std::cout << "local_column_indices: is_binary_like colIndex: " << val << std::endl; + } + + std::shared_ptr local_schema; + local_schema = std::make_shared(*schema.get()); + + if (state.thread_index() == 0) std::cout << local_schema->ToString() << std::endl; + + for (auto _ : state) { + std::unique_ptr<::parquet::arrow::FileReader> parquet_reader; + std::shared_ptr record_batch_reader; + ASSERT_NOT_OK(::parquet::arrow::FileReader::Make( + ::arrow::default_memory_pool(), ::parquet::ParquetFileReader::Open(file), + properties, &parquet_reader)); + + std::vector> batches; + ASSERT_NOT_OK(parquet_reader->GetRecordBatchReader( + row_group_indices, local_column_indices, &record_batch_reader)); + do { + TIME_NANO_OR_THROW(elapse_read, record_batch_reader->ReadNext(&record_batch)); + + if (record_batch) { + // batches.push_back(record_batch); + num_batches += 1; + num_rows += record_batch->num_rows(); + } + } while (record_batch); + + std::cout << " parquet parse done elapsed time = " << elapse_read / 1000000 + << " rows = " << num_rows << std::endl; + } + + state.counters["rowgroups"] = + 
benchmark::Counter(row_group_indices.size(), benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1000); + state.counters["columns"] = + benchmark::Counter(column_indices.size(), benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1000); + state.counters["batches"] = benchmark::Counter( + num_batches, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); + state.counters["num_rows"] = benchmark::Counter( + num_rows, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); + state.counters["batch_buffer_size"] = + benchmark::Counter(batch_buffer_size, benchmark::Counter::kAvgThreads, + benchmark::Counter::OneK::kIs1024); + + state.counters["parquet_parse"] = benchmark::Counter( + elapse_read, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); + state.counters["init_time"] = benchmark::Counter( + init_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); + state.counters["write_time"] = benchmark::Counter( + write_time, benchmark::Counter::kAvgThreads, benchmark::Counter::OneK::kIs1000); + } +}; + +// } // namespace ParquetStringScan +// } // namespace sparkcolumnarplugin + +int main(int argc, char** argv) { + uint32_t iterations = 1; + uint32_t threads = 1; + std::string datafile; + uint32_t cpu = 0xffffffff; + + for (int i = 0; i < argc; i++) { + if (strcmp(argv[i], "--iterations") == 0) { + iterations = atol(argv[i + 1]); + } else if (strcmp(argv[i], "--threads") == 0) { + threads = atol(argv[i + 1]); + } else if (strcmp(argv[i], "--file") == 0) { + datafile = argv[i + 1]; + } else if (strcmp(argv[i], "--cpu") == 0) { + cpu = atol(argv[i + 1]); + } + } + std::cout << "iterations = " << iterations << std::endl; + std::cout << "threads = " << threads << std::endl; + std::cout << "datafile = " << datafile << std::endl; + std::cout << "cpu = " << cpu << std::endl; + + GoogleBenchmarkParquetStringScan_IteratorScan_Benchmark + bck(datafile); + + 
benchmark::RegisterBenchmark("GoogleBenchmarkParquetStringScan::IteratorScan", bck) + ->Args({ + cpu, + }) + ->Iterations(iterations) + ->Threads(threads) + ->ReportAggregatesOnly(false) + ->MeasureProcessCPUTime() + ->Unit(benchmark::kSecond); + + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + benchmark::Shutdown(); +} diff --git a/cpp/src/parquet/arrow/test_utils.h b/cpp/src/parquet/arrow/test_utils.h new file mode 100644 index 0000000000000..d3afa459dfdf6 --- /dev/null +++ b/cpp/src/parquet/arrow/test_utils.h @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "utils/macros.h" +using namespace arrow; + +using TreeExprBuilder = gandiva::TreeExprBuilder; +using FunctionNode = gandiva::FunctionNode; + +#define ASSERT_NOT_OK(status) \ + do { \ + ::arrow::Status __s = (status); \ + if (!__s.ok()) { \ + throw std::runtime_error(__s.message()); \ + } \ + } while (false); + +#define ARROW_ASSIGN_OR_THROW_IMPL(status_name, lhs, rexpr) \ + do { \ + auto status_name = (rexpr); \ + auto __s = status_name.status(); \ + if (!__s.ok()) { \ + throw std::runtime_error(__s.message()); \ + } \ + lhs = std::move(status_name).ValueOrDie(); \ + } while (false); + +#define ARROW_ASSIGN_OR_THROW_NAME(x, y) ARROW_CONCAT(x, y) + +#define ARROW_ASSIGN_OR_THROW(lhs, rexpr) \ + ARROW_ASSIGN_OR_THROW_IMPL(ARROW_ASSIGN_OR_THROW_NAME(_error_or_value, __COUNTER__), \ + lhs, rexpr); + +template +Status Equals(const T& expected, const T& actual) { + if (expected.Equals(actual)) { + return arrow::Status::OK(); + } + std::stringstream pp_expected; + std::stringstream pp_actual; + ::arrow::PrettyPrintOptions options(/*indent=*/2); + options.window = 50; + ASSERT_NOT_OK(PrettyPrint(expected, options, &pp_expected)); + ASSERT_NOT_OK(PrettyPrint(actual, options, &pp_actual)); + if (pp_expected.str() == pp_actual.str()) { + return arrow::Status::OK(); + } + return Status::Invalid("Expected RecordBatch is ", pp_expected.str(), " with schema ", + expected.schema()->ToString(), ", while actual is ", + pp_actual.str(), " with schema ", actual.schema()->ToString()); +} + +void MakeInputBatch(std::vector input_data, + std::shared_ptr sch, + std::shared_ptr* input_batch) { + // prepare input record Batch + std::vector> array_list; + int length = -1; + int i = 0; + for (auto data : input_data) { + std::shared_ptr a0; + ASSERT_NOT_OK(arrow::ipc::internal::json::ArrayFromJSON(sch->field(i++)->type(), + 
data.c_str(), &a0)); + if (length == -1) { + length = a0->length(); + } + assert(length == a0->length()); + array_list.push_back(a0); + } + + *input_batch = RecordBatch::Make(sch, length, array_list); + return; +} + +void ConstructNullInputBatch(std::shared_ptr* null_batch) { + std::vector> columns; + arrow::Int64Builder builder1; + builder1.AppendNull(); + builder1.Append(1); + + arrow::Int64Builder builder2; + builder2.Append(1); + builder2.AppendNull(); + + std::shared_ptr array1; + builder1.Finish(&array1); + std::shared_ptr array2; + builder2.Finish(&array2); + + columns.push_back(array1); + columns.push_back(array2); + + std::vector> schema_vec{ + arrow::field("col1", arrow::int64()), + arrow::field("col2", arrow::int64()), + }; + + std::shared_ptr schema{std::make_shared(schema_vec)}; + *null_batch = arrow::RecordBatch::Make(schema, 2, columns); + return; +} diff --git a/cpp/src/parquet/arrow/utils/exception.h b/cpp/src/parquet/arrow/utils/exception.h new file mode 100644 index 0000000000000..582903d0ef0fa --- /dev/null +++ b/cpp/src/parquet/arrow/utils/exception.h @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +class JniPendingException : public std::runtime_error { + public: + explicit JniPendingException(const std::string& arg) : runtime_error(arg) {} +}; \ No newline at end of file diff --git a/cpp/src/parquet/arrow/utils/macros.h b/cpp/src/parquet/arrow/utils/macros.h new file mode 100644 index 0000000000000..e123d46f82854 --- /dev/null +++ b/cpp/src/parquet/arrow/utils/macros.h @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include + +#include + +#include "parquet/arrow/utils/exception.h" + +#define TIME_NANO_DIFF(finish, start) \ + (finish.tv_sec - start.tv_sec) * 1000000000 + (finish.tv_nsec - start.tv_nsec) + +#define TIME_MICRO_OR_RAISE(time, expr) \ + do { \ + auto start = std::chrono::steady_clock::now(); \ + auto __s = (expr); \ + if (!__s.ok()) { \ + return __s; \ + } \ + auto end = std::chrono::steady_clock::now(); \ + time += std::chrono::duration_cast(end - start).count(); \ + } while (false); + +#define TIME_MICRO_OR_THROW(time, expr) \ + do { \ + auto start = std::chrono::steady_clock::now(); \ + auto __s = (expr); \ + if (!__s.ok()) { \ + throw JniPendingException(__s.message()); \ + } \ + auto end = std::chrono::steady_clock::now(); \ + time += std::chrono::duration_cast(end - start).count(); \ + } while (false); + +#define TIME_MICRO(time, res, expr) \ + do { \ + auto start = std::chrono::steady_clock::now(); \ + res = (expr); \ + auto end = std::chrono::steady_clock::now(); \ + time += std::chrono::duration_cast(end - start).count(); \ + } while (false); + +#define TIME_NANO_OR_RAISE(time, expr) \ + do { \ + auto start = std::chrono::steady_clock::now(); \ + auto __s = (expr); \ + if (!__s.ok()) { \ + return __s; \ + } \ + auto end = std::chrono::steady_clock::now(); \ + time += std::chrono::duration_cast(end - start).count(); \ + } while (false); + +#define TIME_NANO_OR_THROW(time, expr) \ + do { \ + auto start = std::chrono::steady_clock::now(); \ + auto __s = (expr); \ + if (!__s.ok()) { \ + throw JniPendingException(__s.message()); \ + } \ + auto end = std::chrono::steady_clock::now(); \ + time += std::chrono::duration_cast(end - start).count(); \ + } while (false); + +#define VECTOR_PRINT(v, name) \ + std::cout << "[" << name << "]:"; \ + for (int i = 0; i < v.size(); i++) { \ + if (i != v.size() - 1) \ + std::cout << v[i] << ","; \ + else \ + std::cout << v[i]; \ + } \ + std::cout << std::endl; + +#define THROW_NOT_OK(expr) \ + do { \ + 
auto __s = (expr); \ + if (!__s.ok()) { \ + throw JniPendingException(__s.message()); \ + } \ + } while (false); + +#define TIME_TO_STRING(time) \ + (time > 10000 ? time / 1000 : time) << (time > 10000 ? " ms" : " us") + +#define TIME_NANO_TO_STRING(time) \ + (time > 1e7 ? time / 1e6 : ((time > 1e4) ? time / 1e3 : time)) \ + << (time > 1e7 ? "ms" : (time > 1e4 ? "us" : "ns")) From 7fdf91400f11036980d002da9b720c531d0db606 Mon Sep 17 00:00:00 2001 From: zhixingheyi-tian Date: Fri, 22 Jul 2022 15:29:47 +0800 Subject: [PATCH 2/3] Add Usage --- cpp/README.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/cpp/README.md b/cpp/README.md index b083f3fe78e74..ec6b136aa83a3 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -32,3 +32,16 @@ to install pre-compiled binary versions of the library. Please refer to our latest [C++ Development Documentation][1]. [1]: https://github.com/apache/arrow/blob/master/docs/source/developers/cpp + +## Run parquet string scan benchmark +#### Minimal benchmark build +cd arrow +mkdir -p cpp/debug +cd cpp/debug +cmake -DCMAKE_BUILD_TYPE=Release -DARROW_BUILD_BENCHMARKS=ON -DARROW_WITH_ZLIB=ON -DARROW_JEMALLOC=OFF -DARROW_PARQUET=ON -DARROW_COMPUTE=ON -DARROW_DATASET=ON -DARROW_WITH_SNAPPY=ON -DARROW_FILESYSTEM=ON .. 
+ +#### Run benchmark and collect perf data +cd cpp/debug +./release/parquet-arrow-parquet-scan-string-benchmark --iterations 10 --threads 1 --file {parquet_path} --cpu 0 & +perf record -e cycles:ppp -C 0 sleep 10 + From 3b8ffa3eb3383eb43726e63b7f402ecec2efa53f Mon Sep 17 00:00:00 2001 From: zhixingheyi-tian Date: Fri, 22 Jul 2022 15:30:28 +0800 Subject: [PATCH 3/3] perf report --- cpp/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/README.md b/cpp/README.md index ec6b136aa83a3..9f563149beb77 100644 --- a/cpp/README.md +++ b/cpp/README.md @@ -44,4 +44,5 @@ cmake -DCMAKE_BUILD_TYPE=Release -DARROW_BUILD_BENCHMARKS=ON -DARROW_WITH_ZLIB=O cd cpp/debug ./release/parquet-arrow-parquet-scan-string-benchmark --iterations 10 --threads 1 --file {parquet_path} --cpu 0 & perf record -e cycles:ppp -C 0 sleep 10 +perf report