diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index 3c27bd2b00ed1..7c17453cc2a87 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -346,12 +346,15 @@ Result<std::shared_ptr<Schema>> ParquetFileFormat::Inspect(
 }
 
 Result<std::shared_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReader(
-    const FileSource& source, const std::shared_ptr<ScanOptions>& options) const {
-  return GetReaderAsync(source, options).result();
+    const FileSource& source,
+    const std::shared_ptr<ScanOptions>& options,
+    const std::shared_ptr<parquet::FileMetaData>& metadata) const {
+  return GetReaderAsync(source, options, metadata).result();
 }
 
 Future<std::shared_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReaderAsync(
-    const FileSource& source, const std::shared_ptr<ScanOptions>& options) const {
+    const FileSource& source, const std::shared_ptr<ScanOptions>& options,
+    const std::shared_ptr<parquet::FileMetaData>& metadata) const {
   ARROW_ASSIGN_OR_RAISE(
       auto parquet_scan_options,
       GetFragmentScanOptions<ParquetFragmentScanOptions>(kParquetTypeName, options.get(),
@@ -360,8 +363,8 @@ Future<std::shared_ptr<parquet::arrow::FileReader>> ParquetFileFormat::GetReader
       MakeReaderProperties(*this, parquet_scan_options.get(), options->pool);
   ARROW_ASSIGN_OR_RAISE(auto input, source.Open());
   // TODO(ARROW-12259): workaround since we have Future<(move-only type)>
-  auto reader_fut =
-      parquet::ParquetFileReader::OpenAsync(std::move(input), std::move(properties));
+  auto reader_fut = parquet::ParquetFileReader::OpenAsync(
+      std::move(input), std::move(properties), metadata);
   auto path = source.path();
   auto self = checked_pointer_cast<const ParquetFileFormat>(shared_from_this());
   return reader_fut.Then(
@@ -443,7 +446,7 @@ Result<RecordBatchGenerator> ParquetFileFormat::ScanBatchesAsync(
   // If RowGroup metadata is cached completely we can pre-filter RowGroups before opening
   // a FileReader, potentially avoiding IO altogether if all RowGroups are excluded due to
   // prior statistics knowledge. In the case where a RowGroup doesn't have statistics
-  // metdata, it will not be excluded.
+  // metadata, it will not be excluded.
   if (parquet_fragment->metadata() != nullptr) {
     ARROW_ASSIGN_OR_RAISE(row_groups, parquet_fragment->FilterRowGroups(options->filter));
     pre_filtered = true;
@@ -483,8 +486,9 @@ Result<RecordBatchGenerator> ParquetFileFormat::ScanBatchesAsync(
         MakeSerialReadaheadGenerator(std::move(sliced), batch_readahead);
     return sliced_readahead;
   };
-  auto generator = MakeFromFuture(GetReaderAsync(parquet_fragment->source(), options)
-                                      .Then(std::move(make_generator)));
+  auto generator = MakeFromFuture(
+      GetReaderAsync(parquet_fragment->source(), options, parquet_fragment->metadata())
+          .Then(std::move(make_generator)));
   WRAP_ASYNC_GENERATOR_WITH_CHILD_SPAN(
       generator, "arrow::dataset::ParquetFileFormat::ScanBatchesAsync::Next");
   return generator;
diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h
index 1087fb9f9def2..aa422fba133fa 100644
--- a/cpp/src/arrow/dataset/file_parquet.h
+++ b/cpp/src/arrow/dataset/file_parquet.h
@@ -117,10 +117,12 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat {
 
   /// \brief Return a FileReader on the given source.
   Result<std::shared_ptr<parquet::arrow::FileReader>> GetReader(
-      const FileSource& source, const std::shared_ptr<ScanOptions>& options) const;
+      const FileSource& source, const std::shared_ptr<ScanOptions>& options,
+      const std::shared_ptr<parquet::FileMetaData>& metadata = NULLPTR) const;
 
   Future<std::shared_ptr<parquet::arrow::FileReader>> GetReaderAsync(
-      const FileSource& source, const std::shared_ptr<ScanOptions>& options) const;
+      const FileSource& source, const std::shared_ptr<ScanOptions>& options,
+      const std::shared_ptr<parquet::FileMetaData>& metadata = NULLPTR) const;
 
   Result<std::shared_ptr<FileWriter>> MakeWriter(
       std::shared_ptr<io::OutputStream> destination, std::shared_ptr<Schema> schema,
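The patch threads an already-parsed Parquet footer (`parquet::FileMetaData`) from the caller down into `parquet::ParquetFileReader::OpenAsync()`, so a fragment whose metadata is cached no longer re-reads and re-parses the footer when its reader is opened. Below is a minimal caller-side sketch (not part of the patch) of how the new optional `metadata` argument to `ParquetFileFormat::GetReader()` might be exercised; the helper name `OpenWithCachedFooter`, the use of `LocalFileSystem`, and the default-constructed `ScanOptions` are illustrative assumptions only.

```cpp
// A minimal sketch, assuming the new GetReader() overload from this patch.
// Helper name, path handling, and default ScanOptions are illustrative only.
#include <memory>
#include <string>

#include "arrow/dataset/file_parquet.h"
#include "arrow/dataset/scanner.h"
#include "arrow/filesystem/localfs.h"
#include "arrow/result.h"
#include "parquet/arrow/reader.h"
#include "parquet/file_reader.h"  // parquet::ReadMetaData()

namespace ds = arrow::dataset;

arrow::Result<std::shared_ptr<parquet::arrow::FileReader>> OpenWithCachedFooter(
    const std::string& path) {
  auto fs = std::make_shared<arrow::fs::LocalFileSystem>();
  ds::FileSource source(path, fs);

  // Parse the footer once, e.g. while discovering fragments.
  ARROW_ASSIGN_OR_RAISE(auto input, source.Open());
  std::shared_ptr<parquet::FileMetaData> metadata = parquet::ReadMetaData(input);

  // Later, open a reader for the same file. Passing the cached FileMetaData
  // lets ParquetFileReader::OpenAsync() skip re-reading the footer.
  auto format = std::make_shared<ds::ParquetFileFormat>();
  auto scan_options = std::make_shared<ds::ScanOptions>();
  return format->GetReader(source, scan_options, metadata);
}
```

Inside the scanner this happens automatically: `ScanBatchesAsync()` now forwards `parquet_fragment->metadata()` to `GetReaderAsync()`, so the pre-filtering path that already holds complete RowGroup metadata does not trigger a second footer read.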