diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index b5423a64b2865..1a68fe238f10e 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -103,13 +103,14 @@ static parquet::ReaderProperties MakeReaderProperties(
 }
 
 static parquet::ArrowReaderProperties MakeArrowReaderProperties(
-    const ParquetFileFormat& format, const parquet::ParquetFileReader& reader) {
+    const ParquetFileFormat& format, int64_t batch_size,
+    const parquet::ParquetFileReader& reader) {
   parquet::ArrowReaderProperties properties(/* use_threads = */ false);
   for (const std::string& name : format.reader_options.dict_columns) {
     auto column_index = reader.metadata()->schema()->ColumnIndex(name);
     properties.set_read_dictionary(column_index, true);
   }
-  properties.set_batch_size(format.reader_options.batch_size);
+  properties.set_batch_size(batch_size);
   return properties;
 }
 
@@ -351,7 +352,8 @@ Result<std::shared_ptr<Schema>> ParquetFileFormat::Inspect(
   auto properties = MakeReaderProperties(*this);
   ARROW_ASSIGN_OR_RAISE(auto reader, OpenReader(source, std::move(properties)));
 
-  auto arrow_properties = MakeArrowReaderProperties(*this, *reader);
+  auto arrow_properties =
+      MakeArrowReaderProperties(*this, parquet::kArrowDefaultBatchSize, *reader);
   std::unique_ptr<parquet::arrow::FileReader> arrow_reader;
   RETURN_NOT_OK(parquet::arrow::FileReader::Make(default_memory_pool(), std::move(reader),
                                                  std::move(arrow_properties),
@@ -368,7 +370,7 @@ Result<ScanTaskIterator> ParquetFileFormat::ScanFile(
   auto properties = MakeReaderProperties(*this, context->pool);
   ARROW_ASSIGN_OR_RAISE(auto reader, OpenReader(source, std::move(properties)));
 
-  auto arrow_properties = MakeArrowReaderProperties(*this, *reader);
+  auto arrow_properties = MakeArrowReaderProperties(*this, options->batch_size, *reader);
   return ParquetScanTaskIterator::Make(options, context, std::move(reader),
                                        std::move(arrow_properties));
 }
diff --git a/cpp/src/arrow/dataset/file_parquet.h b/cpp/src/arrow/dataset/file_parquet.h
index 752d01e7e4217..471650d4959ad 100644
--- a/cpp/src/arrow/dataset/file_parquet.h
+++ b/cpp/src/arrow/dataset/file_parquet.h
@@ -60,7 +60,6 @@ class ARROW_DS_EXPORT ParquetFileFormat : public FileFormat {
     ///
     /// @{
     std::unordered_set<std::string> dict_columns;
-    int64_t batch_size = parquet::kArrowDefaultBatchSize;
     /// @}
   } reader_options;
 
diff --git a/cpp/src/arrow/dataset/scanner.h b/cpp/src/arrow/dataset/scanner.h
index 690d1cee5c8fc..361b01ec58f4e 100644
--- a/cpp/src/arrow/dataset/scanner.h
+++ b/cpp/src/arrow/dataset/scanner.h
@@ -90,6 +90,9 @@ class ARROW_DS_EXPORT ScanOptions {
   // sub-selection optimization.
   std::vector<std::string> MaterializedFields() const;
 
+  // Maximum row count for scanned batches.
+  int64_t batch_size = 64 << 10;
+
  private:
   explicit ScanOptions(std::shared_ptr<Schema> schema);
 };
diff --git a/cpp/src/arrow/dataset/scanner_test.cc b/cpp/src/arrow/dataset/scanner_test.cc
index c683f8848c72a..804955b1311a7 100644
--- a/cpp/src/arrow/dataset/scanner_test.cc
+++ b/cpp/src/arrow/dataset/scanner_test.cc
@@ -70,6 +70,14 @@ TEST_F(TestScanner, Scan) {
   AssertScannerEqualsRepetitionsOf(MakeScanner(batch), batch);
 }
 
+TEST_F(TestScanner, DISABLED_ScanWithCappedBatchSize) {
+  // TODO(bkietz) enable when InMemory* respects ScanOptions::batch_size
+  SetSchema({field("i32", int32()), field("f64", float64())});
+  auto batch = ConstantArrayGenerator::Zeroes(kBatchSize, schema_);
+  options_->batch_size = kBatchSize / 2;
+  AssertScannerEqualsRepetitionsOf(MakeScanner(batch), batch->Slice(kBatchSize / 2));
+}
+
 TEST_F(TestScanner, FilteredScan) {
   SetSchema({field("f64", float64())});
 
diff --git a/python/pyarrow/_dataset.pyx b/python/pyarrow/_dataset.pyx
index 69bb96cf92caf..ae19bba871e18 100644
--- a/python/pyarrow/_dataset.pyx
+++ b/python/pyarrow/_dataset.pyx
@@ -110,15 +110,6 @@ cdef class ParquetFileFormatReaderOptions:
         for value in set(values):
             self.options.dict_columns.insert(tobytes(value))
 
-    @property
-    def batch_size(self):
-        """Maximum number of rows in read record batches."""
-        return self.options.batch_size
-
-    @batch_size.setter
-    def batch_size(self, int value):
-        self.options.batch_size = value
-
 
 cdef class ParquetFileFormat(FileFormat):
 
diff --git a/python/pyarrow/includes/libarrow_dataset.pxd b/python/pyarrow/includes/libarrow_dataset.pxd
index ae9e2870d38a1..ac52e00cb3384 100644
--- a/python/pyarrow/includes/libarrow_dataset.pxd
+++ b/python/pyarrow/includes/libarrow_dataset.pxd
@@ -287,7 +287,6 @@ cdef extern from "arrow/dataset/api.h" namespace "arrow::dataset" nogil:
             c_bool use_buffered_stream
             int64_t buffer_size
             unordered_set[c_string] dict_columns
-            int64_t batch_size
 
     cdef cppclass CParquetFileFormat "arrow::dataset::ParquetFileFormat"(
             CFileFormat):
diff --git a/r/src/dataset.cpp b/r/src/dataset.cpp
index 5894e42b7d435..a1e81c5cbaf54 100644
--- a/r/src/dataset.cpp
+++ b/r/src/dataset.cpp
@@ -83,9 +83,6 @@ std::shared_ptr<ds::ParquetFileFormat> dataset___ParquetFileFormat__Make(List_ o
       fmt->reader_options.dict_columns.insert(name);
     }
   }
-  if (reader_options.containsElementNamed("batch_size")) {
-    fmt->reader_options.batch_size = reader_options["batch_size"];
-  }
   return fmt;
 }
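
Usage note (not part of the diff): with batch_size relocated from ParquetFileFormat::reader_options to ScanOptions, the cap is format-agnostic and is set where the scan is configured rather than per file format. Below is a minimal sketch, assuming a ScanOptions::Make(schema) factory as implied by the now-private constructor in scanner.h; dataset and schema construction are elided:

    #include <memory>
    #include <utility>

    #include "arrow/dataset/api.h"

    // Sketch only: build ScanOptions that cap every emitted record batch at
    // 32768 rows, half of the 64 << 10 default introduced in scanner.h above.
    std::shared_ptr<arrow::dataset::ScanOptions> MakeCappedOptions(
        std::shared_ptr<arrow::Schema> schema) {
      // ScanOptions::Make is assumed here, since the constructor is private.
      auto options = arrow::dataset::ScanOptions::Make(std::move(schema));
      options->batch_size = 1 << 15;  // applies to any format, not just Parquet
      return options;
    }

A scan task that honors the option (Parquet does, via ParquetScanTaskIterator above; the InMemory tasks do not yet, hence the DISABLED_ test) then yields batches of at most batch_size rows.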