using reader-properties to set the buffer size when reading column chunks.
it is useful for very large row-groups (a huge number of values per column).

Signed-off-by: Gal Salomon <[email protected]>
galsalomon66 committed Aug 27, 2024
1 parent a616839 commit 9b9f357
Showing 1 changed file with 15 additions and 6 deletions.
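
For context, here is a minimal sketch (standalone Arrow/Parquet C++, not the RGW-backed reader this file wires up) of what the patch's new default amounts to: a ReaderProperties configured for buffered-stream reads with a 16 MiB buffer, passed when opening a Parquet file. The file path and the standalone main() are illustrative assumptions.

```cpp
// Sketch only: mirrors what s3select_reader_properties() in the diff sets up,
// using the stock Arrow/Parquet C++ API rather than the RGW-backed OpenFile().
#include <arrow/io/file.h>
#include <parquet/exception.h>
#include <parquet/file_reader.h>
#include <parquet/properties.h>

#include <memory>

int main() {
  parquet::ReaderProperties props;
  props.enable_buffered_stream();            // stream column chunks through a buffer
  props.set_buffer_size(16 * 1024 * 1024);   // 16 MiB, same as RGW_buffer_size in the patch

  // "example.parquet" is a placeholder; in s3select the source comes from RGW.
  std::shared_ptr<arrow::io::ReadableFile> file;
  PARQUET_ASSIGN_OR_THROW(file, arrow::io::ReadableFile::Open("example.parquet"));

  std::unique_ptr<parquet::ParquetFileReader> reader =
      parquet::ParquetFileReader::Open(file, props);
  return 0;
}
```

With the buffered stream enabled, column chunks are read in buffer-sized increments rather than loaded into memory whole, which is what makes this helpful for very large row groups.
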
21 changes: 15 additions & 6 deletions include/s3select_parquet_intrf.h
@@ -747,6 +747,15 @@ class PARQUET_EXPORT RowGroupReader {
std::unique_ptr<Contents> contents_;
};

+//TODO external setting? RGW options ??
+#define RGW_buffer_size 1024*1024*16
+ReaderProperties s3select_reader_properties() {
+static ReaderProperties default_reader_properties;
+default_reader_properties.enable_buffered_stream();
+default_reader_properties.set_buffer_size(RGW_buffer_size);
+return default_reader_properties;
+}
+
class PARQUET_EXPORT ParquetFileReader {
public:
// Declare a virtual class 'Contents' to aid dependency injection and more
@@ -755,7 +764,7 @@ class PARQUET_EXPORT ParquetFileReader {
struct PARQUET_EXPORT Contents {
static std::unique_ptr<Contents> Open(
std::shared_ptr<::arrow::io::RandomAccessFile> source,
-const ReaderProperties& props = default_reader_properties(),
+const ReaderProperties& props = s3select_reader_properties(),
std::shared_ptr<FileMetaData> metadata = NULLPTR);

virtual ~Contents() = default;
@@ -776,21 +785,21 @@ class PARQUET_EXPORT ParquetFileReader {
ARROW_DEPRECATED("Use arrow::io::RandomAccessFile version")
static std::unique_ptr<ParquetFileReader> Open(
std::unique_ptr<RandomAccessSource> source,
-const ReaderProperties& props = default_reader_properties(),
+const ReaderProperties& props = s3select_reader_properties(),
std::shared_ptr<FileMetaData> metadata = NULLPTR);

// Create a file reader instance from an Arrow file object. Thread-safety is
// the responsibility of the file implementation
static std::unique_ptr<ParquetFileReader> Open(
std::shared_ptr<::arrow::io::RandomAccessFile> source,
-const ReaderProperties& props = default_reader_properties(),
+const ReaderProperties& props = s3select_reader_properties(),
std::shared_ptr<FileMetaData> metadata = NULLPTR);

// API Convenience to open a serialized Parquet file on disk, using Arrow IO
// interfaces.
static std::unique_ptr<ParquetFileReader> OpenFile(
const std::string& path,s3selectEngine::rgw_s3select_api* rgw, bool memory_map = true,
-const ReaderProperties& props = default_reader_properties(),
+const ReaderProperties& props = s3select_reader_properties(),
std::shared_ptr<FileMetaData> metadata = NULLPTR
);

@@ -1034,7 +1043,7 @@ class SerializedRowGroup : public RowGroupReader::Contents {
class SerializedFile : public ParquetFileReader::Contents {
public:
SerializedFile(std::shared_ptr<ArrowInputFile> source,
-const ReaderProperties& props = default_reader_properties())
+const ReaderProperties& props = s3select_reader_properties())
: source_(std::move(source)), properties_(props) {
PARQUET_ASSIGN_OR_THROW(source_size_, source_->GetSize());
}
@@ -1241,7 +1250,7 @@ void SerializedFile::ParseMetaDataOfEncryptedFileWithEncryptedFooter(

#if ARROW_VERSION_MAJOR > 9
file_metadata_ =
-FileMetaData::Make(metadata_buffer->data(), &metadata_len, default_reader_properties(), file_decryptor_);
+FileMetaData::Make(metadata_buffer->data(), &metadata_len, s3select_reader_properties(), file_decryptor_);
#else
file_metadata_ =
FileMetaData::Make(metadata_buffer->data(), &metadata_len, file_decryptor_);
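
Regarding the TODO in the new helper ("external setting? RGW options ??"): below is a hypothetical sketch of one way the hard-coded RGW_buffer_size could become configurable, via an environment variable with the patch's 16 MiB value as a fallback. The variable name and helper are illustrative only, not part of this commit or of any existing RGW option.

```cpp
// Hypothetical sketch only: replaces the compile-time RGW_buffer_size with a
// value read from the environment. The variable name is made up for illustration.
#include <cstdint>
#include <cstdlib>

#include <parquet/properties.h>

static int64_t s3select_buffer_size() {
  constexpr int64_t kDefault = 16LL * 1024 * 1024;  // same 16 MiB default as the patch
  if (const char* env = std::getenv("S3SELECT_PARQUET_BUFFER_SIZE")) {
    char* end = nullptr;
    const long long v = std::strtoll(env, &end, 10);
    if (end != env && v > 0) return static_cast<int64_t>(v);
  }
  return kDefault;
}

parquet::ReaderProperties s3select_reader_properties_configurable() {
  parquet::ReaderProperties props;
  props.enable_buffered_stream();
  props.set_buffer_size(s3select_buffer_size());
  return props;
}
```
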
