GH-39676: [C++][Parquet] Fast Random Rowgroup Reads #39677

Closed
wants to merge 27 commits into from

Changes from all commits (27 commits)
4b712fd
Add initial code to update metadata with OffsetIndex
Dec 8, 2023
fcb8c7d
Fix glitches.
Dec 8, 2023
5759219
Updated code to read from modified index. All pages work except for l…
Dec 8, 2023
aba996e
Get basic dictionary offset calculation working.
Dec 19, 2023
b566085
Try to generalize the dictionary offset calculation so that it will s…
Dec 19, 2023
e104546
Add offset test.
Dec 19, 2023
9555d90
Get basic index read test working inside project.
Dec 20, 2023
f0db140
Streamline test and test for multiple rowgroups.
Dec 20, 2023
e82e745
Cleanup test code.
Dec 20, 2023
fa7df1b
Add IndexedTo method to metadata class to create a subset via the pag…
Dec 20, 2023
69cf241
Add hackish code to be able to read only row group 0.
Dec 21, 2023
740e0e4
Bubble up option to read only rowgroup 0. Modify tests to use this me…
Dec 21, 2023
9b104fc
Add more comments for skip_bytes
Dec 21, 2023
bbb291b
Add more comments.
Dec 21, 2023
4a47b23
Get OffsetIndex reader working.
Dec 23, 2023
550c947
Streamline GetAllOffsets code.
Jan 8, 2024
db18d3b
Add code to benchmark indexed read.
Jan 9, 2024
2d8fb1f
Tune index reader.
Jan 9, 2024
f6cb980
Update performance notes.
Jan 10, 2024
703e28b
Upgrade internal thrift deserializer to support multiple reads effici…
Jan 12, 2024
c2850a6
Tweak code for performance and better benchmarking.
Jan 12, 2024
d972989
Slight performance tweak.
Jan 12, 2024
01081ab
Improve docs.
Jan 12, 2024
e9433f7
Update benchmark results.
Jan 12, 2024
e6ab5cf
Update code to write dictionary pages to offsets. Update set_column_o…
Jan 12, 2024
641db01
Merge branch 'main' into offset_reader
Jan 17, 2024
b88c414
Remove whitespace changes.
Jan 17, 2024
38 changes: 37 additions & 1 deletion cpp/src/generated/parquet_types.cpp

Some generated files are not rendered by default.

8 changes: 7 additions & 1 deletion cpp/src/generated/parquet_types.h

Some generated files are not rendered by default.

13 changes: 13 additions & 0 deletions cpp/src/parquet/column_writer.cc
@@ -323,6 +323,19 @@ class SerializedPageWriter : public PageWriter {

PARQUET_THROW_NOT_OK(sink_->Write(output_data_buffer, output_data_len));

// Add the dictionary page to the offsets
if (offset_index_builder_ != nullptr) {
const int64_t compressed_size = output_data_len + header_size;
if (compressed_size > std::numeric_limits<int32_t>::max()) {
throw ParquetException("Compressed dictionary page size overflows to INT32_MAX.");
}

/// start_pos is a relative offset in the buffered mode. It should be
/// adjusted via OffsetIndexBuilder::Finish() after BufferedPageWriter
/// has flushed all data pages.
offset_index_builder_->AddPage(start_pos, static_cast<int32_t>(compressed_size), -1);
}
Author
I think this may be a bug or oversight in the existing implementation. When decoding data for a rowgroup, if dictionary encoding is enabled you need to read the dictionary page. Currently this dictionary page is not written to the OffsetIndex section and, without it, there is no reliable way to know where the dictionary page can be found (since dictionary pages have variable sizes). Therefore, and since it is a type of data page, I believe it should be written to the OffsetIndex collection. In any case, the implementation in this PR needs it to decode dictionary data.

Contributor

I don't think this is an oversight. The dictionary page offset is stored in the column metadata and the first data page offset is also stored; the length should be the difference of these two. CC @wgtmac to confirm.

Member

Yes, the difference between the offsets of the dict page and the 1st data page can be used as an upper bound (exact in most cases) on the size of the dict page.
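For illustration, here is a minimal reader-side sketch of that approach (not part of this PR; DictionaryPageRange is a hypothetical helper built on the existing ColumnChunkMetaData accessors):

#include <optional>
#include <utility>

#include "parquet/metadata.h"

// Hypothetical helper: return the byte range [offset, offset + size_bound)
// that contains a chunk's dictionary page, or std::nullopt if the chunk has
// no dictionary page. The size is an upper bound (usually exact) because the
// page header sits between the two recorded offsets.
std::optional<std::pair<int64_t, int64_t>> DictionaryPageRange(
    const parquet::ColumnChunkMetaData& chunk) {
  if (!chunk.has_dictionary_page()) {
    return std::nullopt;
  }
  const int64_t dict_offset = chunk.dictionary_page_offset();
  const int64_t first_data_offset = chunk.data_page_offset();
  return std::make_pair(dict_offset, first_data_offset - dict_offset);
}

The catch, noted further down in the thread, is that this path requires the column chunk metadata from the footer, which is exactly what this PR is trying to avoid reading in full.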

Author

@emkornfield @wgtmac
So, what you are saying is correct, but it seems I have not clearly explained my reasoning here.
The idea behind PageIndex is to make "point lookups I/O efficient". So, you can look at a ColumnIndex to find a range of values for a particular column, then use the OffsetIndex to efficiently load that data (a minimal sketch of this flow is shown after this comment). For directly encoded data the OffsetIndex is great: you can directly load and examine column data. For dictionary encoded data there is a problem: you can't read the data without having the dictionary. So, I believe that the OffsetIndex should contain the dictionary page for two reasons:

  1. Without the dictionary page you cannot decode the data in subsequent data pages. Instead, you now have to go retrieve the information from the metadata. This violates the principle that the PageIndex should be a way to directly access data.
  2. The dictionary page is, in fact, a type of data page. So including it should be backwards compatible, since readers should recognize its type in the list of pages provided by an OffsetIndex.

Sadly, the spec seems to be silent on what should happen in the case of dictionary encodings. It seems that most writers do not provide this information. I wonder if @mkornacker or @lekv considered dictionary encodings or would be willing to comment?
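For context, a minimal sketch of the point-lookup flow described above, written against the existing page-index reader API (the target row is relative to the row group, and error handling is omitted; this is an illustrative assumption, not code from this PR):

#include <memory>

#include "arrow/io/interfaces.h"
#include "parquet/file_reader.h"
#include "parquet/page_index.h"

// Sketch: use the OffsetIndex to find and read just the data page that holds
// a given row of one column. For a dictionary-encoded column this is not
// sufficient on its own, because the dictionary page's location is not in
// the OffsetIndex today.
void ReadOnePage(std::shared_ptr<::arrow::io::RandomAccessFile> source,
                 int row_group, int column, int64_t target_row) {
  std::unique_ptr<parquet::ParquetFileReader> reader =
      parquet::ParquetFileReader::Open(source);
  auto offset_index =
      reader->GetPageIndexReader()->RowGroup(row_group)->GetOffsetIndex(column);

  // Pick the last page whose first_row_index is <= target_row.
  const auto& pages = offset_index->page_locations();
  size_t page_id = 0;
  for (size_t i = 0; i < pages.size(); ++i) {
    if (pages[i].first_row_index <= target_row) page_id = i;
  }
  const parquet::PageLocation& loc = pages[page_id];

  // Direct, I/O-efficient read of exactly this page's bytes.
  auto page_bytes =
      source->ReadAt(loc.offset, loc.compressed_page_size).ValueOrDie();
  (void)page_bytes;  // decoding the page is out of scope for this sketch
}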

Author

At any rate, in this PR I wanted to be able to decode data directly without having to read the full metadata header. For dictionary pages I was unable to guess the location just from the data page (because dictionary pages can have varying sizes). So, I added this information to the PageOffset since I felt it made sense to have it there, but this may not be within the PageIndex spec.

Contributor

We should clarify the spec, but I believe dictionary pages were intended to be retrieved via column metadata (which would be in the footer). I think it would have been mentioned if that were the intent, and I think the spec always assumes the footer is parsed; for page indexes there is a copy of the metadata there. The OffsetIndex is meant to directly parallel the ColumnIndex, which only keeps stats for data pages.

The dictionary page is, in fact, a type of data page.

In the sense that everything is "data", that is correct. But in Parquet, dictionaries are modeled as a separate concept. The actual type of page is not stored in either index, so it is not really possible to make this a compatible change: old readers would likely error out here. Parquet-mr does not touch the index when writing a dictionary page either; it only does so for data pages.

Author (@corwinjoy, Jan 20, 2024)

@emkornfield To be clear, I do mean dictionary page in a very specific way, as seen in the spec:

enum PageType {
  DATA_PAGE = 0;
  INDEX_PAGE = 1;
  DICTIONARY_PAGE = 2;
  DATA_PAGE_V2 = 3;
}

And this is reflected in the PageHeader

struct PageHeader {
  /** the type of the page: indicates which of the *_header fields is set **/
  1: required PageType type

  /** Uncompressed page size in bytes (not including this header) **/
  2: required i32 uncompressed_page_size

  /** Compressed (and potentially encrypted) page size in bytes, not including this header **/
  3: required i32 compressed_page_size

  /** The 32-bit CRC checksum for the page, to be calculated as follows:
   ...
   */
  4: optional i32 crc

  // Headers for page specific data.  One only will be set.
  5: optional DataPageHeader data_page_header;
  6: optional IndexPageHeader index_page_header;
  7: optional DictionaryPageHeader dictionary_page_header;
  8: optional DataPageHeaderV2 data_page_header_v2;
}

So the dictionary page is just one kind of page. This is reflected in the reader classes, as in column_page.h:150:

class DictionaryPage : public Page {
...
}

So, when dictionary encoded data is written to the file, first a dictionary page is written, then data pages are written. The parquet reader enforces and leverages this fact: it reads from a single collection of pages and requires that the first page be a dictionary page for dictionary encoded columns. So, including the dictionary page in the OffsetIndex seems pretty natural to me.
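As a rough illustration of that ordering, here is a sketch against the current reader API (not code from this PR): the page stream of a dictionary-encoded column chunk starts with a DICTIONARY_PAGE, and later data pages cannot be decoded without it.

#include <memory>

#include "parquet/column_page.h"
#include "parquet/column_reader.h"
#include "parquet/file_reader.h"

// Sketch: walk the page stream of one column chunk. For dictionary-encoded
// chunks the first page delivered is a DictionaryPage; data pages that use
// RLE_DICTIONARY encoding are only decodable after it has been seen.
void WalkPages(parquet::ParquetFileReader* reader, int row_group, int column) {
  std::unique_ptr<parquet::PageReader> pager =
      reader->RowGroup(row_group)->GetColumnPageReader(column);
  bool saw_dictionary = false;
  std::shared_ptr<parquet::Page> page;
  while ((page = pager->NextPage()) != nullptr) {
    if (page->type() == parquet::PageType::DICTIONARY_PAGE) {
      saw_dictionary = true;  // must precede any dictionary-encoded data page
    }
    // A data page encoded with RLE_DICTIONARY is undecodable unless
    // saw_dictionary is already true at this point.
  }
  (void)saw_dictionary;
}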

Contributor

I think we are in agreement on "a dictionary page is a type of page". The original statement I was responding to was:

The dictionary page is, in fact, a type of data page.

As highlighted above, data pages have distinct enums and header metadata. The PageIndex may have been originally intended for all page types, but in actual implementations, and given the existing metadata the index captures, it covers only data pages. Any change to these semantics poses compatibility challenges.


total_uncompressed_size_ += uncompressed_size + header_size;
total_compressed_size_ += output_data_len + header_size;
++dict_encoding_stats_[page.encoding()];
5 changes: 3 additions & 2 deletions cpp/src/parquet/file_reader.cc
@@ -909,8 +909,9 @@ ::arrow::Future<> ParquetFileReader::WhenBuffered(
// File metadata helpers

std::shared_ptr<FileMetaData> ReadMetaData(
-    const std::shared_ptr<::arrow::io::RandomAccessFile>& source) {
-  return ParquetFileReader::Open(source)->metadata();
+    const std::shared_ptr<::arrow::io::RandomAccessFile>& source,
+    const ReaderProperties& props) {
+  return ParquetFileReader::Open(source, props)->metadata();
}

// ----------------------------------------------------------------------
2 changes: 1 addition & 1 deletion cpp/src/parquet/file_reader.h
@@ -217,7 +217,7 @@ class PARQUET_EXPORT ParquetFileReader {

// Read only Parquet file metadata
std::shared_ptr<FileMetaData> PARQUET_EXPORT
-ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source);
+ReadMetaData(const std::shared_ptr<::arrow::io::RandomAccessFile>& source, const ReaderProperties& props = default_reader_properties());

/// \brief Scan all values in file. Useful for performance testing
/// \param[in] columns the column numbers to scan. If empty scans all
49 changes: 49 additions & 0 deletions cpp/src/parquet/metadata.cc
@@ -35,6 +35,7 @@
#include "parquet/schema.h"
#include "parquet/schema_internal.h"
#include "parquet/thrift_internal.h"
#include "parquet/page_index.h"

namespace parquet {

@@ -610,6 +611,10 @@ class FileMetaData::FileMetaDataImpl {
: properties_(std::move(properties)), file_decryptor_(std::move(file_decryptor)) {
metadata_ = std::make_unique<format::FileMetaData>();

if(properties_.read_only_rowgroup_0()) {
metadata_->read_only_rowgroup_0 = true;
}

auto footer_decryptor =
file_decryptor_ != nullptr ? file_decryptor_->GetFooterDecryptor() : nullptr;

@@ -826,6 +831,32 @@ class FileMetaData::FileMetaDataImpl {
file_decryptor_ = std::move(file_decryptor);
}

void set_column_offsets(const std::vector<std::shared_ptr<OffsetIndex>>& column_offsets, int64_t expected_num_rows) {
if (num_row_groups() != 1) {
throw ParquetException(
"This operation can only be applied to metadata with a single row group");
}

format::RowGroup& row_group = metadata_->row_groups[0];
int idx = 0;
for (format::ColumnChunk& chunk : row_group.columns) {
// Assume a chunk has only 1 page for now
Author

I'm actually not quite sure how this should go when a ColumnChunk has multiple pages or if that is even possible.

Contributor
A column chunk will very frequently have multiple pages.

auto pages = column_offsets[idx++].get()->page_locations();
for(PageLocation& page: pages) {
if(chunk.meta_data.dictionary_page_offset > 0 && page.first_row_index < 0) {
// Offset is a dictionary page
// Assumes OffsetIndex code has been updated to output dictionary offsets
chunk.meta_data.__set_dictionary_page_offset(page.offset);
} else {
chunk.meta_data.__set_data_page_offset(page.offset);
chunk.meta_data.__set_num_values(expected_num_rows);
// The compressed size can be set too large
// Use the value in row 0
}
}
}
}

private:
friend FileMetaDataBuilder;
uint32_t metadata_len_ = 0;
@@ -985,6 +1016,24 @@ std::shared_ptr<FileMetaData> FileMetaData::Subset(
return impl_->Subset(row_groups);
}

void FileMetaData::IndexTo(int row_group, const std::vector<ColumnOffsets> &rowgroup_offsets){
std::vector<int> row_groups = {row_group};
auto target_column_offsets = rowgroup_offsets[row_group];
int64_t total_rows = this->num_rows();
int64_t chunk_rows = this->RowGroup(0)->num_rows();
int64_t num_values = chunk_rows;
if (row_group >= total_rows / chunk_rows) {
// last page, set num_values to remainder
num_values = total_rows % chunk_rows;
}
this->set_column_offsets(target_column_offsets, num_values);
}

void FileMetaData::set_column_offsets(const ColumnOffsets& column_offsets, int64_t expected_num_rows) {

impl_->set_column_offsets(column_offsets, expected_num_rows);
}

void FileMetaData::WriteTo(::arrow::io::OutputStream* dst,
const std::shared_ptr<Encryptor>& encryptor) const {
return impl_->WriteTo(dst, encryptor);
9 changes: 9 additions & 0 deletions cpp/src/parquet/metadata.h
@@ -263,6 +263,8 @@ class PARQUET_EXPORT RowGroupMetaData {
};

class FileMetaDataBuilder;
class OffsetIndex;
typedef std::vector<std::shared_ptr<OffsetIndex>> ColumnOffsets;

/// \brief FileMetaData is a proxy around format::FileMetaData.
class PARQUET_EXPORT FileMetaData {
@@ -396,6 +398,13 @@ class PARQUET_EXPORT FileMetaData {
/// FileMetaData.
std::shared_ptr<FileMetaData> Subset(const std::vector<int>& row_groups) const;

/// \brief Return FileMetaData pointing to a single group
/// as created via page offsets
void IndexTo(int row_group, const std::vector<ColumnOffsets> &rowgroup_offsets);

/// \brief Override column chunk offsets with provided page offsets
void set_column_offsets(const ColumnOffsets& column_offsets, int64_t expected_num_rows);

private:
friend FileMetaDataBuilder;
friend class SerializedFile;
104 changes: 101 additions & 3 deletions cpp/src/parquet/page_index.cc
@@ -278,7 +278,7 @@ class RowGroupPageIndexReaderImpl : public RowGroupPageIndexReader {
encryption::kOffsetIndex);
}

-  return OffsetIndex::Make(offset_index_buffer_->data() + buffer_offset, length,
+  return OffsetIndex::Make(offset_index_buffer_->data() + buffer_offset, &length,
properties_, decryptor.get());
}

@@ -344,6 +344,54 @@ class RowGroupPageIndexReaderImpl : public RowGroupPageIndexReader {
std::shared_ptr<::arrow::Buffer> offset_index_buffer_;
};

/// Read offset index of a single column.
class OffsetIndexReader {
public:
OffsetIndexReader(
std::shared_ptr<RowGroupMetaData> row_group_metadata,
const ReaderProperties &properties,
InternalFileDecryptor *file_decryptor,
std::shared_ptr<::arrow::Buffer> offset_index_buffer)
:
row_group_metadata_(std::move(row_group_metadata)),
properties_(properties),
file_decryptor_(file_decryptor),
offset_index_buffer_(std::move(offset_index_buffer)){}

std::shared_ptr<OffsetIndex> GetOffsetIndex(int32_t col, int64_t buffer_offset, uint32_t estimated_length,
uint32_t *actual_length) {
auto col_chunk = row_group_metadata_->ColumnChunk(col);
uint32_t actual_len(estimated_length);

// Get decryptor of offset index if encrypted.
std::shared_ptr<Decryptor> decryptor =
GetColumnMetaDecryptor(col_chunk->crypto_metadata().get(), file_decryptor_);
if (decryptor != nullptr) {
UpdateDecryptor(decryptor, 0, /*column_ordinal=*/col,
encryption::kOffsetIndex);
}

auto offset_index = OffsetIndex::Make(offset_index_buffer_->data() + buffer_offset, &actual_len,
properties_, decryptor.get());

*actual_length = actual_len;
return offset_index;
}

private:
/// The row group metadata to get column chunk metadata.
std::shared_ptr<RowGroupMetaData> row_group_metadata_;

/// Reader properties used to deserialize thrift object.
const ReaderProperties &properties_;

/// File-level decryptor.
InternalFileDecryptor *file_decryptor_;

/// Buffer to hold the raw bytes of the page index.
std::shared_ptr<::arrow::Buffer> offset_index_buffer_;
};

class PageIndexReaderImpl : public PageIndexReader {
public:
PageIndexReaderImpl(::arrow::io::RandomAccessFile* input,
@@ -388,6 +436,56 @@ class PageIndexReaderImpl : public PageIndexReader {
return nullptr;
}

/// Method to read full set of OffsetIndex pages
/// Key feature is that this does not require a full set of metadata
/// Only rowgroup 0 metadata is needed.
std::vector<ColumnOffsets> GetAllOffsets() override{
std::shared_ptr<RowGroupMetaData> row_group_metadata = file_metadata_->RowGroup(0);
int32_t rowgroup_len = 0; // This rowgroup length is just an estimate, may vary by rowgroup
int64_t offset_index_start = -1;
int64_t total_rows = file_metadata_->num_rows();
int64_t chunk_rows = row_group_metadata->num_rows();
// Don't use row_group count from metadata since may be dummy with only rowgroup 0
int num_row_groups = ceil(static_cast<double>(total_rows) / static_cast<double>(chunk_rows));
int num_columns = file_metadata_->num_columns();
// TODO add methods to get offset_index_start and rowgroup_len directly
// This is because ColumnChunk creation is super expensive.
auto col_chunk = row_group_metadata->ColumnChunk(0);
auto offset_index_location = col_chunk->GetOffsetIndexLocation();
offset_index_start = offset_index_location->offset;
rowgroup_len = offset_index_location->length * num_columns;

// Retrieve 1.5x the estimated size to allow for variation in storing pages
// This is just a guess, but we can go over because metadata comes after offsets
// So, we can retrieve a slightly larger buffer here
float overhead_factor = 1.5;
int32_t est_offset_index_size = num_row_groups * rowgroup_len * overhead_factor;
std::shared_ptr<::arrow::Buffer> offset_index_buffer;
PARQUET_ASSIGN_OR_THROW(offset_index_buffer,
input_->ReadAt(offset_index_start,
est_offset_index_size));

// Perform a direct read against the buffer for performance
ThriftDeserializer deserializer(properties_);
uint32_t len_used(est_offset_index_size);
deserializer.SetInternalBuffer(const_cast<uint8_t*>(offset_index_buffer->data()), &len_used);

std::vector<ColumnOffsets> rowgroup_offsets;
rowgroup_offsets.reserve(num_row_groups);
format::OffsetIndex offset_index;
for (int rg = 0; rg < num_row_groups; ++rg) {
ColumnOffsets offset_indexes;
offset_indexes.reserve(num_columns);
for (int col = 0; col < num_columns; ++col) {
deserializer.DeserializeUnencryptedMessageUsingInternalBuffer(&offset_index);
auto offset_index_ptr = std::make_shared<OffsetIndexImpl>(offset_index);
offset_indexes.emplace_back(std::move(offset_index_ptr));
}
rowgroup_offsets.emplace_back(std::move(offset_indexes));
}
return rowgroup_offsets;
}

Author

What I would love to do is only read the OffsetIndex entries that we need. That is, just the rowgroups and column indexes that are required for a random access read. Sadly, I don't think that is possible because the OffsetIndex entries are written in thrift Compact protocol. This means that encoded page addresses may be of variable size. (ARGH!!). I think it would make a lot of sense to propose a parquet format enhancement where the index entries are written using Binary protocol with a fixed size. This would allow for random access and make data access A LOT faster.
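To make the variable-size problem concrete, here is a small standalone sketch (not part of this PR) of the zigzag-plus-varint encoding the Thrift Compact protocol uses for i64 fields such as PageLocation.offset. Because the wire size depends on the value, the k-th OffsetIndex entry cannot be located without decoding everything before it:

#include <cstdint>
#include <cstdio>

// Zigzag + LEB128 varint size, as the Thrift Compact protocol uses for i64
// fields. Returns how many bytes the value occupies on the wire.
int VarintSize(int64_t value) {
  uint64_t zz =
      (static_cast<uint64_t>(value) << 1) ^ static_cast<uint64_t>(value >> 63);
  int bytes = 1;
  while (zz >= 0x80) {
    zz >>= 7;
    ++bytes;
  }
  return bytes;
}

int main() {
  // A page offset near the start of a file vs. one deep inside a large file:
  std::printf("%d\n", VarintSize(4));             // prints 1
  std::printf("%d\n", VarintSize(3000000000LL));  // prints 5
  return 0;
}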

void WillNeed(const std::vector<int32_t>& row_group_indices,
const std::vector<int32_t>& column_indices,
const PageIndexSelection& selection) override {
@@ -908,13 +1006,13 @@ std::unique_ptr<ColumnIndex> ColumnIndex::Make(const ColumnDescriptor& descr,
}

std::unique_ptr<OffsetIndex> OffsetIndex::Make(const void* serialized_index,
-                                               uint32_t index_len,
+                                               uint32_t *index_len,
                                               const ReaderProperties& properties,
                                               Decryptor* decryptor) {
  format::OffsetIndex offset_index;
  ThriftDeserializer deserializer(properties);
  deserializer.DeserializeMessage(reinterpret_cast<const uint8_t*>(serialized_index),
-                                  &index_len, &offset_index, decryptor);
+                                  index_len, &offset_index, decryptor);
return std::make_unique<OffsetIndexImpl>(offset_index);
}
Author
I probably don't need this anymore. Originally I had to keep track of the bytes used as I moved through the buffer. But then, I added an improved method to the deserializer itself for speed.

