From 9aeb8b1dff2fa872b0276a6778dff05253c249da Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Wed, 25 May 2022 10:06:21 +0000 Subject: [PATCH 01/19] Support reading and writing column chunk key-value metadata --- cpp/src/parquet/column_writer.cc | 4 ++ cpp/src/parquet/column_writer.h | 7 +++ cpp/src/parquet/column_writer_test.cc | 54 +++++++++++++++++++ cpp/src/parquet/metadata.cc | 78 ++++++++++++++++++++------- cpp/src/parquet/metadata.h | 5 ++ cpp/src/parquet/printer.cc | 32 ++++++++--- 6 files changed, 153 insertions(+), 27 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index eae8fc6125499..4eb8965d5416b 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1405,6 +1405,10 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< return pages_change_on_record_boundaries_; } + KeyValueMetadata& key_value_metadata() override { + return metadata_->key_value_metadata(); + } + private: using ValueEncoderType = typename EncodingTraits::Encoder; using TypedStats = TypedStatistics; diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index a278670fa81c6..9a541b1e5e00f 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -29,6 +29,7 @@ namespace arrow { class Array; +class KeyValueMetadata; namespace bit_util { class BitWriter; @@ -53,6 +54,8 @@ class Encryptor; class OffsetIndexBuilder; class WriterProperties; +using KeyValueMetadata = ::arrow::KeyValueMetadata; + class PARQUET_EXPORT LevelEncoder { public: LevelEncoder(); @@ -181,6 +184,10 @@ class PARQUET_EXPORT ColumnWriter { /// \brief The file-level writer properties virtual const WriterProperties* properties() = 0; + /// \brief Return KeyValueMetadata that can be used to store key-value + /// metadata in ColumnChunkMetaData. + virtual KeyValueMetadata& key_value_metadata() = 0; + /// \brief Write Apache Arrow columnar data directly to ColumnWriter. Returns /// error status if the array data type is not compatible with the concrete /// writer type. diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index c99efd17961aa..d8f96c2c403ae 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -27,6 +27,7 @@ #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_builders.h" #include "arrow/util/config.h" +#include "arrow/util/key_value_metadata.h" #include "parquet/column_page.h" #include "parquet/column_reader.h" @@ -51,6 +52,9 @@ using schema::PrimitiveNode; namespace test { +using ::testing::IsNull; +using ::testing::NotNull; + // The default size used in most tests. const int SMALL_SIZE = 100; #ifdef PARQUET_VALGRIND @@ -385,6 +389,15 @@ class TestPrimitiveWriter : public PrimitiveTypedTest { return metadata_accessor->encoding_stats(); } + std::shared_ptr metadata_key_value_metadata() { + // Metadata accessor must be created lazily. + // This is because the ColumnChunkMetaData semantics dictate the metadata object is + // complete (no changes to the metadata buffer can be made after instantiation) + auto metadata_accessor = + ColumnChunkMetaData::Make(metadata_->contents(), this->descr_); + return metadata_accessor->key_value_metadata(); + } + protected: int64_t values_read_; // Keep the reader alive as for ByteArray the lifetime of the ByteArray @@ -1705,5 +1718,46 @@ TEST(TestColumnWriter, WriteDataPageV2HeaderNullCount) { } } +using TestInt32Writer = TestPrimitiveWriter; + +TEST_F(TestInt32Writer, NoWriteKeyValueMetadata) { + auto writer = this->BuildWriter(); + writer->Close(); + auto key_value_metadata = metadata_key_value_metadata(); + ASSERT_THAT(key_value_metadata, IsNull()); +} + +TEST_F(TestInt32Writer, WriteKeyValueMetadata) { + auto writer = this->BuildWriter(); + writer->key_value_metadata().Append("hello", "world"); + writer->Close(); + auto key_value_metadata = metadata_key_value_metadata(); + ASSERT_THAT(key_value_metadata, NotNull()); + ASSERT_THAT(key_value_metadata->size(), 1); + ASSERT_OK_AND_ASSIGN(auto value, key_value_metadata->Get("hello")); + ASSERT_THAT(value, "world"); +} + +TEST_F(TestInt32Writer, WriteKeyValueMetadataEndToEnd) { + auto sink = CreateOutputStream(); + { + auto file_writer = ParquetFileWriter::Open( + sink, std::dynamic_pointer_cast(schema_.schema_root())); + auto rg_writer = file_writer->AppendRowGroup(); + auto col_writer = rg_writer->NextColumn(); + col_writer->key_value_metadata().Append("foo", "bar"); + file_writer->Close(); + } + ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); + auto file_reader = + ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer)); + auto key_value_metadata = + file_reader->metadata()->RowGroup(0)->ColumnChunk(0)->key_value_metadata(); + ASSERT_THAT(key_value_metadata, NotNull()); + ASSERT_THAT(key_value_metadata->size(), 1); + ASSERT_OK_AND_ASSIGN(auto value, key_value_metadata->Get("foo")); + ASSERT_THAT(value, "bar"); +} + } // namespace test } // namespace parquet diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 3f101b5ae3ac6..89e8d539f54af 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -132,6 +132,31 @@ std::shared_ptr MakeColumnStats(const format::ColumnMetaData& meta_d throw ParquetException("Can't decode page statistics for selected column type"); } +template +std::shared_ptr CopyKeyValueMetadata(const Metadata& source) { + std::shared_ptr metadata = nullptr; + if (source.__isset.key_value_metadata) { + metadata = std::make_shared(); + for (const auto& it : source.key_value_metadata) { + metadata->Append(it.key, it.value); + } + } + return metadata; +} + +template +void SetKeyValueMetadata(Metadata& metadata, const KeyValueMetadata& source) { + metadata.key_value_metadata.clear(); + metadata.key_value_metadata.reserve(static_cast(source.size())); + for (int64_t i = 0; i < source.size(); ++i) { + format::KeyValue kv_pair; + kv_pair.__set_key(source.key(i)); + kv_pair.__set_value(source.value(i)); + metadata.key_value_metadata.push_back(kv_pair); + } + metadata.__isset.key_value_metadata = true; +} + // MetaData Accessor // ColumnCryptoMetaData @@ -230,6 +255,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { encoding_stats.count}); } possible_stats_ = nullptr; + InitKeyValueMetadata(); } bool Equals(const ColumnChunkMetaDataImpl& other) const { @@ -340,7 +366,15 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { return std::nullopt; } + const std::shared_ptr& key_value_metadata() const { + return key_value_metadata_; + } + private: + void InitKeyValueMetadata() { + key_value_metadata_ = CopyKeyValueMetadata(*column_metadata_); + } + mutable std::shared_ptr possible_stats_; std::vector encodings_; std::vector encoding_stats_; @@ -350,6 +384,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { const ColumnDescriptor* descr_; const ReaderProperties properties_; const ApplicationVersion* writer_version_; + std::shared_ptr key_value_metadata_; }; std::unique_ptr ColumnChunkMetaData::Make( @@ -468,6 +503,11 @@ bool ColumnChunkMetaData::Equals(const ColumnChunkMetaData& other) const { return impl_->Equals(*other.impl_); } +const std::shared_ptr& ColumnChunkMetaData::key_value_metadata() + const { + return impl_->key_value_metadata(); +} + // row-group metadata class RowGroupMetaData::RowGroupMetaDataImpl { public: @@ -863,16 +903,7 @@ class FileMetaData::FileMetaDataImpl { schema_.updateColumnOrders(column_orders); } - void InitKeyValueMetadata() { - std::shared_ptr metadata = nullptr; - if (metadata_->__isset.key_value_metadata) { - metadata = std::make_shared(); - for (const auto& it : metadata_->key_value_metadata) { - metadata->Append(it.key, it.value); - } - } - key_value_metadata_ = std::move(metadata); - } + void InitKeyValueMetadata() { key_value_metadata_ = CopyKeyValueMetadata(*metadata_); } }; std::shared_ptr FileMetaData::Make( @@ -1518,6 +1549,10 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { column_chunk_->meta_data.__set_encodings(std::move(thrift_encodings)); column_chunk_->meta_data.__set_encoding_stats(std::move(thrift_encoding_stats)); + if (key_value_metadata_) { + SetKeyValueMetadata(column_chunk_->meta_data, *key_value_metadata_); + } + const auto& encrypt_md = properties_->column_encryption_properties(column_->path()->ToDotString()); // column is encrypted @@ -1584,6 +1619,13 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { return column_chunk_->meta_data.total_compressed_size; } + KeyValueMetadata& key_value_metadata() { + if (!key_value_metadata_) { + key_value_metadata_ = std::make_unique(); + } + return *key_value_metadata_; + } + private: void Init(format::ColumnChunk* column_chunk) { column_chunk_ = column_chunk; @@ -1598,6 +1640,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { std::unique_ptr owned_column_chunk_; const std::shared_ptr properties_; const ColumnDescriptor* column_; + std::unique_ptr key_value_metadata_; }; std::unique_ptr ColumnChunkMetaDataBuilder::Make( @@ -1659,6 +1702,10 @@ int64_t ColumnChunkMetaDataBuilder::total_compressed_size() const { return impl_->total_compressed_size(); } +KeyValueMetadata& ColumnChunkMetaDataBuilder::key_value_metadata() { + return impl_->key_value_metadata(); +} + class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { public: explicit RowGroupMetaDataBuilderImpl(std::shared_ptr props, @@ -1853,16 +1900,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { } else if (key_value_metadata) { key_value_metadata_ = key_value_metadata_->Merge(*key_value_metadata); } - metadata_->key_value_metadata.clear(); - metadata_->key_value_metadata.reserve( - static_cast(key_value_metadata_->size())); - for (int64_t i = 0; i < key_value_metadata_->size(); ++i) { - format::KeyValue kv_pair; - kv_pair.__set_key(key_value_metadata_->key(i)); - kv_pair.__set_value(key_value_metadata_->value(i)); - metadata_->key_value_metadata.push_back(std::move(kv_pair)); - } - metadata_->__isset.key_value_metadata = true; + SetKeyValueMetadata(*metadata_, *key_value_metadata_); } int32_t file_version = 0; diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 640b898024346..323280ea8c6de 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -179,6 +179,7 @@ class PARQUET_EXPORT ColumnChunkMetaData { std::unique_ptr crypto_metadata() const; std::optional GetColumnIndexLocation() const; std::optional GetOffsetIndexLocation() const; + const std::shared_ptr& key_value_metadata() const; private: explicit ColumnChunkMetaData( @@ -452,8 +453,12 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { // column chunk // Used when a dataset is spread across multiple files void set_file_path(const std::string& path); + // column metadata void SetStatistics(const EncodedStatistics& stats); + + KeyValueMetadata& key_value_metadata(); + // get the column descriptor const ColumnDescriptor* descr() const; diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index ce194f897e44d..a738ee0eb7988 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -64,6 +64,25 @@ void PrintPageEncodingStats(std::ostream& stream, // the fixed initial size is just for an example #define COL_WIDTH 30 +void Put(std::ostream& stream, char c, int n) { + for (int i = 0; i < n; ++i) { + stream.put(c); + } +} + +void PrintKeyValueMetadata(std::ostream& stream, + const KeyValueMetadata& key_value_metadata, + int indent_level = 0, int indent_width = 1) { + const int64_t size_of_key_value_metadata = key_value_metadata.size(); + Put(stream, ' ', indent_level * indent_width); + stream << "Key Value Metadata: " << size_of_key_value_metadata << " entries\n"; + for (int64_t i = 0; i < size_of_key_value_metadata; i++) { + Put(stream, ' ', (indent_level + 1) * indent_width); + stream << "Key nr " << i << " " << key_value_metadata.key(i) << ": " + << key_value_metadata.value(i) << "\n"; + } +} + void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selected_columns, bool print_values, bool format_dump, bool print_key_value_metadata, const char* filename) { @@ -76,12 +95,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte if (print_key_value_metadata && file_metadata->key_value_metadata()) { auto key_value_metadata = file_metadata->key_value_metadata(); - int64_t size_of_key_value_metadata = key_value_metadata->size(); - stream << "Key Value File Metadata: " << size_of_key_value_metadata << " entries\n"; - for (int64_t i = 0; i < size_of_key_value_metadata; i++) { - stream << " Key nr " << i << " " << key_value_metadata->key(i) << ": " - << key_value_metadata->value(i) << "\n"; - } + PrintKeyValueMetadata(stream, *key_value_metadata); } stream << "Number of RowGroups: " << file_metadata->num_row_groups() << "\n"; @@ -136,7 +150,11 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list selecte std::shared_ptr stats = column_chunk->statistics(); const ColumnDescriptor* descr = file_metadata->schema()->Column(i); - stream << "Column " << i << std::endl << " Values: " << column_chunk->num_values(); + stream << "Column " << i << std::endl; + if (print_key_value_metadata && column_chunk->key_value_metadata()) { + PrintKeyValueMetadata(stream, *column_chunk->key_value_metadata(), 1, 2); + } + stream << " Values: " << column_chunk->num_values(); if (column_chunk->is_stats_set()) { std::string min = stats->EncodeMin(), max = stats->EncodeMax(); stream << ", Null Values: " << stats->null_count() From 408270aafa7b706959743a69b9126be25c0f055d Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Wed, 8 May 2024 01:20:02 +0000 Subject: [PATCH 02/19] Python support --- python/pyarrow/_parquet.pxd | 1 + python/pyarrow/_parquet.pyx | 13 +++++++++++++ .../column-chunk-key-value-metadata.parquet | Bin 0 -> 236 bytes python/pyarrow/tests/parquet/test_metadata.py | 7 +++++++ 4 files changed, 21 insertions(+) create mode 100644 python/pyarrow/tests/data/parquet/column-chunk-key-value-metadata.parquet diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd index ae4094d8b4b5f..c80351821b3fd 100644 --- a/python/pyarrow/_parquet.pxd +++ b/python/pyarrow/_parquet.pxd @@ -327,6 +327,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: unique_ptr[CColumnCryptoMetaData] crypto_metadata() const optional[ParquetIndexLocation] GetColumnIndexLocation() const optional[ParquetIndexLocation] GetOffsetIndexLocation() const + shared_ptr[const CKeyValueMetadata] key_value_metadata() const struct CSortingColumn" parquet::SortingColumn": int column_idx diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 7bc68a288aa78..6c06bde6e9bb1 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -507,6 +507,19 @@ cdef class ColumnChunkMetaData(_Weakrefable): """Whether the column chunk has a column index""" return self.metadata.GetColumnIndexLocation().has_value() + @property + def metadata(self): + """Additional metadata as key value pairs (dict[bytes, bytes]).""" + cdef: + unordered_map[c_string, c_string] metadata + const CKeyValueMetadata* underlying_metadata + underlying_metadata = self.metadata.key_value_metadata().get() + if underlying_metadata != NULL: + underlying_metadata.ToUnorderedMap(&metadata) + return metadata + else: + return None + cdef class SortingColumn: """ diff --git a/python/pyarrow/tests/data/parquet/column-chunk-key-value-metadata.parquet b/python/pyarrow/tests/data/parquet/column-chunk-key-value-metadata.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f4e76b9814a74ee8eba7fbdc00cb28e253f30c99 GIT binary patch literal 236 zcmWG=3^EjD5oHi%@BtA*3=C>2GNMe9stjzB5**3-IigaCRGN65kTdVK>g&J4zwL; zotTUW1A~TUL1Iy1X=;gXazTM^Vo_0kxk6cLQE_H|o`Rvdo`If$Zm^$YK(L2@h@^}R M&}FlLH~{ER0F)dmp8x;= literal 0 HcmV?d00001 diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index bf186bd923c4f..2fc67159c88da 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -772,3 +772,10 @@ def test_write_metadata_fs_file_combinations(tempdir, s3_example_s3fs): assert meta1.read_bytes() == meta2.read_bytes() \ == meta3.read_bytes() == meta4.read_bytes() \ == s3_fs.open(meta5).read() + + +def test_column_chunk_key_value_metadata(datadir): + metadata = pq.read_metadata(datadir / 'column-chunk-key-value-metadata.parquet') + key_value_metadata = metadata.row_group(0).column(0).metadata + print(key_value_metadata) + assert key_value_metadata[b'foo'] == b'bar' From 008d6cb971e4c10ed246998ca512204456606dd7 Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Wed, 29 May 2024 02:26:24 +0000 Subject: [PATCH 03/19] Move the test file to parquet-testing --- .../column-chunk-key-value-metadata.parquet | Bin 236 -> 0 bytes python/pyarrow/tests/parquet/conftest.py | 12 ++++++++++++ python/pyarrow/tests/parquet/test_metadata.py | 4 ++-- 3 files changed, 14 insertions(+), 2 deletions(-) delete mode 100644 python/pyarrow/tests/data/parquet/column-chunk-key-value-metadata.parquet diff --git a/python/pyarrow/tests/data/parquet/column-chunk-key-value-metadata.parquet b/python/pyarrow/tests/data/parquet/column-chunk-key-value-metadata.parquet deleted file mode 100644 index f4e76b9814a74ee8eba7fbdc00cb28e253f30c99..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 236 zcmWG=3^EjD5oHi%@BtA*3=C>2GNMe9stjzB5**3-IigaCRGN65kTdVK>g&J4zwL; zotTUW1A~TUL1Iy1X=;gXazTM^Vo_0kxk6cLQE_H|o`Rvdo`If$Zm^$YK(L2@h@^}R M&}FlLH~{ER0F)dmp8x;= diff --git a/python/pyarrow/tests/parquet/conftest.py b/python/pyarrow/tests/parquet/conftest.py index 767e7f6b69d07..80605e973cda8 100644 --- a/python/pyarrow/tests/parquet/conftest.py +++ b/python/pyarrow/tests/parquet/conftest.py @@ -15,6 +15,9 @@ # specific language governing permissions and limitations # under the License. +import os +import pathlib + import pytest from pyarrow.util import guid @@ -25,6 +28,15 @@ def datadir(base_datadir): return base_datadir / 'parquet' +@pytest.fixture(scope='module') +def parquet_test_datadir(): + result = os.environ.get('PARQUET_TEST_DATA') + if not result: + raise RuntimeError('Please point the PARQUET_TEST_DATA environment ' + 'variable to the test data directory') + return pathlib.Path(result) + + @pytest.fixture def s3_bucket(s3_server): boto3 = pytest.importorskip('boto3') diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 2fc67159c88da..0a5736eb71a27 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -774,8 +774,8 @@ def test_write_metadata_fs_file_combinations(tempdir, s3_example_s3fs): == s3_fs.open(meta5).read() -def test_column_chunk_key_value_metadata(datadir): - metadata = pq.read_metadata(datadir / 'column-chunk-key-value-metadata.parquet') +def test_column_chunk_key_value_metadata(parquet_test_datadir): + metadata = pq.read_metadata(parquet_test_datadir / 'column-chunk-key-value-metadata.parquet') key_value_metadata = metadata.row_group(0).column(0).metadata print(key_value_metadata) assert key_value_metadata[b'foo'] == b'bar' From d23647ec34bc412551ec45d453349123643c7be5 Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Wed, 29 May 2024 03:41:08 +0000 Subject: [PATCH 04/19] Address review comments --- cpp/src/parquet/column_writer.cc | 22 ++++++++++++++++++++-- cpp/src/parquet/column_writer.h | 13 ++++++++++--- cpp/src/parquet/column_writer_test.cc | 27 +++++++++++++++++++++++++-- cpp/src/parquet/metadata.cc | 24 +++++++++++------------- cpp/src/parquet/metadata.h | 2 +- 5 files changed, 67 insertions(+), 21 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 4eb8965d5416b..854e054124337 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -40,6 +40,7 @@ #include "arrow/util/crc32.h" #include "arrow/util/endian.h" #include "arrow/util/float16.h" +#include "arrow/util/key_value_metadata.h" #include "arrow/util/logging.h" #include "arrow/util/rle_encoding.h" #include "arrow/util/type_traits.h" @@ -836,6 +837,7 @@ class ColumnWriterImpl { void FlushBufferedDataPages(); ColumnChunkMetaDataBuilder* metadata_; + std::shared_ptr key_value_metadata_; const ColumnDescriptor* descr_; // scratch buffer if validity bits need to be recalculated. std::shared_ptr bits_buffer_; @@ -1104,6 +1106,7 @@ int64_t ColumnWriterImpl::Close() { if (rows_written_ > 0 && chunk_statistics.is_set()) { metadata_->SetStatistics(chunk_statistics); } + metadata_->SetKeyValueMetadata(std::move(key_value_metadata_)); pager_->Close(has_dictionary_, fallback_); } @@ -1405,8 +1408,23 @@ class TypedColumnWriterImpl : public ColumnWriterImpl, public TypedColumnWriter< return pages_change_on_record_boundaries_; } - KeyValueMetadata& key_value_metadata() override { - return metadata_->key_value_metadata(); + void AddKeyValueMetadata( + const std::shared_ptr& key_value_metadata) override { + if (closed_) { + throw ParquetException("Cannot add key-value metadata to closed column"); + } + if (key_value_metadata_ == nullptr) { + key_value_metadata_ = key_value_metadata; + } else if (key_value_metadata != nullptr) { + key_value_metadata_ = key_value_metadata_->Merge(*key_value_metadata); + } + } + + void ResetKeyValueMetadata() override { + if (closed_) { + throw ParquetException("Cannot add key-value metadata to closed column"); + } + key_value_metadata_ = nullptr; } private: diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 9a541b1e5e00f..1e92bbee642c1 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -184,9 +184,16 @@ class PARQUET_EXPORT ColumnWriter { /// \brief The file-level writer properties virtual const WriterProperties* properties() = 0; - /// \brief Return KeyValueMetadata that can be used to store key-value - /// metadata in ColumnChunkMetaData. - virtual KeyValueMetadata& key_value_metadata() = 0; + /// \brief Add key-value metadata to the ColumnChunk. + /// \param[in] key_value_metadata the metadata to add. + /// \note This will overwrite any existing metadata with the same key. + /// \throw ParquetException if Close() has been called. + virtual void AddKeyValueMetadata( + const std::shared_ptr& key_value_metadata) = 0; + + /// \brief Reset the ColumnChunk key-value metadata. + /// \throw ParquetException if Close() has been called. + virtual void ResetKeyValueMetadata() = 0; /// \brief Write Apache Arrow columnar data directly to ColumnWriter. Returns /// error status if the array data type is not compatible with the concrete diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index d8f96c2c403ae..22bda1012ecb0 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -23,6 +23,7 @@ #include #include "arrow/io/buffered.h" +#include "arrow/io/file.h" #include "arrow/testing/gtest_util.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_builders.h" @@ -1729,7 +1730,7 @@ TEST_F(TestInt32Writer, NoWriteKeyValueMetadata) { TEST_F(TestInt32Writer, WriteKeyValueMetadata) { auto writer = this->BuildWriter(); - writer->key_value_metadata().Append("hello", "world"); + writer->AddKeyValueMetadata(KeyValueMetadata::Make({"hello"}, {"world"})); writer->Close(); auto key_value_metadata = metadata_key_value_metadata(); ASSERT_THAT(key_value_metadata, NotNull()); @@ -1745,7 +1746,7 @@ TEST_F(TestInt32Writer, WriteKeyValueMetadataEndToEnd) { sink, std::dynamic_pointer_cast(schema_.schema_root())); auto rg_writer = file_writer->AppendRowGroup(); auto col_writer = rg_writer->NextColumn(); - col_writer->key_value_metadata().Append("foo", "bar"); + col_writer->AddKeyValueMetadata(KeyValueMetadata::Make({"foo"}, {"bar"})); file_writer->Close(); } ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish()); @@ -1761,3 +1762,25 @@ TEST_F(TestInt32Writer, WriteKeyValueMetadataEndToEnd) { } // namespace test } // namespace parquet + +// TEST(CreateTestFile, Create) { +// PARQUET_ASSIGN_OR_THROW( +// auto sink, arrow::io::FileOutputStream::Open( +// "column-chunk-key-value-metadata.parquet")); +// auto writer = parquet::ParquetFileWriter::Open( +// sink, std::static_pointer_cast( +// parquet::schema::GroupNode::Make( +// "schema", parquet::Repetition::REQUIRED, +// {parquet::schema::PrimitiveNode::Make( +// "column1", parquet::Repetition::OPTIONAL, +// parquet::Type::INT32), +// parquet::schema::PrimitiveNode::Make( +// "column2", parquet::Repetition::OPTIONAL, +// parquet::Type::INT32) +// }))); +// auto rg_writer = writer->AppendRowGroup(); +// rg_writer->NextColumn() +// ->key_value_metadata() +// .Append("foo", "bar"); +// rg_writer->NextColumn(); +// } diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 89e8d539f54af..a6c02cfec499f 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -145,7 +145,7 @@ std::shared_ptr CopyKeyValueMetadata(const Metadata& source) { } template -void SetKeyValueMetadata(Metadata& metadata, const KeyValueMetadata& source) { +void ToThriftKeyValueMetadata(Metadata& metadata, const KeyValueMetadata& source) { metadata.key_value_metadata.clear(); metadata.key_value_metadata.reserve(static_cast(source.size())); for (int64_t i = 0; i < source.size(); ++i) { @@ -1550,7 +1550,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { column_chunk_->meta_data.__set_encoding_stats(std::move(thrift_encoding_stats)); if (key_value_metadata_) { - SetKeyValueMetadata(column_chunk_->meta_data, *key_value_metadata_); + ToThriftKeyValueMetadata(column_chunk_->meta_data, *key_value_metadata_); } const auto& encrypt_md = @@ -1619,11 +1619,8 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { return column_chunk_->meta_data.total_compressed_size; } - KeyValueMetadata& key_value_metadata() { - if (!key_value_metadata_) { - key_value_metadata_ = std::make_unique(); - } - return *key_value_metadata_; + void SetKeyValueMetadata(std::shared_ptr key_value_metadata) { + key_value_metadata_ = std::move(key_value_metadata); } private: @@ -1640,7 +1637,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { std::unique_ptr owned_column_chunk_; const std::shared_ptr properties_; const ColumnDescriptor* column_; - std::unique_ptr key_value_metadata_; + std::shared_ptr key_value_metadata_; }; std::unique_ptr ColumnChunkMetaDataBuilder::Make( @@ -1698,12 +1695,13 @@ void ColumnChunkMetaDataBuilder::SetStatistics(const EncodedStatistics& result) impl_->SetStatistics(result); } -int64_t ColumnChunkMetaDataBuilder::total_compressed_size() const { - return impl_->total_compressed_size(); +void ColumnChunkMetaDataBuilder::SetKeyValueMetadata( + std::shared_ptr key_value_metadata) { + impl_->SetKeyValueMetadata(key_value_metadata); } -KeyValueMetadata& ColumnChunkMetaDataBuilder::key_value_metadata() { - return impl_->key_value_metadata(); +int64_t ColumnChunkMetaDataBuilder::total_compressed_size() const { + return impl_->total_compressed_size(); } class RowGroupMetaDataBuilder::RowGroupMetaDataBuilderImpl { @@ -1900,7 +1898,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { } else if (key_value_metadata) { key_value_metadata_ = key_value_metadata_->Merge(*key_value_metadata); } - SetKeyValueMetadata(*metadata_, *key_value_metadata_); + ToThriftKeyValueMetadata(*metadata_, *key_value_metadata_); } int32_t file_version = 0; diff --git a/cpp/src/parquet/metadata.h b/cpp/src/parquet/metadata.h index 323280ea8c6de..09c4d727d643c 100644 --- a/cpp/src/parquet/metadata.h +++ b/cpp/src/parquet/metadata.h @@ -457,7 +457,7 @@ class PARQUET_EXPORT ColumnChunkMetaDataBuilder { // column metadata void SetStatistics(const EncodedStatistics& stats); - KeyValueMetadata& key_value_metadata(); + void SetKeyValueMetadata(std::shared_ptr key_value_metadata); // get the column descriptor const ColumnDescriptor* descr() const; From db6b50da698b5e33ac09fb3119bef90f4ebf79ad Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Wed, 29 May 2024 03:45:57 +0000 Subject: [PATCH 05/19] Update Python test --- python/pyarrow/tests/parquet/test_metadata.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 0a5736eb71a27..c5ca3433e235b 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -775,7 +775,8 @@ def test_write_metadata_fs_file_combinations(tempdir, s3_example_s3fs): def test_column_chunk_key_value_metadata(parquet_test_datadir): - metadata = pq.read_metadata(parquet_test_datadir / 'column-chunk-key-value-metadata.parquet') - key_value_metadata = metadata.row_group(0).column(0).metadata - print(key_value_metadata) - assert key_value_metadata[b'foo'] == b'bar' + metadata = pq.read_metadata(parquet_test_datadir / 'column_chunk_key_value_metadata.parquet') + key_value_metadata1 = metadata.row_group(0).column(0).metadata + assert key_value_metadata1 == {b'foo': b'bar'} + key_value_metadata2 = metadata.row_group(0).column(1).metadata + assert key_value_metadata2 is None From d9205648587c532a55fa40f4ae42db6c0e9ae6af Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Wed, 29 May 2024 03:54:22 +0000 Subject: [PATCH 06/19] Address review comment --- python/pyarrow/_parquet.pyx | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 6c06bde6e9bb1..45873c5697eb8 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -510,15 +510,11 @@ cdef class ColumnChunkMetaData(_Weakrefable): @property def metadata(self): """Additional metadata as key value pairs (dict[bytes, bytes]).""" - cdef: - unordered_map[c_string, c_string] metadata - const CKeyValueMetadata* underlying_metadata - underlying_metadata = self.metadata.key_value_metadata().get() - if underlying_metadata != NULL: - underlying_metadata.ToUnorderedMap(&metadata) - return metadata + wrapped = pyarrow_wrap_metadata(self.metadata.key_value_metadata()) + if wrapped is not None: + return wrapped.to_dict() else: - return None + return wrapped cdef class SortingColumn: From 5a5c242e589b3951423d832ab0e28a7af0e1f99d Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Wed, 29 May 2024 04:01:18 +0000 Subject: [PATCH 07/19] Address review comment --- cpp/src/parquet/column_writer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 1e92bbee642c1..23e2b2a3b1bf8 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -21,6 +21,7 @@ #include #include +#include "arrow/type_fwd.h" #include "arrow/util/compression.h" #include "parquet/exception.h" #include "parquet/platform.h" @@ -29,7 +30,6 @@ namespace arrow { class Array; -class KeyValueMetadata; namespace bit_util { class BitWriter; From 4379c745f6e6d481ae10049fc0f39c788f75c69e Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Wed, 29 May 2024 04:36:54 +0000 Subject: [PATCH 08/19] Address review comment --- cpp/src/parquet/metadata.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index a6c02cfec499f..41b279403504d 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -146,15 +146,15 @@ std::shared_ptr CopyKeyValueMetadata(const Metadata& source) { template void ToThriftKeyValueMetadata(Metadata& metadata, const KeyValueMetadata& source) { - metadata.key_value_metadata.clear(); - metadata.key_value_metadata.reserve(static_cast(source.size())); + std::vector key_value_metadata; + key_value_metadata.reserve(static_cast(source.size())); for (int64_t i = 0; i < source.size(); ++i) { format::KeyValue kv_pair; kv_pair.__set_key(source.key(i)); kv_pair.__set_value(source.value(i)); - metadata.key_value_metadata.push_back(kv_pair); + key_value_metadata.emplace_back(kv_pair); } - metadata.__isset.key_value_metadata = true; + metadata.__set_key_value_metadata(key_value_metadata); } // MetaData Accessor From eb5cfa2e56868dba9956ede767da5cdeaf9aa415 Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Wed, 29 May 2024 04:37:39 +0000 Subject: [PATCH 09/19] Revert accidentally committed change --- cpp/src/parquet/column_writer_test.cc | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 22bda1012ecb0..c88baeaccd927 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -1762,25 +1762,3 @@ TEST_F(TestInt32Writer, WriteKeyValueMetadataEndToEnd) { } // namespace test } // namespace parquet - -// TEST(CreateTestFile, Create) { -// PARQUET_ASSIGN_OR_THROW( -// auto sink, arrow::io::FileOutputStream::Open( -// "column-chunk-key-value-metadata.parquet")); -// auto writer = parquet::ParquetFileWriter::Open( -// sink, std::static_pointer_cast( -// parquet::schema::GroupNode::Make( -// "schema", parquet::Repetition::REQUIRED, -// {parquet::schema::PrimitiveNode::Make( -// "column1", parquet::Repetition::OPTIONAL, -// parquet::Type::INT32), -// parquet::schema::PrimitiveNode::Make( -// "column2", parquet::Repetition::OPTIONAL, -// parquet::Type::INT32) -// }))); -// auto rg_writer = writer->AppendRowGroup(); -// rg_writer->NextColumn() -// ->key_value_metadata() -// .Append("foo", "bar"); -// rg_writer->NextColumn(); -// } From 3179912fb67e65d487c43fcdcab434de0f8c29b3 Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Wed, 29 May 2024 04:48:59 +0000 Subject: [PATCH 10/19] Add test case for ResetKeyValueMetadata --- cpp/src/parquet/column_writer_test.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index c88baeaccd927..1d4b902e73c31 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -1739,6 +1739,15 @@ TEST_F(TestInt32Writer, WriteKeyValueMetadata) { ASSERT_THAT(value, "world"); } +TEST_F(TestInt32Writer, ResetKeyValueMetadata) { + auto writer = this->BuildWriter(); + writer->AddKeyValueMetadata(KeyValueMetadata::Make({"hello"}, {"world"})); + writer->ResetKeyValueMetadata(); + writer->Close(); + auto key_value_metadata = metadata_key_value_metadata(); + ASSERT_THAT(key_value_metadata, IsNull()); +} + TEST_F(TestInt32Writer, WriteKeyValueMetadataEndToEnd) { auto sink = CreateOutputStream(); { From 197ef0fb6642d1debba1546ddbd329af31307496 Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Sun, 14 Jul 2024 22:42:44 +0000 Subject: [PATCH 11/19] Remove using --- cpp/src/parquet/column_writer.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cpp/src/parquet/column_writer.h b/cpp/src/parquet/column_writer.h index 23e2b2a3b1bf8..845bf9aa896bd 100644 --- a/cpp/src/parquet/column_writer.h +++ b/cpp/src/parquet/column_writer.h @@ -54,8 +54,6 @@ class Encryptor; class OffsetIndexBuilder; class WriterProperties; -using KeyValueMetadata = ::arrow::KeyValueMetadata; - class PARQUET_EXPORT LevelEncoder { public: LevelEncoder(); @@ -189,7 +187,7 @@ class PARQUET_EXPORT ColumnWriter { /// \note This will overwrite any existing metadata with the same key. /// \throw ParquetException if Close() has been called. virtual void AddKeyValueMetadata( - const std::shared_ptr& key_value_metadata) = 0; + const std::shared_ptr& key_value_metadata) = 0; /// \brief Reset the ColumnChunk key-value metadata. /// \throw ParquetException if Close() has been called. From de099d8c601be3cc2e25a55d82e69f658bff67d3 Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Sun, 14 Jul 2024 23:01:50 +0000 Subject: [PATCH 12/19] Address review comment --- cpp/src/parquet/column_writer.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 854e054124337..daf918e99d54f 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -1106,7 +1106,7 @@ int64_t ColumnWriterImpl::Close() { if (rows_written_ > 0 && chunk_statistics.is_set()) { metadata_->SetStatistics(chunk_statistics); } - metadata_->SetKeyValueMetadata(std::move(key_value_metadata_)); + metadata_->SetKeyValueMetadata(key_value_metadata_); pager_->Close(has_dictionary_, fallback_); } From cd5130e33f7009d8fd45b53665791057e8a98b96 Mon Sep 17 00:00:00 2001 From: Chungmin Lee Date: Sun, 14 Jul 2024 23:44:30 +0000 Subject: [PATCH 13/19] Update test --- python/pyarrow/tests/parquet/test_metadata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index c5ca3433e235b..966f40875f610 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -777,6 +777,6 @@ def test_write_metadata_fs_file_combinations(tempdir, s3_example_s3fs): def test_column_chunk_key_value_metadata(parquet_test_datadir): metadata = pq.read_metadata(parquet_test_datadir / 'column_chunk_key_value_metadata.parquet') key_value_metadata1 = metadata.row_group(0).column(0).metadata - assert key_value_metadata1 == {b'foo': b'bar'} + assert key_value_metadata1 == {b'foo': b'bar', b'thisiskeywithoutvalue': b''} key_value_metadata2 = metadata.row_group(0).column(1).metadata assert key_value_metadata2 is None From ef4ae5b7db48b2fd92597a1d2703486bd4a9138b Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 3 Aug 2024 22:54:03 +0800 Subject: [PATCH 14/19] fix cpp comments --- cpp/src/parquet/column_writer_test.cc | 8 ++++---- cpp/src/parquet/metadata.cc | 19 ++++++++++++------- cpp/src/parquet/printer.cc | 6 +++--- 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 1d4b902e73c31..882a6fcb166e8 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -1734,9 +1734,9 @@ TEST_F(TestInt32Writer, WriteKeyValueMetadata) { writer->Close(); auto key_value_metadata = metadata_key_value_metadata(); ASSERT_THAT(key_value_metadata, NotNull()); - ASSERT_THAT(key_value_metadata->size(), 1); + ASSERT_EQ(1, key_value_metadata->size()); ASSERT_OK_AND_ASSIGN(auto value, key_value_metadata->Get("hello")); - ASSERT_THAT(value, "world"); + ASSERT_EQ("world", value); } TEST_F(TestInt32Writer, ResetKeyValueMetadata) { @@ -1764,9 +1764,9 @@ TEST_F(TestInt32Writer, WriteKeyValueMetadataEndToEnd) { auto key_value_metadata = file_reader->metadata()->RowGroup(0)->ColumnChunk(0)->key_value_metadata(); ASSERT_THAT(key_value_metadata, NotNull()); - ASSERT_THAT(key_value_metadata->size(), 1); + ASSERT_EQ(1U, key_value_metadata->size()); ASSERT_OK_AND_ASSIGN(auto value, key_value_metadata->Get("foo")); - ASSERT_THAT(value, "bar"); + ASSERT_EQ("bar", value); } } // namespace test diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 6d96215248b84..9ac0c90ebd03d 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -139,25 +139,30 @@ template std::shared_ptr CopyKeyValueMetadata(const Metadata& source) { std::shared_ptr metadata = nullptr; if (source.__isset.key_value_metadata) { - metadata = std::make_shared(); + std::vector keys; + std::vector values; + keys.reserve(source.key_value_metadata.size()); + values.reserve(source.key_value_metadata.size()); for (const auto& it : source.key_value_metadata) { - metadata->Append(it.key, it.value); + keys.push_back(it.key); + values.push_back(it.value); } + metadata = std::make_shared(std::move(keys), std::move(values)); } return metadata; } template -void ToThriftKeyValueMetadata(Metadata& metadata, const KeyValueMetadata& source) { +void ToThriftKeyValueMetadata(const KeyValueMetadata& source, Metadata* metadata) { std::vector key_value_metadata; key_value_metadata.reserve(static_cast(source.size())); for (int64_t i = 0; i < source.size(); ++i) { format::KeyValue kv_pair; kv_pair.__set_key(source.key(i)); kv_pair.__set_value(source.value(i)); - key_value_metadata.emplace_back(kv_pair); + key_value_metadata.emplace_back(std::move(kv_pair)); } - metadata.__set_key_value_metadata(key_value_metadata); + metadata->__set_key_value_metadata(std::move(key_value_metadata)); } // MetaData Accessor @@ -1622,7 +1627,7 @@ class ColumnChunkMetaDataBuilder::ColumnChunkMetaDataBuilderImpl { column_chunk_->meta_data.__set_encoding_stats(std::move(thrift_encoding_stats)); if (key_value_metadata_) { - ToThriftKeyValueMetadata(column_chunk_->meta_data, *key_value_metadata_); + ToThriftKeyValueMetadata(*key_value_metadata_, &column_chunk_->meta_data); } const auto& encrypt_md = @@ -1970,7 +1975,7 @@ class FileMetaDataBuilder::FileMetaDataBuilderImpl { } else if (key_value_metadata) { key_value_metadata_ = key_value_metadata_->Merge(*key_value_metadata); } - ToThriftKeyValueMetadata(*metadata_, *key_value_metadata_); + ToThriftKeyValueMetadata(*key_value_metadata_, metadata_.get()); } int32_t file_version = 0; diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc index 5ab0c872e4567..60adfc697f95c 100644 --- a/cpp/src/parquet/printer.cc +++ b/cpp/src/parquet/printer.cc @@ -64,7 +64,7 @@ void PrintPageEncodingStats(std::ostream& stream, // the fixed initial size is just for an example #define COL_WIDTH 30 -void Put(std::ostream& stream, char c, int n) { +void PutChars(std::ostream& stream, char c, int n) { for (int i = 0; i < n; ++i) { stream.put(c); } @@ -74,10 +74,10 @@ void PrintKeyValueMetadata(std::ostream& stream, const KeyValueMetadata& key_value_metadata, int indent_level = 0, int indent_width = 1) { const int64_t size_of_key_value_metadata = key_value_metadata.size(); - Put(stream, ' ', indent_level * indent_width); + PutChars(stream, ' ', indent_level * indent_width); stream << "Key Value Metadata: " << size_of_key_value_metadata << " entries\n"; for (int64_t i = 0; i < size_of_key_value_metadata; i++) { - Put(stream, ' ', (indent_level + 1) * indent_width); + PutChars(stream, ' ', (indent_level + 1) * indent_width); stream << "Key nr " << i << " " << key_value_metadata.key(i) << ": " << key_value_metadata.value(i) << "\n"; } From 807b01f385da176182589c79890ca3f052a7d3b9 Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 3 Aug 2024 22:54:12 +0800 Subject: [PATCH 15/19] trying to fix python lint --- python/pyarrow/tests/parquet/test_metadata.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/tests/parquet/test_metadata.py b/python/pyarrow/tests/parquet/test_metadata.py index 1e19b4110b719..23f34aae06a88 100644 --- a/python/pyarrow/tests/parquet/test_metadata.py +++ b/python/pyarrow/tests/parquet/test_metadata.py @@ -785,7 +785,8 @@ def test_write_metadata_fs_file_combinations(tempdir, s3_example_s3fs): def test_column_chunk_key_value_metadata(parquet_test_datadir): - metadata = pq.read_metadata(parquet_test_datadir / 'column_chunk_key_value_metadata.parquet') + metadata = pq.read_metadata(parquet_test_datadir / + 'column_chunk_key_value_metadata.parquet') key_value_metadata1 = metadata.row_group(0).column(0).metadata assert key_value_metadata1 == {b'foo': b'bar', b'thisiskeywithoutvalue': b''} key_value_metadata2 = metadata.row_group(0).column(1).metadata From e5f0b9dc04255fd93c2a7beaf25340b3c96789c1 Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 3 Aug 2024 22:55:00 +0800 Subject: [PATCH 16/19] update submodules --- cpp/submodules/parquet-testing | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/submodules/parquet-testing b/cpp/submodules/parquet-testing index 74278bc4a1122..9b48ff4f94dc5 160000 --- a/cpp/submodules/parquet-testing +++ b/cpp/submodules/parquet-testing @@ -1 +1 @@ -Subproject commit 74278bc4a1122d74945969e6dec405abd1533ec3 +Subproject commit 9b48ff4f94dc5e89592d46a119884dbb88100884 From 1faf6698ff0f2bebb4af92476e3a185b975844a0 Mon Sep 17 00:00:00 2001 From: mwish Date: Sat, 3 Aug 2024 23:03:53 +0800 Subject: [PATCH 17/19] update some non-move impl --- cpp/src/parquet/column_writer.cc | 2 ++ cpp/src/parquet/metadata.cc | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index 5d41741bb1e88..b5ab21f807a42 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -838,6 +838,8 @@ class ColumnWriterImpl { void FlushBufferedDataPages(); ColumnChunkMetaDataBuilder* metadata_; + // key_value_metadata_ for the column chunk + // It would be nullptr if there is no KeyValueMetadata set. std::shared_ptr key_value_metadata_; const ColumnDescriptor* descr_; // scratch buffer if validity bits need to be recalculated. diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index 9ac0c90ebd03d..a192974df3127 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -958,7 +958,7 @@ class FileMetaData::FileMetaDataImpl { std::vector column_orders; if (metadata_->__isset.column_orders) { column_orders.reserve(metadata_->column_orders.size()); - for (auto column_order : metadata_->column_orders) { + for (auto& column_order : metadata_->column_orders) { if (column_order.__isset.TYPE_ORDER) { column_orders.push_back(ColumnOrder::type_defined_); } else { @@ -1774,7 +1774,7 @@ void ColumnChunkMetaDataBuilder::SetStatistics(const EncodedStatistics& result) void ColumnChunkMetaDataBuilder::SetKeyValueMetadata( std::shared_ptr key_value_metadata) { - impl_->SetKeyValueMetadata(key_value_metadata); + impl_->SetKeyValueMetadata(std::move(key_value_metadata)); } int64_t ColumnChunkMetaDataBuilder::total_compressed_size() const { From 25f2a25bfe84cb7367ca5698b16a46f0fb3ba380 Mon Sep 17 00:00:00 2001 From: mwish Date: Sun, 4 Aug 2024 12:28:25 +0800 Subject: [PATCH 18/19] Trying to fix python tests --- python/pyarrow/_parquet.pyx | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index 4dde0f0d81b3c..254bfe3b09a9c 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -511,11 +511,15 @@ cdef class ColumnChunkMetaData(_Weakrefable): @property def metadata(self): """Additional metadata as key value pairs (dict[bytes, bytes]).""" - wrapped = pyarrow_wrap_metadata(self.metadata.key_value_metadata()) - if wrapped is not None: - return wrapped.to_dict() + cdef: + unordered_map[c_string, c_string] metadata + const CKeyValueMetadata* underlying_metadata + underlying_metadata = self.metadata.key_value_metadata().get() + if underlying_metadata != NULL: + underlying_metadata.ToUnorderedMap(&metadata) + return metadata else: - return wrapped + return None cdef class SortingColumn: From ab5cb560714f6f9efe1ca7535b53ed211af9cf62 Mon Sep 17 00:00:00 2001 From: mwish Date: Thu, 15 Aug 2024 22:49:29 +0800 Subject: [PATCH 19/19] Apply suggestions --- cpp/src/parquet/column_writer_test.cc | 9 +++++++-- cpp/src/parquet/metadata.cc | 11 ++++++++--- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/cpp/src/parquet/column_writer_test.cc b/cpp/src/parquet/column_writer_test.cc index 882a6fcb166e8..d2b3aa0dff003 100644 --- a/cpp/src/parquet/column_writer_test.cc +++ b/cpp/src/parquet/column_writer_test.cc @@ -1730,13 +1730,18 @@ TEST_F(TestInt32Writer, NoWriteKeyValueMetadata) { TEST_F(TestInt32Writer, WriteKeyValueMetadata) { auto writer = this->BuildWriter(); - writer->AddKeyValueMetadata(KeyValueMetadata::Make({"hello"}, {"world"})); + writer->AddKeyValueMetadata( + KeyValueMetadata::Make({"hello", "bye"}, {"world", "earth"})); + // overwrite the previous value + writer->AddKeyValueMetadata(KeyValueMetadata::Make({"bye"}, {"moon"})); writer->Close(); auto key_value_metadata = metadata_key_value_metadata(); ASSERT_THAT(key_value_metadata, NotNull()); - ASSERT_EQ(1, key_value_metadata->size()); + ASSERT_EQ(2, key_value_metadata->size()); ASSERT_OK_AND_ASSIGN(auto value, key_value_metadata->Get("hello")); ASSERT_EQ("world", value); + ASSERT_OK_AND_ASSIGN(value, key_value_metadata->Get("bye")); + ASSERT_EQ("moon", value); } TEST_F(TestInt32Writer, ResetKeyValueMetadata) { diff --git a/cpp/src/parquet/metadata.cc b/cpp/src/parquet/metadata.cc index d4c4b78f413b3..4f2aa6e37328c 100644 --- a/cpp/src/parquet/metadata.cc +++ b/cpp/src/parquet/metadata.cc @@ -135,8 +135,11 @@ std::shared_ptr MakeColumnStats(const format::ColumnMetaData& meta_d throw ParquetException("Can't decode page statistics for selected column type"); } +// Get KeyValueMetadata from parquet Thrift RowGroup or ColumnChunk metadata. +// +// Returns nullptr if the metadata is not set. template -std::shared_ptr CopyKeyValueMetadata(const Metadata& source) { +std::shared_ptr FromThriftKeyValueMetadata(const Metadata& source) { std::shared_ptr metadata = nullptr; if (source.__isset.key_value_metadata) { std::vector keys; @@ -380,7 +383,7 @@ class ColumnChunkMetaData::ColumnChunkMetaDataImpl { private: void InitKeyValueMetadata() { - key_value_metadata_ = CopyKeyValueMetadata(*column_metadata_); + key_value_metadata_ = FromThriftKeyValueMetadata(*column_metadata_); } mutable std::shared_ptr possible_stats_; @@ -972,7 +975,9 @@ class FileMetaData::FileMetaDataImpl { schema_.updateColumnOrders(column_orders); } - void InitKeyValueMetadata() { key_value_metadata_ = CopyKeyValueMetadata(*metadata_); } + void InitKeyValueMetadata() { + key_value_metadata_ = FromThriftKeyValueMetadata(*metadata_); + } }; std::shared_ptr FileMetaData::Make(