From 2dbb83afc7fb43429d52586b2e896a9f111ccafb Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Fri, 13 Sep 2024 15:47:12 +0900 Subject: [PATCH] GH-44007: [GLib][Parquet] Add `gparquet_arrow_file_writer_new_buffered_row_group()` --- c_glib/parquet-glib/arrow-file-writer.cpp | 38 +++++++++++++++++++ c_glib/parquet-glib/arrow-file-writer.h | 5 +++ c_glib/test/parquet/test-arrow-file-writer.rb | 7 +++- 3 files changed, 48 insertions(+), 2 deletions(-) diff --git a/c_glib/parquet-glib/arrow-file-writer.cpp b/c_glib/parquet-glib/arrow-file-writer.cpp index 7a672f1f21dcc..2b8e2bdeac026 100644 --- a/c_glib/parquet-glib/arrow-file-writer.cpp +++ b/c_glib/parquet-glib/arrow-file-writer.cpp @@ -517,6 +517,19 @@ gparquet_arrow_file_writer_get_schema(GParquetArrowFileWriter *writer) * @record_batch: A record batch to be written. * @error: (nullable): Return location for a #GError or %NULL. * + * Write a record batch into the buffered row group. + * + * Multiple record batches can be written into the same row group + * through this function. + * + * gparquet_writer_properties_get_max_row_group_length() is respected + * and a new row group will be created if the current row group + * exceeds the limit. + * + * Record batches get flushed to the output stream once + * gparquet_file_writer_new_buffered_row_group() or + * gparquet_file_writer_close() is called. + * * Returns: %TRUE on success, %FALSE if there was an error. * * Since: 18.0.0 @@ -564,6 +577,8 @@ gparquet_arrow_file_writer_write_table(GParquetArrowFileWriter *writer, * @chunk_size: The max number of rows in a row group. * @error: (nullable): Return location for a #GError or %NULL. * + * Start a new row group. + * * Returns: %TRUE on success, %FALSE if there was an error. * * Since: 18.0.0 @@ -579,12 +594,35 @@ gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, "[parquet][arrow][file-writer][new-row-group]"); } +/** + * gparquet_arrow_file_writer_new_buffered_row_group: + * @writer: A #GParquetArrowFileWriter. + * @error: (nullable): Return location for a #GError or %NULL. + * + * Start a new buffered row group. + * + * Returns: %TRUE on success, %FALSE if there was an error. + * + * Since: 18.0.0 + */ +gboolean +gparquet_arrow_file_writer_new_buffered_row_group(GParquetArrowFileWriter *writer, + GError **error) +{ + auto parquet_arrow_file_writer = gparquet_arrow_file_writer_get_raw(writer); + return garrow::check(error, + parquet_arrow_file_writer->NewBufferedRowGroup(), + "[parquet][arrow][file-writer][new-buffered-row-group]"); +} + /** * gparquet_arrow_file_writer_write_chunked_array: * @writer: A #GParquetArrowFileWriter. * @chunked_array: A #GArrowChunkedArray to be written. * @error: (nullable): Return location for a #GError or %NULL. * + * Start a chunked array as a column chunk. + * * Returns: %TRUE on success, %FALSE if there was an error. * * Since: 18.0.0 diff --git a/c_glib/parquet-glib/arrow-file-writer.h b/c_glib/parquet-glib/arrow-file-writer.h index 40595bdfef4b9..2c82f7c1f87de 100644 --- a/c_glib/parquet-glib/arrow-file-writer.h +++ b/c_glib/parquet-glib/arrow-file-writer.h @@ -139,6 +139,11 @@ gparquet_arrow_file_writer_new_row_group(GParquetArrowFileWriter *writer, gsize chunk_size, GError **error); +GPARQUET_AVAILABLE_IN_18_0 +gboolean +gparquet_arrow_file_writer_new_buffered_row_group(GParquetArrowFileWriter *writer, + GError **error); + GPARQUET_AVAILABLE_IN_18_0 gboolean gparquet_arrow_file_writer_write_chunked_array(GParquetArrowFileWriter *writer, diff --git a/c_glib/test/parquet/test-arrow-file-writer.rb b/c_glib/test/parquet/test-arrow-file-writer.rb index 89db16c6fb90b..d8344bf1c50b0 100644 --- a/c_glib/test/parquet/test-arrow-file-writer.rb +++ b/c_glib/test/parquet/test-arrow-file-writer.rb @@ -40,14 +40,17 @@ def test_write_record_batch writer = Parquet::ArrowFileWriter.new(record_batch.schema, @file.path) writer.write_record_batch(record_batch) + writer.new_buffered_row_group + writer.write_record_batch(record_batch) writer.close reader = Parquet::ArrowFileReader.new(@file.path) begin reader.use_threads = true assert_equal([ - 1, - Arrow::Table.new(record_batch.schema, [record_batch]), + 2, + Arrow::Table.new(record_batch.schema, + [record_batch, record_batch]), ], [ reader.n_row_groups,