diff --git a/cpp/src/arrow/dataset/file_parquet.cc b/cpp/src/arrow/dataset/file_parquet.cc
index 9d0e8a6515878..751937e93b937 100644
--- a/cpp/src/arrow/dataset/file_parquet.cc
+++ b/cpp/src/arrow/dataset/file_parquet.cc
@@ -104,11 +104,12 @@ parquet::ArrowReaderProperties MakeArrowReaderProperties(
   return arrow_properties;
 }
 
-template <typename M>
-Result<std::shared_ptr<SchemaManifest>> GetSchemaManifest(
-    const M& metadata, const parquet::ArrowReaderProperties& properties) {
+Result<std::shared_ptr<SchemaManifest>> GetSchemaManifest(
+    const parquet::FileMetaData& metadata,
+    const parquet::ArrowReaderProperties& properties) {
   auto manifest = std::make_shared<SchemaManifest>();
-  const std::shared_ptr<const KeyValueMetadata>& key_value_metadata = nullptr;
+  const std::shared_ptr<const KeyValueMetadata>& key_value_metadata =
+      metadata.key_value_metadata();
   RETURN_NOT_OK(SchemaManifest::Make(metadata.schema(), key_value_metadata, properties,
                                      manifest.get()));
   return manifest;
diff --git a/cpp/src/arrow/dataset/file_parquet_test.cc b/cpp/src/arrow/dataset/file_parquet_test.cc
index 8527c3af64c83..177ca824179a8 100644
--- a/cpp/src/arrow/dataset/file_parquet_test.cc
+++ b/cpp/src/arrow/dataset/file_parquet_test.cc
@@ -65,11 +65,15 @@ class ParquetFormatHelper {
  public:
   using FormatType = ParquetFileFormat;
 
-  static Result<std::shared_ptr<Buffer>> Write(RecordBatchReader* reader) {
+  static Result<std::shared_ptr<Buffer>> Write(
+      RecordBatchReader* reader,
+      const std::shared_ptr<ArrowWriterProperties>& arrow_properties =
+          default_arrow_writer_properties()) {
     auto pool = ::arrow::default_memory_pool();
     std::shared_ptr<Buffer> out;
     auto sink = CreateOutputStream(pool);
-    RETURN_NOT_OK(WriteRecordBatchReader(reader, pool, sink));
+    RETURN_NOT_OK(WriteRecordBatchReader(reader, pool, sink, default_writer_properties(),
+                                         arrow_properties));
     return sink->Finish();
   }
   static std::shared_ptr<ParquetFileFormat> MakeFormat() {
@@ -703,6 +707,29 @@ TEST_P(TestParquetFileFormatScan, PredicatePushdownRowGroupFragmentsUsingStringC
   CountRowGroupsInFragment(fragment, {0, 3}, equal(field_ref("x"), literal("a")));
 }
 
+TEST_P(TestParquetFileFormatScan, PredicatePushdownRowGroupFragmentsUsingDurationColumn) {
+  // GH-37111: The Parquet Arrow writer stores the writer schema and possible
+  // field_id in key_value_metadata when store_schema is enabled. An
+  // `arrow::duration` column is stored as int64. This test ensures that the
+  // dataset can parse the writer schema correctly.
+  auto table = TableFromJSON(schema({field("t", duration(TimeUnit::NANO))}),
+                             {
+                                 R"([{"t": 1}])",
+                                 R"([{"t": 2}, {"t": 3}])",
+                             });
+  TableBatchReader table_reader(*table);
+  ASSERT_OK_AND_ASSIGN(
+      auto buffer,
+      ParquetFormatHelper::Write(
+          &table_reader, ArrowWriterProperties::Builder().store_schema()->build()));
+  auto source = std::make_shared<FileSource>(buffer);
+  SetSchema({field("t", duration(TimeUnit::NANO))});
+  ASSERT_OK_AND_ASSIGN(auto fragment, format_->MakeFragment(*source));
+
+  auto expr = equal(field_ref("t"), literal(::arrow::DurationScalar(1, TimeUnit::NANO)));
+  CountRowGroupsInFragment(fragment, {0}, expr);
+}
+
 // Tests projection with nested/indexed FieldRefs.
 // https://github.com/apache/arrow/issues/35579
 TEST_P(TestParquetFileFormatScan, ProjectWithNonNamedFieldRefs) {
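
For illustration beyond the patch itself: a minimal standalone sketch, assuming the public Arrow C++ API, of the mechanism the fix relies on. With store_schema() enabled, parquet::arrow serializes the Arrow schema into the file's key_value_metadata under the "ARROW:schema" key while an arrow::duration column is physically written as int64; the change above forwards that metadata to SchemaManifest::Make so the logical type can be reconstructed on the dataset path. StoreSchemaDemo and all variable names here are hypothetical, not part of the patch.

    // Sketch: write a duration column with store_schema enabled, then show that
    // the serialized Arrow schema is present in the file's key_value_metadata.
    #include <memory>

    #include <arrow/api.h>
    #include <arrow/io/api.h>
    #include <parquet/arrow/reader.h>
    #include <parquet/arrow/writer.h>

    arrow::Status StoreSchemaDemo() {
      auto pool = arrow::default_memory_pool();

      // One duration(ns) column; Parquet itself stores the values as int64.
      arrow::DurationBuilder builder{arrow::duration(arrow::TimeUnit::NANO), pool};
      ARROW_RETURN_NOT_OK(builder.AppendValues({1, 2, 3}));
      ARROW_ASSIGN_OR_RAISE(auto array, builder.Finish());
      auto table = arrow::Table::Make(
          arrow::schema({arrow::field("t", arrow::duration(arrow::TimeUnit::NANO))}),
          {array});

      // Write with store_schema() so the Arrow schema rides along in metadata.
      ARROW_ASSIGN_OR_RAISE(auto sink, arrow::io::BufferOutputStream::Create());
      ARROW_RETURN_NOT_OK(parquet::arrow::WriteTable(
          *table, pool, sink, /*chunk_size=*/1024, parquet::default_writer_properties(),
          parquet::ArrowWriterProperties::Builder().store_schema()->build()));
      ARROW_ASSIGN_OR_RAISE(auto buffer, sink->Finish());

      // The serialized schema lives under the "ARROW:schema" metadata key; the
      // fixed GetSchemaManifest now passes this metadata through so the dataset
      // layer resolves "t" as duration(ns) instead of int64.
      auto input = std::make_shared<arrow::io::BufferReader>(buffer);
      std::unique_ptr<parquet::arrow::FileReader> reader;
      ARROW_RETURN_NOT_OK(parquet::arrow::OpenFile(input, pool, &reader));
      auto kv = reader->parquet_reader()->metadata()->key_value_metadata();
      ARROW_ASSIGN_OR_RAISE(auto stored, kv->Get("ARROW:schema"));
      (void)stored;  // base64-encoded, IPC-serialized arrow::Schema
      return arrow::Status::OK();
    }

Before the fix, GetSchemaManifest hard-coded key_value_metadata to nullptr, so the stored schema was ignored on the dataset path and predicate pushdown against a duration column compared a duration literal to an int64 column; the new test exercises exactly that scenario.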