From 04d09786e98f09fe4e0832d4e46374497c144a1d Mon Sep 17 00:00:00 2001 From: nameexhaustion Date: Tue, 3 Sep 2024 18:49:23 +1000 Subject: [PATCH] refactor(rust): Remove top-level metadata from `ArrowSchema` (#18527) --- crates/polars-arrow/src/datatypes/schema.rs | 30 ++++--------------- .../src/io/ipc/read/file_async.rs | 5 +--- crates/polars-arrow/src/io/ipc/read/reader.rs | 5 +--- crates/polars-arrow/src/io/ipc/read/schema.rs | 16 +--------- crates/polars-arrow/src/io/ipc/read/stream.rs | 5 +--- .../polars-arrow/src/io/ipc/write/schema.rs | 8 +---- .../src/arrow/read/schema/mod.rs | 2 +- .../polars/tests/it/io/parquet/arrow/read.rs | 3 -- 8 files changed, 11 insertions(+), 63 deletions(-) diff --git a/crates/polars-arrow/src/datatypes/schema.rs b/crates/polars-arrow/src/datatypes/schema.rs index 9b01816c1135..d5a273274029 100644 --- a/crates/polars-arrow/src/datatypes/schema.rs +++ b/crates/polars-arrow/src/datatypes/schema.rs @@ -4,9 +4,9 @@ use polars_error::{polars_bail, PolarsResult}; #[cfg(feature = "serde")] use serde::{Deserialize, Serialize}; -use super::{Field, Metadata}; +use super::Field; -/// An ordered sequence of [`Field`]s with associated [`Metadata`]. +/// An ordered sequence of [`Field`]s /// /// [`ArrowSchema`] is an abstraction used to read from, and write to, Arrow IPC format, /// Apache Parquet, and Apache Avro. All these formats have a concept of a schema @@ -16,22 +16,11 @@ use super::{Field, Metadata}; pub struct ArrowSchema { /// The fields composing this schema. pub fields: Vec, - /// Optional metadata. - pub metadata: Metadata, } pub type ArrowSchemaRef = Arc; impl ArrowSchema { - /// Attaches a [`Metadata`] to [`ArrowSchema`] - #[inline] - pub fn with_metadata(self, metadata: Metadata) -> Self { - Self { - fields: self.fields, - metadata, - } - } - #[inline] pub fn len(&self) -> usize { self.fields.len() @@ -58,10 +47,7 @@ impl ArrowSchema { }) .collect(); - ArrowSchema { - fields, - metadata: self.metadata, - } + ArrowSchema { fields } } pub fn try_project(&self, indices: &[usize]) -> PolarsResult { @@ -76,18 +62,12 @@ impl ArrowSchema { Ok(out.clone()) }).collect::>>()?; - Ok(ArrowSchema { - fields, - metadata: self.metadata.clone(), - }) + Ok(ArrowSchema { fields }) } } impl From> for ArrowSchema { fn from(fields: Vec) -> Self { - Self { - fields, - ..Default::default() - } + Self { fields } } } diff --git a/crates/polars-arrow/src/io/ipc/read/file_async.rs b/crates/polars-arrow/src/io/ipc/read/file_async.rs index 4c4626be6ee2..effd2151fc67 100644 --- a/crates/polars-arrow/src/io/ipc/read/file_async.rs +++ b/crates/polars-arrow/src/io/ipc/read/file_async.rs @@ -39,10 +39,7 @@ impl<'a> FileStream<'a> { { let (projection, schema) = if let Some(projection) = projection { let (p, h, fields) = prepare_projection(&metadata.schema.fields, projection); - let schema = ArrowSchema { - fields, - metadata: metadata.schema.metadata.clone(), - }; + let schema = ArrowSchema { fields }; (Some((p, h)), Some(schema)) } else { (None, None) diff --git a/crates/polars-arrow/src/io/ipc/read/reader.rs b/crates/polars-arrow/src/io/ipc/read/reader.rs index 45a4eb40a689..6328223d427c 100644 --- a/crates/polars-arrow/src/io/ipc/read/reader.rs +++ b/crates/polars-arrow/src/io/ipc/read/reader.rs @@ -34,10 +34,7 @@ impl FileReader { ) -> Self { let projection = projection.map(|projection| { let (p, h, fields) = prepare_projection(&metadata.schema.fields, projection); - let schema = ArrowSchema { - fields, - metadata: metadata.schema.metadata.clone(), - }; + let schema = ArrowSchema { fields }; (p, h, schema) }); Self { diff --git a/crates/polars-arrow/src/io/ipc/read/schema.rs b/crates/polars-arrow/src/io/ipc/read/schema.rs index 655455a9606a..46925ca7599e 100644 --- a/crates/polars-arrow/src/io/ipc/read/schema.rs +++ b/crates/polars-arrow/src/io/ipc/read/schema.rs @@ -391,22 +391,8 @@ pub(super) fn fb_to_schema( arrow_format::ipc::Endianness::Big => false, }; - let mut metadata = Metadata::default(); - if let Some(md_fields) = schema.custom_metadata()? { - for kv in md_fields { - let kv = kv?; - let k_str = kv.key()?; - let v_str = kv.value()?; - if let Some(k) = k_str { - if let Some(v) = v_str { - metadata.insert(PlSmallStr::from_str(k), PlSmallStr::from_str(v)); - } - } - } - } - Ok(( - ArrowSchema { fields, metadata }, + ArrowSchema { fields }, IpcSchema { fields: ipc_fields, is_little_endian, diff --git a/crates/polars-arrow/src/io/ipc/read/stream.rs b/crates/polars-arrow/src/io/ipc/read/stream.rs index 6a5543852e28..de679de9899d 100644 --- a/crates/polars-arrow/src/io/ipc/read/stream.rs +++ b/crates/polars-arrow/src/io/ipc/read/stream.rs @@ -251,10 +251,7 @@ impl StreamReader { pub fn new(reader: R, metadata: StreamMetadata, projection: Option>) -> Self { let projection = projection.map(|projection| { let (p, h, fields) = prepare_projection(&metadata.schema.fields, projection); - let schema = ArrowSchema { - fields, - metadata: metadata.schema.metadata.clone(), - }; + let schema = ArrowSchema { fields }; (p, h, schema) }); diff --git a/crates/polars-arrow/src/io/ipc/write/schema.rs b/crates/polars-arrow/src/io/ipc/write/schema.rs index 192b0fb5d6ec..594e9275c6a3 100644 --- a/crates/polars-arrow/src/io/ipc/write/schema.rs +++ b/crates/polars-arrow/src/io/ipc/write/schema.rs @@ -38,13 +38,7 @@ pub fn serialize_schema( .map(|(field, ipc_field)| serialize_field(field, ipc_field)) .collect::>(); - let custom_metadata = schema - .metadata - .iter() - .map(|(k, v)| key_value(k.clone().into_string(), v.clone().into_string())) - .collect::>(); - - let custom_metadata = (!custom_metadata.is_empty()).then_some(custom_metadata); + let custom_metadata = None; arrow_format::ipc::Schema { endianness, diff --git a/crates/polars-parquet/src/arrow/read/schema/mod.rs b/crates/polars-parquet/src/arrow/read/schema/mod.rs index 34fb195a4eaa..1ddcdacf4664 100644 --- a/crates/polars-parquet/src/arrow/read/schema/mod.rs +++ b/crates/polars-parquet/src/arrow/read/schema/mod.rs @@ -55,6 +55,6 @@ pub fn infer_schema_with_options( let schema = read_schema_from_metadata(&mut metadata)?; Ok(schema.unwrap_or_else(|| { let fields = parquet_to_arrow_schema_with_options(file_metadata.schema().fields(), options); - ArrowSchema { fields, metadata } + ArrowSchema { fields } })) } diff --git a/crates/polars/tests/it/io/parquet/arrow/read.rs b/crates/polars/tests/it/io/parquet/arrow/read.rs index 974f67b0e879..8178097b5c8c 100644 --- a/crates/polars/tests/it/io/parquet/arrow/read.rs +++ b/crates/polars/tests/it/io/parquet/arrow/read.rs @@ -97,8 +97,6 @@ fn all_types_chunked() -> PolarsResult<()> { #[test] fn read_int96_timestamps() -> PolarsResult<()> { - use std::collections::BTreeMap; - let timestamp_data = &[ 0x50, 0x41, 0x52, 0x31, 0x15, 0x04, 0x15, 0x48, 0x15, 0x3c, 0x4c, 0x15, 0x06, 0x15, 0x00, 0x12, 0x00, 0x00, 0x24, 0x00, 0x00, 0x0d, 0x01, 0x08, 0x9f, 0xd5, 0x1f, 0x0d, 0x0a, 0x44, @@ -131,7 +129,6 @@ fn read_int96_timestamps() -> PolarsResult<()> { arrow::datatypes::ArrowDataType::Timestamp(time_unit, None), false, )], - metadata: BTreeMap::new(), }; let reader = FileReader::new(reader, metadata.row_groups, schema, None); reader.collect::>>()