Skip to content

Commit

Permalink
support string and binary view read write parquet
Browse files Browse the repository at this point in the history
  • Loading branch information
ariesdevil committed Mar 28, 2024
1 parent ff86119 commit 1c7260e
Show file tree
Hide file tree
Showing 12 changed files with 857 additions and 33 deletions.
18 changes: 17 additions & 1 deletion arrow-array/src/array/byte_view_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -400,7 +400,23 @@ where
/// ```
pub type BinaryViewArray = GenericByteViewArray<BinaryViewType>;

/// A [`GenericByteViewArray`] that stores uf8 data
impl BinaryViewArray {
/// Convert the [`BinaryViewArray`] to [`StringViewArray`]
/// If items not utf8 data, validate will fail and error returned.
pub fn to_stringview(&self) -> Result<StringViewArray, ArrowError> {
StringViewType::validate(self.views(), self.data_buffers())?;
unsafe { Ok(self.to_stringview_unchecked()) }
}

/// Convert the [`BinaryViewArray`] to [`StringViewArray`]
/// # Safety
/// Caller is responsible for ensuring that items in array are utf8 data.
pub unsafe fn to_stringview_unchecked(&self) -> StringViewArray {
StringViewArray::new_unchecked(self.views.clone(), self.buffers.clone(), self.nulls.clone())
}
}

/// A [`GenericByteViewArray`] that stores utf8 data
///
/// # Example
/// ```
Expand Down
15 changes: 14 additions & 1 deletion arrow-ipc/src/convert.rs
Original file line number Diff line number Diff line change
Expand Up @@ -248,8 +248,10 @@ pub(crate) fn get_data_type(field: crate::Field, may_be_dictionary: bool) -> Dat
}
crate::Type::Binary => DataType::Binary,
crate::Type::LargeBinary => DataType::LargeBinary,
crate::Type::BinaryView => DataType::BinaryView,
crate::Type::Utf8 => DataType::Utf8,
crate::Type::LargeUtf8 => DataType::LargeUtf8,
crate::Type::Utf8View => DataType::Utf8View,
crate::Type::FixedSizeBinary => {
let fsb = field.type_as_fixed_size_binary().unwrap();
DataType::FixedSizeBinary(fsb.byteWidth())
Expand Down Expand Up @@ -548,7 +550,11 @@ pub(crate) fn get_fb_field_type<'a>(
.as_union_value(),
children: Some(fbb.create_vector(&empty_fields[..])),
},
BinaryView | Utf8View => unimplemented!("unimplemented"),
BinaryView => FBFieldType {
type_type: crate::Type::BinaryView,
type_: crate::BinaryViewBuilder::new(fbb).finish().as_union_value(),
children: Some(fbb.create_vector(&empty_fields[..])),
},
Utf8 => FBFieldType {
type_type: crate::Type::Utf8,
type_: crate::Utf8Builder::new(fbb).finish().as_union_value(),
Expand All @@ -568,6 +574,11 @@ pub(crate) fn get_fb_field_type<'a>(
children: Some(fbb.create_vector(&empty_fields[..])),
}
}
Utf8View => FBFieldType {
type_type: crate::Type::Utf8View,
type_: crate::Utf8ViewBuilder::new(fbb).finish().as_union_value(),
children: Some(fbb.create_vector(&empty_fields[..])),
},
Date32 => {
let mut builder = crate::DateBuilder::new(fbb);
builder.add_unit(crate::DateUnit::DAY);
Expand Down Expand Up @@ -921,7 +932,9 @@ mod tests {
true,
),
Field::new("utf8", DataType::Utf8, false),
Field::new("utf8_view", DataType::Utf8View, false),
Field::new("binary", DataType::Binary, false),
Field::new("binary_view", DataType::BinaryView, false),
Field::new_list("list[u8]", Field::new("item", DataType::UInt8, false), true),
Field::new_fixed_size_list(
"fixed_size_list[u8]",
Expand Down
28 changes: 12 additions & 16 deletions parquet/src/arrow/array_reader/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use std::sync::Arc;

use arrow_schema::{DataType, Fields, SchemaBuilder};

use crate::arrow::array_reader::byte_view_array::make_byte_view_array_reader;
use crate::arrow::array_reader::empty_array::make_empty_array_reader;
use crate::arrow::array_reader::fixed_len_byte_array::make_fixed_len_byte_array_reader;
use crate::arrow::array_reader::{
Expand All @@ -29,9 +30,7 @@ use crate::arrow::array_reader::{
use crate::arrow::schema::{ParquetField, ParquetFieldType};
use crate::arrow::ProjectionMask;
use crate::basic::Type as PhysicalType;
use crate::data_type::{
BoolType, DoubleType, FloatType, Int32Type, Int64Type, Int96Type,
};
use crate::data_type::{BoolType, DoubleType, FloatType, Int32Type, Int64Type, Int96Type};
use crate::errors::{ParquetError, Result};
use crate::schema::types::{ColumnDescriptor, ColumnPath, Type};

Expand All @@ -55,17 +54,13 @@ fn build_reader(
row_groups: &dyn RowGroups,
) -> Result<Option<Box<dyn ArrayReader>>> {
match field.field_type {
ParquetFieldType::Primitive { .. } => {
build_primitive_reader(field, mask, row_groups)
}
ParquetFieldType::Primitive { .. } => build_primitive_reader(field, mask, row_groups),
ParquetFieldType::Group { .. } => match &field.arrow_type {
DataType::Map(_, _) => build_map_reader(field, mask, row_groups),
DataType::Struct(_) => build_struct_reader(field, mask, row_groups),
DataType::List(_) => build_list_reader(field, mask, false, row_groups),
DataType::LargeList(_) => build_list_reader(field, mask, true, row_groups),
DataType::FixedSizeList(_, _) => {
build_fixed_size_list_reader(field, mask, row_groups)
}
DataType::FixedSizeList(_, _) => build_fixed_size_list_reader(field, mask, row_groups),
d => unimplemented!("reading group type {} not implemented", d),
},
}
Expand Down Expand Up @@ -140,9 +135,9 @@ fn build_list_reader(
DataType::List(f) => {
DataType::List(Arc::new(f.as_ref().clone().with_data_type(item_type)))
}
DataType::LargeList(f) => DataType::LargeList(Arc::new(
f.as_ref().clone().with_data_type(item_type),
)),
DataType::LargeList(f) => {
DataType::LargeList(Arc::new(f.as_ref().clone().with_data_type(item_type)))
}
_ => unreachable!(),
};

Expand Down Expand Up @@ -289,6 +284,9 @@ fn build_primitive_reader(
Some(DataType::Dictionary(_, _)) => {
make_byte_array_dictionary_reader(page_iterator, column_desc, arrow_type)?
}
Some(DataType::Utf8View | DataType::BinaryView) => {
make_byte_view_array_reader(page_iterator, column_desc, arrow_type)?
}
_ => make_byte_array_reader(page_iterator, column_desc, arrow_type)?,
},
PhysicalType::FIXED_LEN_BYTE_ARRAY => {
Expand Down Expand Up @@ -347,8 +345,7 @@ mod tests {
#[test]
fn test_create_array_reader() {
let file = get_test_file("nulls.snappy.parquet");
let file_reader: Arc<dyn FileReader> =
Arc::new(SerializedFileReader::new(file).unwrap());
let file_reader: Arc<dyn FileReader> = Arc::new(SerializedFileReader::new(file).unwrap());

let file_metadata = file_reader.metadata().file_metadata();
let mask = ProjectionMask::leaves(file_metadata.schema_descr(), [0]);
Expand All @@ -359,8 +356,7 @@ mod tests {
)
.unwrap();

let array_reader =
build_array_reader(fields.as_ref(), &mask, &file_reader).unwrap();
let array_reader = build_array_reader(fields.as_ref(), &mask, &file_reader).unwrap();

// Create arrow types
let arrow_type = DataType::Struct(Fields::from(vec![Field::new(
Expand Down
Loading

0 comments on commit 1c7260e

Please sign in to comment.