From ca1bfb8f28fcd82757ce08c8038916bb6e7986e9 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Fri, 8 Jul 2022 10:11:34 -0700 Subject: [PATCH] Add Decimal256Builder and Decimal256Array (#2000) * Add Decimal256Builder and Decimal256Array * Add decimal builder test * Rebased with master * Trigger Build * Disable value validation for precision > 38 * Deduplicate by using macro * Trigger Build * Add precision and scale check --- arrow/src/array/array_decimal.rs | 196 +++++++++++++++------ arrow/src/array/builder/decimal_builder.rs | 120 ++++++++++++- arrow/src/array/builder/mod.rs | 1 + arrow/src/array/equal_json.rs | 16 ++ arrow/src/array/iterator.rs | 1 + arrow/src/array/mod.rs | 3 + arrow/src/array/ord.rs | 1 + arrow/src/compute/kernels/cast.rs | 2 + arrow/src/compute/kernels/sort.rs | 1 + arrow/src/compute/kernels/take.rs | 2 + arrow/src/csv/reader.rs | 1 + arrow/src/datatypes/datatype.rs | 5 + arrow/src/util/decimal.rs | 1 + arrow/src/util/display.rs | 1 + parquet/src/arrow/arrow_writer/mod.rs | 1 + 15 files changed, 294 insertions(+), 58 deletions(-) diff --git a/arrow/src/array/array_decimal.rs b/arrow/src/array/array_decimal.rs index 62ff7905ad79..15863cfff711 100644 --- a/arrow/src/array/array_decimal.rs +++ b/arrow/src/array/array_decimal.rs @@ -16,7 +16,7 @@ // under the License. use std::borrow::Borrow; -use std::convert::{From, TryInto}; +use std::convert::From; use std::fmt; use std::{any::Any, iter::FromIterator}; @@ -32,7 +32,7 @@ use crate::datatypes::{ DECIMAL_MAX_SCALE, }; use crate::error::{ArrowError, Result}; -use crate::util::decimal::{BasicDecimal, Decimal128}; +use crate::util::decimal::{BasicDecimal, Decimal128, Decimal256}; /// `DecimalArray` stores fixed width decimal numbers, /// with a fixed precision and scale. @@ -40,7 +40,7 @@ use crate::util::decimal::{BasicDecimal, Decimal128}; /// # Examples /// /// ``` -/// use arrow::array::{Array, DecimalArray}; +/// use arrow::array::{Array, BasicDecimalArray, DecimalArray}; /// use arrow::datatypes::DataType; /// /// // Create a DecimalArray with the default precision and scale @@ -75,47 +75,67 @@ pub struct DecimalArray { scale: usize, } -impl DecimalArray { - const VALUE_LENGTH: i32 = 16; +pub struct Decimal256Array { + data: ArrayData, + value_data: RawPtrBox, + precision: usize, + scale: usize, +} + +mod private_decimal { + pub trait DecimalArrayPrivate { + fn raw_value_data_ptr(&self) -> *const u8; + } +} + +pub trait BasicDecimalArray>: + private_decimal::DecimalArrayPrivate +{ + const VALUE_LENGTH: i32; + + fn data(&self) -> &ArrayData; + + /// Return the precision (total digits) that can be stored by this array + fn precision(&self) -> usize; + + /// Return the scale (digits after the decimal) that can be stored by this array + fn scale(&self) -> usize; /// Returns the element at index `i`. - pub fn value(&self, i: usize) -> Decimal128 { - assert!(i < self.data.len(), "DecimalArray out of bounds access"); - let offset = i + self.data.offset(); + fn value(&self, i: usize) -> T { + let data = self.data(); + assert!(i < data.len(), "Out of bounds access"); + + let offset = i + data.offset(); let raw_val = unsafe { let pos = self.value_offset_at(offset); std::slice::from_raw_parts( - self.value_data.as_ptr().offset(pos as isize), + self.raw_value_data_ptr().offset(pos as isize), Self::VALUE_LENGTH as usize, ) }; - let as_array = raw_val.try_into().unwrap(); - Decimal128::new_from_i128( - self.precision, - self.scale, - i128::from_le_bytes(as_array), - ) + T::new(self.precision(), self.scale(), raw_val) } /// Returns the offset for the element at index `i`. /// /// Note this doesn't do any bound checking, for performance reason. #[inline] - pub fn value_offset(&self, i: usize) -> i32 { - self.value_offset_at(self.data.offset() + i) + fn value_offset(&self, i: usize) -> i32 { + self.value_offset_at(self.data().offset() + i) } /// Returns the length for an element. /// /// All elements have the same length as the array is a fixed size. #[inline] - pub const fn value_length(&self) -> i32 { + fn value_length(&self) -> i32 { Self::VALUE_LENGTH } /// Returns a clone of the value data buffer - pub fn value_data(&self) -> Buffer { - self.data.buffers()[0].clone() + fn value_data(&self) -> Buffer { + self.data().buffers()[0].clone() } #[inline] @@ -124,15 +144,15 @@ impl DecimalArray { } #[inline] - pub fn value_as_string(&self, row: usize) -> String { + fn value_as_string(&self, row: usize) -> String { self.value(row).to_string() } - pub fn from_fixed_size_list_array( + fn from_fixed_size_list_array( v: FixedSizeListArray, precision: usize, scale: usize, - ) -> Self { + ) -> U { let child_data = &v.data_ref().child_data()[0]; assert_eq!( child_data.child_data().len(), @@ -155,9 +175,43 @@ impl DecimalArray { .offset(list_offset); let array_data = unsafe { builder.build_unchecked() }; - Self::from(array_data) + U::from(array_data) + } +} + +impl BasicDecimalArray for DecimalArray { + const VALUE_LENGTH: i32 = 16; + + fn data(&self) -> &ArrayData { + &self.data + } + + fn precision(&self) -> usize { + self.precision + } + + fn scale(&self) -> usize { + self.scale + } +} + +impl BasicDecimalArray for Decimal256Array { + const VALUE_LENGTH: i32 = 32; + + fn data(&self) -> &ArrayData { + &self.data + } + + fn precision(&self) -> usize { + self.precision } + fn scale(&self) -> usize { + self.scale + } +} + +impl DecimalArray { /// Creates a [DecimalArray] with default precision and scale, /// based on an iterator of `i128` values without nulls pub fn from_iter_values>(iter: I) -> Self { @@ -176,16 +230,6 @@ impl DecimalArray { DecimalArray::from(data) } - /// Return the precision (total digits) that can be stored by this array - pub fn precision(&self) -> usize { - self.precision - } - - /// Return the scale (digits after the decimal) that can be stored by this array - pub fn scale(&self) -> usize { - self.scale - } - /// Returns a DecimalArray with the same data as self, with the /// specified precision. /// @@ -267,9 +311,24 @@ impl From for DecimalArray { } } -impl From for ArrayData { - fn from(array: DecimalArray) -> Self { - array.data +impl From for Decimal256Array { + fn from(data: ArrayData) -> Self { + assert_eq!( + data.buffers().len(), + 1, + "DecimalArray data should contain 1 buffer only (values)" + ); + let values = data.buffers()[0].as_ptr(); + let (precision, scale) = match data.data_type() { + DataType::Decimal(precision, scale) => (*precision, *scale), + _ => panic!("Expected data type to be Decimal"), + }; + Self { + data, + value_data: unsafe { RawPtrBox::new(values) }, + precision, + scale, + } } } @@ -325,32 +384,55 @@ impl>> FromIterator for DecimalArray { } } -impl fmt::Debug for DecimalArray { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "DecimalArray<{}, {}>\n[\n", self.precision, self.scale)?; - print_long_array(self, f, |array, index, f| { - let formatted_decimal = array.value_as_string(index); +macro_rules! def_decimal_array { + ($ty:ident, $array_name:expr) => { + impl private_decimal::DecimalArrayPrivate for $ty { + fn raw_value_data_ptr(&self) -> *const u8 { + self.value_data.as_ptr() + } + } - write!(f, "{}", formatted_decimal) - })?; - write!(f, "]") - } -} + impl Array for $ty { + fn as_any(&self) -> &dyn Any { + self + } -impl Array for DecimalArray { - fn as_any(&self) -> &dyn Any { - self - } + fn data(&self) -> &ArrayData { + &self.data + } - fn data(&self) -> &ArrayData { - &self.data - } + fn into_data(self) -> ArrayData { + self.into() + } + } - fn into_data(self) -> ArrayData { - self.into() - } + impl From<$ty> for ArrayData { + fn from(array: $ty) -> Self { + array.data + } + } + + impl fmt::Debug for $ty { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{}<{}, {}>\n[\n", + $array_name, self.precision, self.scale + )?; + print_long_array(self, f, |array, index, f| { + let formatted_decimal = array.value_as_string(index); + + write!(f, "{}", formatted_decimal) + })?; + write!(f, "]") + } + } + }; } +def_decimal_array!(DecimalArray, "DecimalArray"); +def_decimal_array!(Decimal256Array, "Decimal256Array"); + #[cfg(test)] mod tests { use crate::{array::DecimalBuilder, datatypes::Field}; diff --git a/arrow/src/array/builder/decimal_builder.rs b/arrow/src/array/builder/decimal_builder.rs index e7e9ec6a58f3..033de8976e34 100644 --- a/arrow/src/array/builder/decimal_builder.rs +++ b/arrow/src/array/builder/decimal_builder.rs @@ -18,6 +18,7 @@ use std::any::Any; use std::sync::Arc; +use crate::array::array_decimal::{BasicDecimalArray, Decimal256Array}; use crate::array::ArrayRef; use crate::array::DecimalArray; use crate::array::UInt8Builder; @@ -26,6 +27,7 @@ use crate::array::{ArrayBuilder, FixedSizeListBuilder}; use crate::error::{ArrowError, Result}; use crate::datatypes::validate_decimal_precision; +use crate::util::decimal::{BasicDecimal, Decimal256}; /// Array Builder for [`DecimalArray`] /// @@ -42,8 +44,18 @@ pub struct DecimalBuilder { value_validation: bool, } +/// Array Builder for [`Decimal256Array`] +/// +/// See [`Decimal256Array`] for example. +#[derive(Debug)] +pub struct Decimal256Builder { + builder: FixedSizeListBuilder, + precision: usize, + scale: usize, +} + impl DecimalBuilder { - /// Creates a new `BinaryBuilder`, `capacity` is the number of bytes in the values + /// Creates a new `DecimalBuilder`, `capacity` is the number of bytes in the values /// array pub fn new(capacity: usize, precision: usize, scale: usize) -> Self { let values_builder = UInt8Builder::new(capacity); @@ -154,10 +166,65 @@ impl ArrayBuilder for DecimalBuilder { } } +impl Decimal256Builder { + /// Creates a new `Decimal256Builder`, `capacity` is the number of bytes in the values + /// array + pub fn new(capacity: usize, precision: usize, scale: usize) -> Self { + let values_builder = UInt8Builder::new(capacity); + let byte_width = 32; + Self { + builder: FixedSizeListBuilder::new(values_builder, byte_width), + precision, + scale, + } + } + + /// Appends a byte slice into the builder. + /// + /// Automatically calls the `append` method to delimit the slice appended in as a + /// distinct array element. + #[inline] + pub fn append_value(&mut self, value: &Decimal256) -> Result<()> { + let value_as_bytes = value.raw_value(); + + if self.precision != value.precision() || self.scale != value.scale() { + return Err(ArrowError::InvalidArgumentError( + "Decimal value does not have the same precision or scale as Decimal256Builder".to_string() + )); + } + + if self.builder.value_length() != value_as_bytes.len() as i32 { + return Err(ArrowError::InvalidArgumentError( + "Byte slice does not have the same length as Decimal256Builder value lengths".to_string() + )); + } + self.builder.values().append_slice(value_as_bytes)?; + self.builder.append(true) + } + + /// Append a null value to the array. + #[inline] + pub fn append_null(&mut self) -> Result<()> { + let length: usize = self.builder.value_length() as usize; + self.builder.values().append_slice(&vec![0u8; length][..])?; + self.builder.append(false) + } + + /// Builds the `Decimal256Array` and reset this builder. + pub fn finish(&mut self) -> Decimal256Array { + Decimal256Array::from_fixed_size_list_array( + self.builder.finish(), + self.precision, + self.scale, + ) + } +} + #[cfg(test)] mod tests { use super::*; + use crate::array::array_decimal::BasicDecimalArray; use crate::array::Array; use crate::datatypes::DataType; use crate::util::decimal::Decimal128; @@ -197,4 +264,55 @@ mod tests { assert_eq!(32, decimal_array.value_offset(2)); assert_eq!(16, decimal_array.value_length()); } + + #[test] + fn test_decimal256_builder() { + let mut builder = Decimal256Builder::new(30, 40, 6); + + let mut bytes = vec![0; 32]; + bytes[0..16].clone_from_slice(&8_887_000_000_i128.to_le_bytes()); + let value = Decimal256::try_new_from_bytes(40, 6, bytes.as_slice()).unwrap(); + builder.append_value(&value).unwrap(); + + builder.append_null().unwrap(); + + bytes = vec![255; 32]; + let value = Decimal256::try_new_from_bytes(40, 6, bytes.as_slice()).unwrap(); + builder.append_value(&value).unwrap(); + + bytes = vec![0; 32]; + bytes[0..16].clone_from_slice(&0_i128.to_le_bytes()); + bytes[15] = 128; + let value = Decimal256::try_new_from_bytes(40, 6, bytes.as_slice()).unwrap(); + builder.append_value(&value).unwrap(); + + let decimal_array: Decimal256Array = builder.finish(); + + assert_eq!(&DataType::Decimal(40, 6), decimal_array.data_type()); + assert_eq!(4, decimal_array.len()); + assert_eq!(1, decimal_array.null_count()); + assert_eq!(64, decimal_array.value_offset(2)); + assert_eq!(32, decimal_array.value_length()); + + assert_eq!(decimal_array.value(0).to_string(), "8887.000000"); + assert!(decimal_array.is_null(1)); + assert_eq!(decimal_array.value(2).to_string(), "-0.000001"); + assert_eq!( + decimal_array.value(3).to_string(), + "170141183460469231731687303715884.105728" + ); + } + + #[test] + #[should_panic( + expected = "Decimal value does not have the same precision or scale as Decimal256Builder" + )] + fn test_decimal256_builder_unmatched_precision_scale() { + let mut builder = Decimal256Builder::new(30, 10, 6); + + let mut bytes = vec![0; 32]; + bytes[0..16].clone_from_slice(&8_887_000_000_i128.to_le_bytes()); + let value = Decimal256::try_new_from_bytes(40, 6, bytes.as_slice()).unwrap(); + builder.append_value(&value).unwrap(); + } } diff --git a/arrow/src/array/builder/mod.rs b/arrow/src/array/builder/mod.rs index 634ef772f3ca..045a11648d52 100644 --- a/arrow/src/array/builder/mod.rs +++ b/arrow/src/array/builder/mod.rs @@ -45,6 +45,7 @@ use super::ArrayRef; pub use boolean_buffer_builder::BooleanBufferBuilder; pub use boolean_builder::BooleanBuilder; pub use buffer_builder::BufferBuilder; +pub use decimal_builder::Decimal256Builder; pub use decimal_builder::DecimalBuilder; pub use fixed_size_binary_builder::FixedSizeBinaryBuilder; pub use fixed_size_list_builder::FixedSizeListBuilder; diff --git a/arrow/src/array/equal_json.rs b/arrow/src/array/equal_json.rs index 9db1a4397cb8..3fc84a7e3ab4 100644 --- a/arrow/src/array/equal_json.rs +++ b/arrow/src/array/equal_json.rs @@ -16,7 +16,9 @@ // under the License. use super::*; +use crate::array::BasicDecimalArray; use crate::datatypes::*; +use crate::util::decimal::BasicDecimal; use array::Array; use hex::FromHex; use serde_json::value::Value::{Null as JNull, Object, String as JString}; @@ -378,6 +380,20 @@ impl JsonEqual for DecimalArray { } } +impl JsonEqual for Decimal256Array { + fn equals_json(&self, json: &[&Value]) -> bool { + if self.len() != json.len() { + return false; + } + + (0..self.len()).all(|i| match json[i] { + JString(s) => self.is_valid(i) && (s == &self.value(i).to_string()), + JNull => self.is_null(i), + _ => false, + }) + } +} + impl PartialEq for DecimalArray { fn eq(&self, json: &Value) -> bool { match json { diff --git a/arrow/src/array/iterator.rs b/arrow/src/array/iterator.rs index bc70d1a2a8ed..9ac2d0642d44 100644 --- a/arrow/src/array/iterator.rs +++ b/arrow/src/array/iterator.rs @@ -15,6 +15,7 @@ // specific language governing permissions and limitations // under the License. +use crate::array::BasicDecimalArray; use crate::datatypes::ArrowPrimitiveType; use super::{ diff --git a/arrow/src/array/mod.rs b/arrow/src/array/mod.rs index 90c3b21fbd2a..60703c53b5b5 100644 --- a/arrow/src/array/mod.rs +++ b/arrow/src/array/mod.rs @@ -181,6 +181,8 @@ pub use self::array_binary::BinaryArray; pub use self::array_binary::FixedSizeBinaryArray; pub use self::array_binary::LargeBinaryArray; pub use self::array_boolean::BooleanArray; +pub use self::array_decimal::BasicDecimalArray; +pub use self::array_decimal::Decimal256Array; pub use self::array_decimal::DecimalArray; pub use self::array_dictionary::DictionaryArray; pub use self::array_list::FixedSizeListArray; @@ -468,6 +470,7 @@ pub use self::builder::BinaryBuilder; pub use self::builder::BooleanBufferBuilder; pub use self::builder::BooleanBuilder; pub use self::builder::BufferBuilder; +pub use self::builder::Decimal256Builder; pub use self::builder::DecimalBuilder; pub use self::builder::FixedSizeBinaryBuilder; pub use self::builder::FixedSizeListBuilder; diff --git a/arrow/src/array/ord.rs b/arrow/src/array/ord.rs index be910f96bd54..019b1163b50a 100644 --- a/arrow/src/array/ord.rs +++ b/arrow/src/array/ord.rs @@ -19,6 +19,7 @@ use std::cmp::Ordering; +use crate::array::BasicDecimalArray; use crate::array::*; use crate::datatypes::TimeUnit; use crate::datatypes::*; diff --git a/arrow/src/compute/kernels/cast.rs b/arrow/src/compute/kernels/cast.rs index 73288c04037b..3dd2ad69264e 100644 --- a/arrow/src/compute/kernels/cast.rs +++ b/arrow/src/compute/kernels/cast.rs @@ -38,6 +38,7 @@ use std::str; use std::sync::Arc; +use crate::array::BasicDecimalArray; use crate::buffer::MutableBuffer; use crate::compute::kernels::arithmetic::{divide, multiply}; use crate::compute::kernels::arity::unary; @@ -2134,6 +2135,7 @@ where #[cfg(test)] mod tests { use super::*; + use crate::array::BasicDecimalArray; use crate::util::decimal::Decimal128; use crate::{buffer::Buffer, util::display::array_value_to_string}; diff --git a/arrow/src/compute/kernels/sort.rs b/arrow/src/compute/kernels/sort.rs index e399cf9f0c19..8e0831c6140e 100644 --- a/arrow/src/compute/kernels/sort.rs +++ b/arrow/src/compute/kernels/sort.rs @@ -17,6 +17,7 @@ //! Defines sort kernel for `ArrayRef` +use crate::array::BasicDecimalArray; use crate::array::*; use crate::buffer::MutableBuffer; use crate::compute::take; diff --git a/arrow/src/compute/kernels/take.rs b/arrow/src/compute/kernels/take.rs index 57471f459b97..fa907656ae8c 100644 --- a/arrow/src/compute/kernels/take.rs +++ b/arrow/src/compute/kernels/take.rs @@ -19,6 +19,8 @@ use std::{ops::AddAssign, sync::Arc}; +use crate::array::BasicDecimalArray; + use crate::buffer::{Buffer, MutableBuffer}; use crate::compute::util::{ take_value_indices_from_fixed_size_list, take_value_indices_from_list, diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index 639c0b42afea..7250f943e48a 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -1115,6 +1115,7 @@ mod tests { use std::io::{Cursor, Write}; use tempfile::NamedTempFile; + use crate::array::BasicDecimalArray; use crate::array::*; use crate::compute::cast; use crate::datatypes::Field; diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs index f3cb58d84e64..f1c468926d5f 100644 --- a/arrow/src/datatypes/datatype.rs +++ b/arrow/src/datatypes/datatype.rs @@ -354,6 +354,11 @@ pub const DECIMAL_DEFAULT_SCALE: usize = 10; /// interpreted as a Decimal number with precision `precision` #[inline] pub(crate) fn validate_decimal_precision(value: i128, precision: usize) -> Result { + // TODO: add validation logic for precision > 38 + if precision > 38 { + return Ok(value); + } + let max = MAX_DECIMAL_FOR_EACH_PRECISION[precision - 1]; let min = MIN_DECIMAL_FOR_EACH_PRECISION[precision - 1]; diff --git a/arrow/src/util/decimal.rs b/arrow/src/util/decimal.rs index 7649b6b417fc..4d67245647d6 100644 --- a/arrow/src/util/decimal.rs +++ b/arrow/src/util/decimal.rs @@ -107,6 +107,7 @@ pub struct Decimal128 { impl Decimal128 { /// Creates `Decimal128` from an `i128` value. + #[allow(dead_code)] pub(crate) fn new_from_i128(precision: usize, scale: usize, value: i128) -> Self { Decimal128 { precision, diff --git a/arrow/src/util/display.rs b/arrow/src/util/display.rs index 220aa59ad406..7a7da8ccb0f5 100644 --- a/arrow/src/util/display.rs +++ b/arrow/src/util/display.rs @@ -23,6 +23,7 @@ use std::fmt::Write; use std::sync::Arc; use crate::array::Array; +use crate::array::BasicDecimalArray; use crate::datatypes::{ ArrowNativeType, ArrowPrimitiveType, DataType, Field, Int16Type, Int32Type, Int64Type, Int8Type, TimeUnit, UInt16Type, UInt32Type, UInt64Type, UInt8Type, diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index fa14281184d8..73f46f971f95 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -23,6 +23,7 @@ use std::sync::Arc; use arrow::array as arrow_array; use arrow::array::ArrayRef; +use arrow::array::BasicDecimalArray; use arrow::datatypes::{DataType as ArrowDataType, IntervalUnit, SchemaRef}; use arrow::record_batch::RecordBatch; use arrow_array::Array;