From 6367da4154d8b789e5e6f6e6c15e117f5d31b261 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 30 May 2022 19:17:42 -0700 Subject: [PATCH 01/10] Remove precision check --- arrow/src/array/builder.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/arrow/src/array/builder.rs b/arrow/src/array/builder.rs index 091a51b15470..837504455ee5 100644 --- a/arrow/src/array/builder.rs +++ b/arrow/src/array/builder.rs @@ -1462,7 +1462,6 @@ impl DecimalBuilder { /// distinct array element. #[inline] pub fn append_value(&mut self, value: i128) -> Result<()> { - let value = validate_decimal_precision(value, self.precision)?; let value_as_bytes = Self::from_i128_to_fixed_size_bytes( value, self.builder.value_length() as usize, From e2ad5861989c557d2bdeae1ee5e2f3f596d3fd12 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 30 May 2022 19:53:18 -0700 Subject: [PATCH 02/10] Remove test --- arrow/src/array/array_binary.rs | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index b8bf76dc7b81..f037ee3adab2 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -1497,38 +1497,6 @@ mod tests { assert_eq!(16, decimal_array.value_length()); } - #[test] - fn test_decimal_append_error_value() { - let mut decimal_builder = DecimalBuilder::new(10, 5, 3); - let mut result = decimal_builder.append_value(123456); - let mut error = result.unwrap_err(); - assert_eq!( - "Invalid argument error: 123456 is too large to store in a Decimal of precision 5. Max is 99999", - error.to_string() - ); - decimal_builder.append_value(12345).unwrap(); - let arr = decimal_builder.finish(); - assert_eq!("12.345", arr.value_as_string(0)); - - decimal_builder = DecimalBuilder::new(10, 2, 1); - result = decimal_builder.append_value(100); - error = result.unwrap_err(); - assert_eq!( - "Invalid argument error: 100 is too large to store in a Decimal of precision 2. Max is 99", - error.to_string() - ); - decimal_builder.append_value(99).unwrap(); - result = decimal_builder.append_value(-100); - error = result.unwrap_err(); - assert_eq!( - "Invalid argument error: -100 is too small to store in a Decimal of precision 2. Min is -99", - error.to_string() - ); - decimal_builder.append_value(-99).unwrap(); - let arr = decimal_builder.finish(); - assert_eq!("9.9", arr.value_as_string(0)); - assert_eq!("-9.9", arr.value_as_string(1)); - } #[test] fn test_decimal_from_iter_values() { let array = DecimalArray::from_iter_values(vec![-100, 0, 101].into_iter()); From da449305cd0708f15fee03ac112436296150d9c4 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Mon, 30 May 2022 20:09:49 -0700 Subject: [PATCH 03/10] Fix clippy --- arrow/src/array/array_binary.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index f037ee3adab2..6c98080fc5c4 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -1056,7 +1056,7 @@ mod tests { use std::sync::Arc; use crate::{ - array::{DecimalBuilder, LargeListArray, ListArray}, + array::{LargeListArray, ListArray}, datatypes::{Field, Schema}, record_batch::RecordBatch, }; From 3a221cefa0150890fbd0324fe916b18c12c2dd46 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 31 May 2022 10:03:36 -0700 Subject: [PATCH 04/10] Move decimal precision check to ArrayData full validation and add test --- arrow/src/array/array_binary.rs | 23 ++++++++++++- arrow/src/array/builder.rs | 2 +- arrow/src/array/data.rs | 59 +++++++++++++++++++++++++++++++-- 3 files changed, 79 insertions(+), 5 deletions(-) diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index 6c98080fc5c4..02d90f6c7d56 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -1056,7 +1056,7 @@ mod tests { use std::sync::Arc; use crate::{ - array::{LargeListArray, ListArray}, + array::{DecimalBuilder, LargeListArray, ListArray}, datatypes::{Field, Schema}, record_batch::RecordBatch, }; @@ -1497,6 +1497,27 @@ mod tests { assert_eq!(16, decimal_array.value_length()); } + #[test] + fn test_decimal_append_error_value() { + let mut decimal_builder = DecimalBuilder::new(10, 5, 3); + let mut result = decimal_builder.append_value(123456); + assert!(result.is_ok()); + decimal_builder.append_value(12345).unwrap(); + let arr = decimal_builder.finish(); + assert_eq!("12.345", arr.value_as_string(1)); + + decimal_builder = DecimalBuilder::new(10, 2, 1); + result = decimal_builder.append_value(100); + assert!(result.is_ok()); + decimal_builder.append_value(99).unwrap(); + result = decimal_builder.append_value(-100); + assert!(result.is_ok()); + decimal_builder.append_value(-99).unwrap(); + let arr = decimal_builder.finish(); + assert_eq!("9.9", arr.value_as_string(1)); + assert_eq!("-9.9", arr.value_as_string(3)); + } + #[test] fn test_decimal_from_iter_values() { let array = DecimalArray::from_iter_values(vec![-100, 0, 101].into_iter()); diff --git a/arrow/src/array/builder.rs b/arrow/src/array/builder.rs index 837504455ee5..8d754863c83d 100644 --- a/arrow/src/array/builder.rs +++ b/arrow/src/array/builder.rs @@ -1477,7 +1477,7 @@ impl DecimalBuilder { self.builder.append(true) } - fn from_i128_to_fixed_size_bytes(v: i128, size: usize) -> Result> { + pub(crate) fn from_i128_to_fixed_size_bytes(v: i128, size: usize) -> Result> { if size > 16 { return Err(ArrowError::InvalidArgumentError( "DecimalBuilder only supports values up to 16 bytes.".to_string(), diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index dcf382ae0151..2c18e5b462c4 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -18,7 +18,7 @@ //! Contains `ArrayData`, a generic representation of Arrow array data which encapsulates //! common attributes and operations for Arrow array. -use crate::datatypes::{DataType, IntervalUnit, UnionMode}; +use crate::datatypes::{validate_decimal_precision, DataType, IntervalUnit, UnionMode}; use crate::error::{ArrowError, Result}; use crate::{bitmap::Bitmap, datatypes::ArrowNativeType}; use crate::{ @@ -999,6 +999,27 @@ impl ArrayData { pub fn validate_dictionary_offset(&self) -> Result<()> { match &self.data_type { + DataType::Decimal(p, _) => { + let values_buffer = &self.buffers[0]; + + for pos in 0..values_buffer.len() { + let raw_val = unsafe { + std::slice::from_raw_parts( + values_buffer.as_ptr().offset(pos as isize), + 16 as usize, + ) + }; + let as_array = raw_val.try_into(); + match as_array { + Ok(v) if raw_val.len() == 16 => { + let value = i128::from_le_bytes(v); + validate_decimal_precision(value, *p)?; + }, + _ => panic!("The elements of ArrayData with Decimal type are not 128bit integers."), + } + } + Ok(()) + } DataType::Utf8 => self.validate_utf8::(), DataType::LargeUtf8 => self.validate_utf8::(), DataType::Binary => self.validate_offsets_full::(self.buffers[1].len()), @@ -1492,8 +1513,9 @@ mod tests { use std::ptr::NonNull; use crate::array::{ - make_array, Array, BooleanBuilder, Int32Array, Int32Builder, Int64Array, - StringArray, StructBuilder, UInt64Array, + make_array, Array, BooleanBuilder, DecimalBuilder, FixedSizeListBuilder, + Int32Array, Int32Builder, Int64Array, StringArray, StructBuilder, UInt64Array, + UInt8Builder, }; use crate::buffer::Buffer; use crate::datatypes::Field; @@ -2707,4 +2729,35 @@ mod tests { assert_eq!(array, &expected); } + + #[test] + fn test_decimal_full_validation() { + let values_builder = UInt8Builder::new(10); + let byte_width = 16; + let mut fixed_size_builder = + FixedSizeListBuilder::new(values_builder, byte_width); + let value_as_bytes = DecimalBuilder::from_i128_to_fixed_size_bytes( + 123456, + fixed_size_builder.value_length() as usize, + ) + .unwrap(); + fixed_size_builder + .values() + .append_slice(value_as_bytes.as_slice()) + .unwrap(); + fixed_size_builder.append(true).unwrap(); + let fixed_size_array = fixed_size_builder.finish(); + + // Build ArrayData for Decimal + let builder = ArrayData::builder(DataType::Decimal(5, 3)) + .len(fixed_size_array.len()) + .add_buffer(fixed_size_array.data_ref().child_data()[0].buffers()[0].clone()); + let array_data = unsafe { builder.build_unchecked() }; + let validation_result = array_data.validate_full(); + let error = validation_result.unwrap_err(); + assert_eq!( + "Invalid argument error: 123456 is too large to store in a Decimal of precision 5. Max is 99999", + error.to_string() + ); + } } From b289f11efbe720957a51e1e166fc50822cbe8d74 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 31 May 2022 10:53:38 -0700 Subject: [PATCH 05/10] Increase precision to pass existing test --- arrow/src/array/array_binary.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index 02d90f6c7d56..5a33ad96ec5a 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -1486,7 +1486,7 @@ mod tests { 192, 219, 180, 17, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 64, 36, 75, 238, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, ]; - let array_data = ArrayData::builder(DataType::Decimal(23, 6)) + let array_data = ArrayData::builder(DataType::Decimal(38, 6)) .len(2) .add_buffer(Buffer::from(&values[..])) .build() From 2d8a69c52eb3068088708adba0f8c94843f6f7fe Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 31 May 2022 11:46:32 -0700 Subject: [PATCH 06/10] Fix clippy --- arrow/src/array/data.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index 2c18e5b462c4..9c7cdd05afba 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -1005,8 +1005,8 @@ impl ArrayData { for pos in 0..values_buffer.len() { let raw_val = unsafe { std::slice::from_raw_parts( - values_buffer.as_ptr().offset(pos as isize), - 16 as usize, + values_buffer.as_ptr().add(pos), + 16_usize, ) }; let as_array = raw_val.try_into(); From 56650715ba9182da11e9739d4bfa01ae8423d320 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Tue, 31 May 2022 13:12:43 -0700 Subject: [PATCH 07/10] Fix tests --- arrow/src/array/array_binary.rs | 1 + arrow/src/array/builder.rs | 4 ++-- arrow/src/array/data.rs | 1 + arrow/src/array/transform/mod.rs | 3 +++ arrow/src/csv/reader.rs | 4 ++-- arrow/src/ffi.rs | 1 + arrow/src/ipc/reader.rs | 2 ++ arrow/src/ipc/writer.rs | 2 ++ 8 files changed, 14 insertions(+), 4 deletions(-) diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index 5a33ad96ec5a..0f4e79031a91 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -1498,6 +1498,7 @@ mod tests { } #[test] + #[cfg(not(feature = "force_validate"))] fn test_decimal_append_error_value() { let mut decimal_builder = DecimalBuilder::new(10, 5, 3); let mut result = decimal_builder.append_value(123456); diff --git a/arrow/src/array/builder.rs b/arrow/src/array/builder.rs index 8d754863c83d..9cc982d73d4e 100644 --- a/arrow/src/array/builder.rs +++ b/arrow/src/array/builder.rs @@ -3417,14 +3417,14 @@ mod tests { #[test] fn test_decimal_builder() { - let mut builder = DecimalBuilder::new(30, 23, 6); + let mut builder = DecimalBuilder::new(30, 38, 6); builder.append_value(8_887_000_000).unwrap(); builder.append_null().unwrap(); builder.append_value(-8_887_000_000).unwrap(); let decimal_array: DecimalArray = builder.finish(); - assert_eq!(&DataType::Decimal(23, 6), decimal_array.data_type()); + assert_eq!(&DataType::Decimal(38, 6), decimal_array.data_type()); assert_eq!(3, decimal_array.len()); assert_eq!(1, decimal_array.null_count()); assert_eq!(32, decimal_array.value_offset(2)); diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index 9c7cdd05afba..95c1bd7baeff 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -2731,6 +2731,7 @@ mod tests { } #[test] + #[cfg(not(feature = "force_validate"))] fn test_decimal_full_validation() { let values_builder = UInt8Builder::new(10); let byte_width = 16; diff --git a/arrow/src/array/transform/mod.rs b/arrow/src/array/transform/mod.rs index 4e47dbc291e8..2763790d9ea8 100644 --- a/arrow/src/array/transform/mod.rs +++ b/arrow/src/array/transform/mod.rs @@ -721,6 +721,7 @@ mod tests { } #[test] + #[cfg(not(feature = "force_validate"))] fn test_decimal() { let decimal_array = create_decimal_array(&[Some(1), Some(2), None, Some(3)], 10, 3); @@ -734,6 +735,7 @@ mod tests { assert_eq!(array, expected); } #[test] + #[cfg(not(feature = "force_validate"))] fn test_decimal_offset() { let decimal_array = create_decimal_array(&[Some(1), Some(2), None, Some(3)], 10, 3); @@ -748,6 +750,7 @@ mod tests { } #[test] + #[cfg(not(feature = "force_validate"))] fn test_decimal_null_offset_nulls() { let decimal_array = create_decimal_array(&[Some(1), Some(2), None, Some(3)], 10, 3); diff --git a/arrow/src/csv/reader.rs b/arrow/src/csv/reader.rs index d8841964b586..ded769fd9d72 100644 --- a/arrow/src/csv/reader.rs +++ b/arrow/src/csv/reader.rs @@ -1203,8 +1203,8 @@ mod tests { fn test_csv_reader_with_decimal() { let schema = Schema::new(vec![ Field::new("city", DataType::Utf8, false), - Field::new("lat", DataType::Decimal(26, 6), false), - Field::new("lng", DataType::Decimal(26, 6), false), + Field::new("lat", DataType::Decimal(38, 6), false), + Field::new("lng", DataType::Decimal(38, 6), false), ]); let file = File::open("test/data/decimal_test.csv").unwrap(); diff --git a/arrow/src/ffi.rs b/arrow/src/ffi.rs index b1fa9f7bbd2a..15538b619ee9 100644 --- a/arrow/src/ffi.rs +++ b/arrow/src/ffi.rs @@ -907,6 +907,7 @@ mod tests { } #[test] + #[cfg(not(feature = "force_validate"))] fn test_decimal_round_trip() -> Result<()> { // create an array natively let original_array = [Some(12345_i128), Some(-12345_i128), None] diff --git a/arrow/src/ipc/reader.rs b/arrow/src/ipc/reader.rs index 2a6619ec8335..c79fdb09389c 100644 --- a/arrow/src/ipc/reader.rs +++ b/arrow/src/ipc/reader.rs @@ -1036,6 +1036,7 @@ mod tests { use crate::{datatypes, util::integration_util::*}; #[test] + #[cfg(not(feature = "force_validate"))] fn read_generated_files_014() { let testdata = crate::util::test_util::arrow_test_data(); let version = "0.14.1"; @@ -1156,6 +1157,7 @@ mod tests { } #[test] + #[cfg(not(feature = "force_validate"))] fn read_generated_streams_014() { let testdata = crate::util::test_util::arrow_test_data(); let version = "0.14.1"; diff --git a/arrow/src/ipc/writer.rs b/arrow/src/ipc/writer.rs index ffeeadc9d99c..c42c0fd97e7d 100644 --- a/arrow/src/ipc/writer.rs +++ b/arrow/src/ipc/writer.rs @@ -1078,6 +1078,7 @@ mod tests { } #[test] + #[cfg(not(feature = "force_validate"))] fn read_and_rewrite_generated_files_014() { let testdata = crate::util::test_util::arrow_test_data(); let version = "0.14.1"; @@ -1130,6 +1131,7 @@ mod tests { } #[test] + #[cfg(not(feature = "force_validate"))] fn read_and_rewrite_generated_streams_014() { let testdata = crate::util::test_util::arrow_test_data(); let version = "0.14.1"; From d6eeba53a93e8ee828915643825e2b2caa5ad886 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 2 Jun 2022 23:17:07 -0700 Subject: [PATCH 08/10] Add disable_value_validation to optionally disable value validation at DecimalBuilder --- arrow/src/array/builder.rs | 21 +++++++++++++++++++++ integration-testing/src/lib.rs | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/arrow/src/array/builder.rs b/arrow/src/array/builder.rs index 9cc982d73d4e..f4c719d9bcd8 100644 --- a/arrow/src/array/builder.rs +++ b/arrow/src/array/builder.rs @@ -1163,6 +1163,10 @@ pub struct DecimalBuilder { builder: FixedSizeListBuilder, precision: usize, scale: usize, + + /// Should i128 values be validated for compatibility with scale and precision? + /// defaults to true + value_validation: bool, } impl ArrayBuilder for GenericBinaryBuilder { @@ -1453,15 +1457,32 @@ impl DecimalBuilder { builder: FixedSizeListBuilder::new(values_builder, byte_width), precision, scale, + value_validation: true, } } + /// Disable validation + /// + /// # Safety + /// + /// After disabling validation, caller must ensure that appended values are compatible + /// for the specified precision and scale. + pub unsafe fn disable_value_validation(&mut self) { + self.value_validation = false; + } + /// Appends a byte slice into the builder. /// /// Automatically calls the `append` method to delimit the slice appended in as a /// distinct array element. #[inline] pub fn append_value(&mut self, value: i128) -> Result<()> { + let value = if self.value_validation { + validate_decimal_precision(value, self.precision)? + } else { + value + }; + let value_as_bytes = Self::from_i128_to_fixed_size_bytes( value, self.builder.value_length() as usize, diff --git a/integration-testing/src/lib.rs b/integration-testing/src/lib.rs index c57ef32bca04..4ac8eba794c3 100644 --- a/integration-testing/src/lib.rs +++ b/integration-testing/src/lib.rs @@ -593,6 +593,10 @@ fn array_from_json( } DataType::Decimal(precision, scale) => { let mut b = DecimalBuilder::new(json_col.count, *precision, *scale); + // C++ interop tests involve incompatible decimal values + unsafe { + b.disable_value_validation(); + } for (is_valid, value) in json_col .validity .as_ref() From 48798f0a52302d62d62ae465aeb339611006d525 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 2 Jun 2022 23:31:09 -0700 Subject: [PATCH 09/10] Simplify validation code --- arrow/src/array/data.rs | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index 95c1bd7baeff..7238c783fc50 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -1009,14 +1009,8 @@ impl ArrayData { 16_usize, ) }; - let as_array = raw_val.try_into(); - match as_array { - Ok(v) if raw_val.len() == 16 => { - let value = i128::from_le_bytes(v); - validate_decimal_precision(value, *p)?; - }, - _ => panic!("The elements of ArrayData with Decimal type are not 128bit integers."), - } + let value = i128::from_le_bytes(raw_val.try_into().unwrap()); + validate_decimal_precision(value, *p)?; } Ok(()) } From 7a6be90a316e484d8a89e9ad040c9703dd6f34c5 Mon Sep 17 00:00:00 2001 From: Liang-Chi Hsieh Date: Thu, 2 Jun 2022 23:50:26 -0700 Subject: [PATCH 10/10] Update test --- arrow/src/array/array_binary.rs | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/arrow/src/array/array_binary.rs b/arrow/src/array/array_binary.rs index 0f4e79031a91..48477dd0433d 100644 --- a/arrow/src/array/array_binary.rs +++ b/arrow/src/array/array_binary.rs @@ -1502,6 +1502,16 @@ mod tests { fn test_decimal_append_error_value() { let mut decimal_builder = DecimalBuilder::new(10, 5, 3); let mut result = decimal_builder.append_value(123456); + let mut error = result.unwrap_err(); + assert_eq!( + "Invalid argument error: 123456 is too large to store in a Decimal of precision 5. Max is 99999", + error.to_string() + ); + + unsafe { + decimal_builder.disable_value_validation(); + } + result = decimal_builder.append_value(123456); assert!(result.is_ok()); decimal_builder.append_value(12345).unwrap(); let arr = decimal_builder.finish(); @@ -1509,6 +1519,16 @@ mod tests { decimal_builder = DecimalBuilder::new(10, 2, 1); result = decimal_builder.append_value(100); + error = result.unwrap_err(); + assert_eq!( + "Invalid argument error: 100 is too large to store in a Decimal of precision 2. Max is 99", + error.to_string() + ); + + unsafe { + decimal_builder.disable_value_validation(); + } + result = decimal_builder.append_value(100); assert!(result.is_ok()); decimal_builder.append_value(99).unwrap(); result = decimal_builder.append_value(-100);