speed up the decimal validation and add benchmark test

apache · Aug 8, 2022 · 6ebf1da · 6ebf1da
1 parent 5fae299
commit 6ebf1da
Show file tree

Hide file tree

Showing 4 changed files with 242 additions and 5 deletions.
diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml
@@ -204,3 +204,7 @@ required-features = ["test_utils"]
 [[bench]]
 name = "array_data_validate"
 harness = false
+
+[[bench]]
+name = "decimal_validate"
+harness = false
diff --git a/arrow/benches/decimal_validate.rs b/arrow/benches/decimal_validate.rs
@@ -0,0 +1,50 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#[macro_use]
+extern crate criterion;
+use criterion::Criterion;
+
+extern crate arrow;
+
+use arrow::{array::*, buffer::Buffer, datatypes::DataType};
+
+
+/// Benchmark body for the baseline path: runs `validate_decimal_precision`
+/// over `array` with a precision of 35 and panics if it reports an error.
+fn validate_decimal128_array_slow(array: &Decimal128Array) {
+    let outcome = array.validate_decimal_precision(35);
+    outcome.unwrap();
+}
+
+/// Benchmark body for the byte-based path: runs `validate_decimal_with_bytes`
+/// over `array` with a precision of 35 and panics if it reports an error.
+fn validate_decimal128_array_fast(array: &Decimal128Array) {
+    let outcome = array.validate_decimal_with_bytes(35);
+    outcome.unwrap();
+}
+
+/// Registers the two Decimal128 validation benchmarks with Criterion.
+///
+/// Both benchmarks run against the same array of 20000 elements, each holding
+/// the value 12324, so their timings are directly comparable.
+fn validate_benchmark(c: &mut Criterion) {
+    let values = vec![12324; 20000];
+    let decimal_array = Decimal128Array::from_iter_values(values);
+
+    // Baseline: `validate_decimal_precision`.
+    c.bench_function("validate_decimal128_array_slow 20000", |bencher| {
+        bencher.iter(|| validate_decimal128_array_slow(&decimal_array))
+    });
+
+    // Candidate: `validate_decimal_with_bytes`.
+    c.bench_function("validate_decimal128_array_fast 20000", |bencher| {
+        bencher.iter(|| validate_decimal128_array_fast(&decimal_array))
+    });
+}
+
+criterion_group!(benches, validate_benchmark);
+criterion_main!(benches);
diff --git a/arrow/src/array/array_decimal.rs b/arrow/src/array/array_decimal.rs
@@ -21,6 +21,7 @@ use std::borrow::Borrow;
 use std::convert::From;
 use std::fmt;
 use std::{any::Any, iter::FromIterator};
+use crate::array::array_decimal::private_decimal::DecimalArrayPrivate;
 
 use super::{
     array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, FixedSizeListArray,
@@ -29,10 +30,7 @@ use super::{BooleanBufferBuilder, FixedSizeBinaryArray};
 #[allow(deprecated)]
 pub use crate::array::DecimalIter;
 use crate::buffer::{Buffer, MutableBuffer};
-use crate::datatypes::{
-    validate_decimal256_precision, validate_decimal_precision, DECIMAL256_MAX_PRECISION,
-    DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE,
-};
+use crate::datatypes::{validate_decimal256_precision, validate_decimal_precision, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, validate_decimal_precision_with_bytes};
 use crate::datatypes::{DataType, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE};
 use crate::error::{ArrowError, Result};
 use crate::util::decimal::{BasicDecimal, Decimal128, Decimal256};
@@ -92,7 +90,7 @@ mod private_decimal {
 }
 
 pub trait BasicDecimalArray<T: BasicDecimal, U: From<ArrayData>>:
-    private_decimal::DecimalArrayPrivate
+    DecimalArrayPrivate
 {
     const VALUE_LENGTH: i32;
     const DEFAULT_TYPE: DataType;
@@ -307,6 +305,8 @@ pub trait BasicDecimalArray<T: BasicDecimal, U: From<ArrayData>>:
     /// Validates decimal values in this array can be properly interpreted
     /// with the specified precision.
     fn validate_decimal_precision(&self, precision: usize) -> Result<()>;
+
+    /// Validates the decimal values in this array against the specified
+    /// `precision`, returning an error if validation fails.
+    ///
+    /// Counterpart of `validate_decimal_precision`: an implementation may
+    /// inspect each value's raw byte representation instead of a decoded
+    /// number (see the `Decimal128Array` implementation).
+    fn validate_decimal_with_bytes(&self, precision: usize) -> Result<()>;
 }
 
 impl BasicDecimalArray<Decimal128, Decimal128Array> for Decimal128Array {
@@ -336,6 +336,31 @@ impl BasicDecimalArray<Decimal128, Decimal128Array> for Decimal128Array {
         }
         Ok(())
     }
+
+    /// Checks every non-null value of this array against `precision` by
+    /// passing its raw `VALUE_LENGTH`-byte representation to
+    /// `validate_decimal_precision_with_bytes`.
+    ///
+    /// Null slots are skipped. Returns the first error reported for a value,
+    /// or `Ok(())` when every value passes.
+    fn validate_decimal_with_bytes(&self, precision: usize) -> Result<()> {
+        let array_data = &self.data;
+
+        for index in 0..array_data.len() {
+            if self.is_null(index) {
+                continue;
+            }
+
+            // Shift the logical index by the array's offset before turning
+            // it into a byte position inside the value buffer.
+            let shifted = index + array_data.offset();
+            let value_bytes = unsafe {
+                // NOTE(review): relies on the value buffer holding
+                // `VALUE_LENGTH` bytes for every slot up to offset + len;
+                // confirm this invariant is enforced at construction.
+                let start = self.value_offset_at(shifted);
+                std::slice::from_raw_parts(
+                    self.raw_value_data_ptr().offset(start as isize),
+                    Self::VALUE_LENGTH as usize,
+                )
+            };
+            validate_decimal_precision_with_bytes(value_bytes, precision)?;
+        }
+
+        Ok(())
+    }
 }
 
 impl BasicDecimalArray<Decimal256, Decimal256Array> for Decimal256Array {
@@ -365,6 +390,10 @@ impl BasicDecimalArray<Decimal256, Decimal256Array> for Decimal256Array {
         }
         Ok(())
     }
+
+    /// Validates the decimal values in this array against `precision`.
+    ///
+    /// No byte-level fast path exists for `Decimal256Array` yet, so this
+    /// delegates to `validate_decimal_precision` rather than panicking via
+    /// `todo!()`; callers get a real `Result` instead of an unconditional
+    /// panic from a fallible API.
+    fn validate_decimal_with_bytes(&self, precision: usize) -> Result<()> {
+        self.validate_decimal_precision(precision)
+    }
 }
 
 impl Decimal128Array {

diff --git a/arrow/src/datatypes/datatype.rs b/arrow/src/datatypes/datatype.rs
@@ -15,6 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
+use std::cmp::Ordering;
 use num::{BigInt, Num, ToPrimitive};
 use std::fmt;
 
@@ -263,6 +264,89 @@ impl fmt::Display for DataType {
     }
 }
 
+/// Largest value allowed at each Decimal128 precision, stored as the 16
+/// little-endian bytes of the corresponding `i128`.
+///
+/// The table is indexed by `precision - 1`: entry 0 holds `9` (precision 1)
+/// and entry 37 holds the 38-digit value of all nines (precision 38).
+pub(crate) const MAX_DECIMAL_BYTES_FOR_EACH_PRECISION : [[u8;16];38]  = [
+    9_i128.to_le_bytes(),
+    99_i128.to_le_bytes(),
+    999_i128.to_le_bytes(),
+    9999_i128.to_le_bytes(),
+    99999_i128.to_le_bytes(),
+    999999_i128.to_le_bytes(),
+    9999999_i128.to_le_bytes(),
+    99999999_i128.to_le_bytes(),
+    999999999_i128.to_le_bytes(),
+    9999999999_i128.to_le_bytes(),
+    99999999999_i128.to_le_bytes(),
+    999999999999_i128.to_le_bytes(),
+    9999999999999_i128.to_le_bytes(),
+    99999999999999_i128.to_le_bytes(),
+    999999999999999_i128.to_le_bytes(),
+    9999999999999999_i128.to_le_bytes(),
+    99999999999999999_i128.to_le_bytes(),
+    999999999999999999_i128.to_le_bytes(),
+    9999999999999999999_i128.to_le_bytes(),
+    99999999999999999999_i128.to_le_bytes(),
+    999999999999999999999_i128.to_le_bytes(),
+    9999999999999999999999_i128.to_le_bytes(),
+    99999999999999999999999_i128.to_le_bytes(),
+    999999999999999999999999_i128.to_le_bytes(),
+    9999999999999999999999999_i128.to_le_bytes(),
+    99999999999999999999999999_i128.to_le_bytes(),
+    999999999999999999999999999_i128.to_le_bytes(),
+    9999999999999999999999999999_i128.to_le_bytes(),
+    99999999999999999999999999999_i128.to_le_bytes(),
+    999999999999999999999999999999_i128.to_le_bytes(),
+    9999999999999999999999999999999_i128.to_le_bytes(),
+    99999999999999999999999999999999_i128.to_le_bytes(),
+    999999999999999999999999999999999_i128.to_le_bytes(),
+    9999999999999999999999999999999999_i128.to_le_bytes(),
+    99999999999999999999999999999999999_i128.to_le_bytes(),
+    999999999999999999999999999999999999_i128.to_le_bytes(),
+    9999999999999999999999999999999999999_i128.to_le_bytes(),
+    99999999999999999999999999999999999999_i128.to_le_bytes(),
+];
+
+/// Smallest (most negative) value allowed at each Decimal128 precision,
+/// stored as the 16 little-endian bytes of the corresponding `i128`.
+///
+/// The table is indexed by `precision - 1`: entry 0 holds `-9` (precision 1)
+/// and entry 37 holds the negated 38-digit value of all nines (precision 38).
+pub(crate) const MIN_DECIMAL_BYTES_FOR_EACH_PRECISION: [[u8; 16]; 38] = [
+    (-9_i128).to_le_bytes(),
+    (-99_i128).to_le_bytes(),
+    (-999_i128).to_le_bytes(),
+    (-9999_i128).to_le_bytes(),
+    (-99999_i128).to_le_bytes(),
+    (-999999_i128).to_le_bytes(),
+    (-9999999_i128).to_le_bytes(),
+    (-99999999_i128).to_le_bytes(),
+    (-999999999_i128).to_le_bytes(),
+    (-9999999999_i128).to_le_bytes(),
+    (-99999999999_i128).to_le_bytes(),
+    (-999999999999_i128).to_le_bytes(),
+    (-9999999999999_i128).to_le_bytes(),
+    (-99999999999999_i128).to_le_bytes(),
+    (-999999999999999_i128).to_le_bytes(),
+    (-9999999999999999_i128).to_le_bytes(),
+    (-99999999999999999_i128).to_le_bytes(),
+    (-999999999999999999_i128).to_le_bytes(),
+    (-9999999999999999999_i128).to_le_bytes(),
+    (-99999999999999999999_i128).to_le_bytes(),
+    (-999999999999999999999_i128).to_le_bytes(),
+    (-9999999999999999999999_i128).to_le_bytes(),
+    (-99999999999999999999999_i128).to_le_bytes(),
+    (-999999999999999999999999_i128).to_le_bytes(),
+    (-9999999999999999999999999_i128).to_le_bytes(),
+    (-99999999999999999999999999_i128).to_le_bytes(),
+    (-999999999999999999999999999_i128).to_le_bytes(),
+    (-9999999999999999999999999999_i128).to_le_bytes(),
+    (-99999999999999999999999999999_i128).to_le_bytes(),
+    (-999999999999999999999999999999_i128).to_le_bytes(),
+    (-9999999999999999999999999999999_i128).to_le_bytes(),
+    (-99999999999999999999999999999999_i128).to_le_bytes(),
+    (-999999999999999999999999999999999_i128).to_le_bytes(),
+    (-9999999999999999999999999999999999_i128).to_le_bytes(),
+    (-99999999999999999999999999999999999_i128).to_le_bytes(),
+    (-999999999999999999999999999999999999_i128).to_le_bytes(),
+    (-9999999999999999999999999999999999999_i128).to_le_bytes(),
+    (-99999999999999999999999999999999999999_i128).to_le_bytes(),
+];
+
 /// `MAX_DECIMAL_FOR_EACH_PRECISION[p]` holds the maximum `i128` value
 /// that can be stored in [DataType::Decimal128] value of precision `p`
 pub const MAX_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [
@@ -479,6 +563,76 @@ pub(crate) fn validate_decimal_precision(value: i128, precision: usize) -> Resul
     }
 }
 
+/// Orders two byte slices as signed, little-endian two's-complement integers
+/// of the same width.
+///
+/// # Panics
+///
+/// Panics if the slices differ in length or are empty.
+#[inline]
+fn singed_cmp_le_bytes(left: &[u8], right: &[u8]) -> Ordering {
+    assert_eq!(
+        left.len(),
+        right.len(),
+        "Can't compare bytes array with different len: {}, {}",
+        left.len(),
+        right.len()
+    );
+    assert_ne!(left.len(), 0, "Can't compare bytes array of length 0");
+
+    // In little-endian layout the last byte is the most significant one, and
+    // its top bit is the sign bit.
+    let is_negative = |bytes: &[u8]| (bytes[bytes.len() - 1] & 0x80_u8) != 0;
+
+    match (is_negative(left), is_negative(right)) {
+        // A negative value always sorts before a non-negative one.
+        (true, false) => Ordering::Less,
+        (false, true) => Ordering::Greater,
+        // With equal signs, two's-complement order matches the unsigned
+        // byte-wise order taken from the most significant byte downwards.
+        _ => left.iter().rev().cmp(right.iter().rev()),
+    }
+}
+
+/// Validates that `lt_value`, the little-endian bytes of a Decimal128 value,
+/// lies within the range allowed for `precision`.
+///
+/// The bytes are compared directly against the per-precision bound tables, so
+/// the value is never decoded into an `i128`.
+///
+/// # Errors
+///
+/// Returns `ArrowError::InvalidArgumentError` when `precision` is 0 or exceeds
+/// `DECIMAL128_MAX_PRECISION`, or when the value is above the maximum or below
+/// the minimum for `precision`.
+///
+/// # Panics
+///
+/// Panics if `lt_value` is not exactly 16 bytes long, because the byte
+/// comparison asserts that both operands have the same length.
+pub(crate) fn validate_decimal_precision_with_bytes(
+    lt_value: &[u8],
+    precision: usize,
+) -> Result<i128> {
+    // The bound tables are indexed by `precision - 1`; rejecting 0 up front
+    // avoids an arithmetic underflow on that subtraction.
+    if precision == 0 {
+        return Err(ArrowError::InvalidArgumentError(format!(
+            "Min precision of a Decimal128 is 1, but got {}",
+            precision,
+        )));
+    }
+    if precision > DECIMAL128_MAX_PRECISION {
+        return Err(ArrowError::InvalidArgumentError(format!(
+            "Max precision of a Decimal128 is {}, but got {}",
+            DECIMAL128_MAX_PRECISION, precision,
+        )));
+    }
+
+    let max = MAX_DECIMAL_BYTES_FOR_EACH_PRECISION[precision - 1];
+    let min = MIN_DECIMAL_BYTES_FOR_EACH_PRECISION[precision - 1];
+    if singed_cmp_le_bytes(lt_value, &max) == Ordering::Greater {
+        Err(ArrowError::InvalidArgumentError(format!(
+            "{:?} is too large to store in a Decimal128 of precision {}. Max is {:?}",
+            lt_value, precision, max
+        )))
+    } else if singed_cmp_le_bytes(lt_value, &min) == Ordering::Less {
+        Err(ArrowError::InvalidArgumentError(format!(
+            "{:?} is too small to store in a Decimal128 of precision {}. Min is {:?}",
+            lt_value, precision, min
+        )))
+    } else {
+        // NOTE(review): the success value is a constant placeholder, not the
+        // decoded decimal; it is kept as-is so existing callers see no change.
+        Ok(1)
+    }
+}
+
+/// Placeholder for a byte-level Decimal256 precision check.
+///
+/// The body is empty: the function returns `()` without inspecting `lt_value`
+/// or `precision`, so it currently accepts every input.
+// NOTE(review): presumably meant to mirror the Decimal128 byte-level
+// validation; the `()` return type leaves no way to report a failure.
+pub(crate) fn validate_decimal256_precision_with_bytes(lt_value : &[u8], precision: usize) {
+
+}
+
 /// Validates that the specified string value can be properly
 /// interpreted as a Decimal256 number with precision `precision`
 #[inline]