Skip to content

Commit

Permalink
speed up the decimal validation and add benchmark test
Browse files Browse the repository at this point in the history
  • Loading branch information
liukun4515 committed Aug 8, 2022
1 parent 5fae299 commit 6ebf1da
Show file tree
Hide file tree
Showing 4 changed files with 242 additions and 5 deletions.
4 changes: 4 additions & 0 deletions arrow/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -204,3 +204,7 @@ required-features = ["test_utils"]
[[bench]]
name = "array_data_validate"
harness = false

[[bench]]
name = "decimal_validate"
harness = false
50 changes: 50 additions & 0 deletions arrow/benches/decimal_validate.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#[macro_use]
extern crate criterion;
use criterion::Criterion;

extern crate arrow;

use arrow::{array::*, buffer::Buffer, datatypes::DataType};


/// Benchmark body for the baseline ("slow") path: runs
/// `validate_decimal_precision` with precision 35 over `array` and panics if
/// the validation reports an error.
fn validate_decimal128_array_slow(array: &Decimal128Array) {
    let outcome = array.validate_decimal_precision(35);
    outcome.unwrap();
}

/// Benchmark body for the byte-comparing ("fast") path: runs
/// `validate_decimal_with_bytes` with precision 35 over `array` and panics if
/// the validation reports an error.
fn validate_decimal128_array_fast(array: &Decimal128Array) {
    let outcome = array.validate_decimal_with_bytes(35);
    outcome.unwrap();
}

/// Registers the two decimal-validation benchmarks with criterion.
///
/// Both benchmarks run against the same input array (20000 copies of the
/// value 12324) so that their timings are directly comparable.
fn validate_benchmark(c: &mut Criterion) {
    // Shared input for both measurements.
    let values = vec![12324; 20000];
    let input = Decimal128Array::from_iter_values(values);

    // Baseline: value-based validation.
    c.bench_function("validate_decimal128_array_slow 20000", |bencher| {
        bencher.iter(|| validate_decimal128_array_slow(&input))
    });

    // Candidate: byte-comparing validation.
    c.bench_function("validate_decimal128_array_fast 20000", |bencher| {
        bencher.iter(|| validate_decimal128_array_fast(&input))
    });
}

criterion_group!(benches, validate_benchmark);
criterion_main!(benches);
39 changes: 34 additions & 5 deletions arrow/src/array/array_decimal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use std::borrow::Borrow;
use std::convert::From;
use std::fmt;
use std::{any::Any, iter::FromIterator};
use crate::array::array_decimal::private_decimal::DecimalArrayPrivate;

use super::{
array::print_long_array, raw_pointer::RawPtrBox, Array, ArrayData, FixedSizeListArray,
Expand All @@ -29,10 +30,7 @@ use super::{BooleanBufferBuilder, FixedSizeBinaryArray};
#[allow(deprecated)]
pub use crate::array::DecimalIter;
use crate::buffer::{Buffer, MutableBuffer};
use crate::datatypes::{
validate_decimal256_precision, validate_decimal_precision, DECIMAL256_MAX_PRECISION,
DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE,
};
use crate::datatypes::{validate_decimal256_precision, validate_decimal_precision, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, DECIMAL_DEFAULT_SCALE, validate_decimal_precision_with_bytes};
use crate::datatypes::{DataType, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE};
use crate::error::{ArrowError, Result};
use crate::util::decimal::{BasicDecimal, Decimal128, Decimal256};
Expand Down Expand Up @@ -92,7 +90,7 @@ mod private_decimal {
}

pub trait BasicDecimalArray<T: BasicDecimal, U: From<ArrayData>>:
private_decimal::DecimalArrayPrivate
DecimalArrayPrivate
{
const VALUE_LENGTH: i32;
const DEFAULT_TYPE: DataType;
Expand Down Expand Up @@ -307,6 +305,8 @@ pub trait BasicDecimalArray<T: BasicDecimal, U: From<ArrayData>>:
/// Validates decimal values in this array can be properly interpreted
/// with the specified precision.
fn validate_decimal_precision(&self, precision: usize) -> Result<()>;

/// Validates that the decimal values in this array can be stored with the
/// specified precision, returning an error otherwise.
///
/// NOTE(review): presumably intended as a faster, byte-comparing alternative
/// to `validate_decimal_precision` — confirm that every implementation
/// accepts and rejects exactly the same values.
fn validate_decimal_with_bytes(&self, precision: usize) -> Result<()>;
}

impl BasicDecimalArray<Decimal128, Decimal128Array> for Decimal128Array {
Expand Down Expand Up @@ -336,6 +336,31 @@ impl BasicDecimalArray<Decimal128, Decimal128Array> for Decimal128Array {
}
Ok(())
}

/// Checks every non-null slot of this array against `precision` by handing
/// the slot's raw value bytes to `validate_decimal_precision_with_bytes`.
///
/// Null slots are skipped. The first value that fails validation aborts the
/// scan and its error is returned; otherwise `Ok(())`.
fn validate_decimal_with_bytes(&self, precision: usize) -> Result<()> {
    let data = &self.data;
    let slot_count = self.data.len();

    for slot in 0..slot_count {
        if self.is_null(slot) {
            continue;
        }

        // Translate the logical slot into a position that accounts for the
        // array's offset.
        let offset = slot + data.offset();
        // NOTE(review): soundness relies on `value_offset_at(offset)` plus
        // `VALUE_LENGTH` bytes staying inside the value buffer for every
        // `slot < data.len()` — confirm against the buffer layout invariants.
        let raw_val = unsafe {
            let pos = self.value_offset_at(offset);
            std::slice::from_raw_parts(
                self.raw_value_data_ptr().offset(pos as isize),
                Self::VALUE_LENGTH as usize,
            )
        };
        validate_decimal_precision_with_bytes(raw_val, precision)?;
    }

    Ok(())
}
}

impl BasicDecimalArray<Decimal256, Decimal256Array> for Decimal256Array {
Expand Down Expand Up @@ -365,6 +390,10 @@ impl BasicDecimalArray<Decimal256, Decimal256Array> for Decimal256Array {
}
Ok(())
}

/// Validates that the decimal values in this array can be stored with the
/// specified precision.
///
/// No byte-comparing fast path is implemented for this array type, so this
/// delegates to `validate_decimal_precision` rather than panicking.
///
/// # Errors
/// Returns whatever error `validate_decimal_precision` reports.
fn validate_decimal_with_bytes(&self, precision: usize) -> Result<()> {
    // Previously `todo!()`, which panicked on every call. Falling back to the
    // existing validation keeps the trait method usable until a dedicated
    // byte-level implementation is written.
    self.validate_decimal_precision(precision)
}
}

impl Decimal128Array {
Expand Down
154 changes: 154 additions & 0 deletions arrow/src/datatypes/datatype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
// specific language governing permissions and limitations
// under the License.

use std::cmp::Ordering;
use num::{BigInt, Num, ToPrimitive};
use std::fmt;

Expand Down Expand Up @@ -263,6 +264,89 @@ impl fmt::Display for DataType {
}
}

/// Little-endian `i128` byte representation of the largest value for each
/// decimal digit count.
///
/// Entry `i` holds `10^(i + 1) - 1` (9, 99, 999, ...), i.e. the maximum value
/// that has `i + 1` decimal digits, encoded with `i128::to_le_bytes`.
pub(crate) const MAX_DECIMAL_BYTES_FOR_EACH_PRECISION: [[u8; 16]; 38] = {
    let mut table = [[0_u8; 16]; 38];
    let mut bound: i128 = 0;
    let mut index = 0;
    // Built at compile time: each step appends one more `9` digit.
    while index < 38 {
        bound = bound * 10 + 9;
        table[index] = bound.to_le_bytes();
        index += 1;
    }
    table
};

/// Little-endian `i128` byte representation of the smallest value for each
/// decimal digit count.
///
/// Entry `i` holds `-(10^(i + 1) - 1)` (-9, -99, -999, ...), i.e. the minimum
/// value that has `i + 1` decimal digits, encoded with `i128::to_le_bytes`.
pub(crate) const MIN_DECIMAL_BYTES_FOR_EACH_PRECISION: [[u8; 16]; 38] = {
    let mut table = [[0_u8; 16]; 38];
    let mut bound: i128 = 0;
    let mut index = 0;
    // Built at compile time: each step appends one more `9` digit to the
    // magnitude of the (negative) bound.
    while index < 38 {
        bound = bound * 10 - 9;
        table[index] = bound.to_le_bytes();
        index += 1;
    }
    table
};

/// `MAX_DECIMAL_FOR_EACH_PRECISION[p]` holds the maximum `i128` value
/// that can be stored in [DataType::Decimal128] value of precision `p`
pub const MAX_DECIMAL_FOR_EACH_PRECISION: [i128; 38] = [
Expand Down Expand Up @@ -479,6 +563,76 @@ pub(crate) fn validate_decimal_precision(value: i128, precision: usize) -> Resul
}
}

/// Compares two equal-length byte slices as signed, two's-complement,
/// little-endian integers.
///
/// The sign is taken from the top bit of the last (most significant) byte.
/// Operands with different signs are ordered by sign alone; operands with the
/// same sign are ordered by comparing bytes from most to least significant.
///
/// # Panics
/// Panics if the slices differ in length or are empty.
///
/// NOTE(review): the original author tagged this as "duplicate code" —
/// presumably an equivalent comparator exists elsewhere in the crate; confirm
/// and consolidate.
#[inline]
fn singed_cmp_le_bytes(left: &[u8], right: &[u8]) -> Ordering {
    assert_eq!(
        left.len(),
        right.len(),
        "Can't compare bytes array with different len: {}, {}",
        left.len(),
        right.len()
    );
    assert_ne!(left.len(), 0, "Can't compare bytes array of length 0");

    let top = left.len() - 1;
    // A set high bit in the most significant byte marks a negative value.
    let left_is_negative = left[top] & 0x80_u8 != 0;
    let right_is_negative = right[top] & 0x80_u8 != 0;

    match (left_is_negative, right_is_negative) {
        (true, false) => Ordering::Less,
        (false, true) => Ordering::Greater,
        // Same sign: unsigned lexicographic order, most significant byte
        // first, matches the numeric order of two's-complement values.
        _ => left.iter().rev().cmp(right.iter().rev()),
    }
}

/// Checks that a Decimal128 value, supplied as raw little-endian bytes, fits
/// within `precision` decimal digits.
///
/// `lt_value` is compared as a signed integer against the precomputed byte
/// bounds for `precision`. It must have the same length as a table entry
/// (16 bytes), because the comparison helper panics on a length mismatch.
///
/// # Errors
/// Returns `ArrowError::InvalidArgumentError` when
/// * `precision` is 0 or greater than `DECIMAL128_MAX_PRECISION`, or
/// * the value is above the maximum or below the minimum for `precision`.
///
/// # Returns
/// `Ok(1)` on success. The integer is a constant placeholder and is NOT the
/// decoded value; callers should only inspect success or failure.
pub(crate) fn validate_decimal_precision_with_bytes(
    lt_value: &[u8],
    precision: usize,
) -> Result<i128> {
    // Guard the `precision - 1` table lookups below: a zero precision would
    // otherwise underflow and panic instead of reporting an error.
    if precision == 0 {
        return Err(ArrowError::InvalidArgumentError(format!(
            "Precision of a Decimal128 must be at least 1, but got {}",
            precision,
        )));
    }
    if precision > DECIMAL128_MAX_PRECISION {
        return Err(ArrowError::InvalidArgumentError(format!(
            "Max precision of a Decimal128 is {}, but got {}",
            DECIMAL128_MAX_PRECISION, precision,
        )));
    }

    // Tables are indexed by `precision - 1` (entry 0 is precision 1).
    let max = MAX_DECIMAL_BYTES_FOR_EACH_PRECISION[precision - 1];
    let min = MIN_DECIMAL_BYTES_FOR_EACH_PRECISION[precision - 1];
    if singed_cmp_le_bytes(lt_value, &max) == Ordering::Greater {
        Err(ArrowError::InvalidArgumentError(format!(
            "{:?} is too large to store in a Decimal128 of precision {}. Max is {:?}",
            lt_value, precision, max
        )))
    } else if singed_cmp_le_bytes(lt_value, &min) == Ordering::Less {
        Err(ArrowError::InvalidArgumentError(format!(
            "{:?} is too small to store in a Decimal128 of precision {}. Min is {:?}",
            lt_value, precision, min
        )))
    } else {
        Ok(1)
    }
}

/// Placeholder for a byte-based precision check of 256-bit decimal values.
///
/// NOTE(review): the body is empty — neither argument is read, nothing is
/// validated and nothing is returned, so calling this never reports an
/// out-of-range value. Presumably meant to mirror
/// `validate_decimal_precision_with_bytes` once implemented — confirm before
/// relying on it.
pub(crate) fn validate_decimal256_precision_with_bytes(lt_value : &[u8], precision: usize) {
    // Intentionally empty: not implemented yet.
}

/// Validates that the specified string value can be properly
/// interpreted as a Decimal256 number with precision `precision`
#[inline]
Expand Down

0 comments on commit 6ebf1da

Please sign in to comment.