From 880d683f3b7f98219ccabf18d49c405dbac8b880 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Tue, 10 May 2022 16:12:25 +0800 Subject: [PATCH 1/9] simplify offsets fully checking Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/data.rs | 94 ++++++++++++++++++----------------------- 1 file changed, 40 insertions(+), 54 deletions(-) diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index c0ecef75d1c0..3182b93f595d 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -731,14 +731,14 @@ impl ArrayData { &'a self, buffer: &'a Buffer, ) -> Result<&'a [T]> { - // Validate that there are the correct number of offsets for this array's length - let required_offsets = self.len + self.offset + 1; - // An empty list-like array can have 0 offsets - if buffer.is_empty() { + if buffer.is_empty() && self.len == 0 { return Ok(&[]); } + // Validate that there are the correct number of offsets for this array's length + let required_offsets = self.len + self.offset + 1; + if (buffer.len() / std::mem::size_of::()) < required_offsets { return Err(ArrowError::InvalidArgumentError(format!( "Offsets buffer size (bytes): {} isn't large enough for {}. Length {} needs {}", @@ -1049,56 +1049,42 @@ impl ArrayData { T: ArrowNativeType + std::convert::TryInto + num::Num + std::fmt::Display, V: Fn(usize, Range) -> Result<()>, { - // An empty binary-like array can have 0 offsets - if self.len == 0 && offset_buffer.is_empty() { - return Ok(()); - } - - let offsets = self.typed_offsets::(offset_buffer)?; - - offsets + let offsets = self.typed_offsets::(offset_buffer)? .iter() - .zip(offsets.iter().skip(1)) .enumerate() - .map(|(i, (&start_offset, &end_offset))| { - let start_offset: usize = start_offset - .try_into() - .map_err(|_| { - ArrowError::InvalidArgumentError(format!( - "Offset invariant failure: could not convert start_offset {} to usize in slot {}", - start_offset, i)) - })?; - let end_offset: usize = end_offset - .try_into() - .map_err(|_| { - ArrowError::InvalidArgumentError(format!( - "Offset invariant failure: Could not convert end_offset {} to usize in slot {}", - end_offset, i+1)) - })?; - - if start_offset > offset_limit { - return Err(ArrowError::InvalidArgumentError(format!( - "Offset invariant failure: offset for slot {} out of bounds: {} > {}", - i, start_offset, offset_limit)) - ); - } - - if end_offset > offset_limit { - return Err(ArrowError::InvalidArgumentError(format!( - "Offset invariant failure: offset for slot {} out of bounds: {} > {}", - i, end_offset, offset_limit)) + .map(|(i, x)| { + // check if the offset can be converted to usize + let r = x.to_usize().ok_or_else(|| { + ArrowError::InvalidArgumentError(format!( + "Offset invariant failure: Could not convert offset {} to usize at position {}", + x, i))} ); + // check if the offset exceed the limit + match r { + Ok(n) if n <= offset_limit => Ok(n), + Ok(_) => Err(ArrowError::InvalidArgumentError(format!( + "Offset invariant failure: offset at position {} out of bounds: {} > {}", + i, x, offset_limit)) + ), + err => err, } + }) + .collect::>>()?; - // check range actually is low -> high - if start_offset > end_offset { - return Err(ArrowError::InvalidArgumentError(format!( + offsets + .windows(2) + .enumerate() + .map(|(i, pair)| { + // check offsets are monotonically increasing + let (start, end) = (pair[0], pair[1]); + if start > end { + Err(ArrowError::InvalidArgumentError(format!( "Offset invariant failure: non-monotonic offset at slot {}: {} > {}", - i, start_offset, end_offset)) - ); + i, start, end)) + ) + } else { + Ok((i, start..end)) } - - Ok((i, start_offset..end_offset)) }) .try_for_each(|res: Result<(usize, Range)>| { let (item_index, range) = res?; @@ -2110,7 +2096,7 @@ mod tests { #[test] #[should_panic( - expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4" + expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" )] fn test_validate_utf8_out_of_bounds() { check_index_out_of_bounds_validation::(DataType::Utf8); @@ -2118,7 +2104,7 @@ mod tests { #[test] #[should_panic( - expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4" + expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" )] fn test_validate_large_utf8_out_of_bounds() { check_index_out_of_bounds_validation::(DataType::LargeUtf8); @@ -2126,7 +2112,7 @@ mod tests { #[test] #[should_panic( - expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4" + expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" )] fn test_validate_binary_out_of_bounds() { check_index_out_of_bounds_validation::(DataType::Binary); @@ -2134,7 +2120,7 @@ mod tests { #[test] #[should_panic( - expected = "Offset invariant failure: offset for slot 2 out of bounds: 5 > 4" + expected = "Offset invariant failure: offset at position 3 out of bounds: 5 > 4" )] fn test_validate_large_binary_out_of_bounds() { check_index_out_of_bounds_validation::(DataType::LargeBinary); @@ -2327,7 +2313,7 @@ mod tests { #[test] #[should_panic( - expected = "Offset invariant failure: offset for slot 1 out of bounds: 5 > 4" + expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4" )] fn test_validate_list_offsets() { let field_type = Field::new("f", DataType::Int32, true); @@ -2336,7 +2322,7 @@ mod tests { #[test] #[should_panic( - expected = "Offset invariant failure: offset for slot 1 out of bounds: 5 > 4" + expected = "Offset invariant failure: offset at position 2 out of bounds: 5 > 4" )] fn test_validate_large_list_offsets() { let field_type = Field::new("f", DataType::Int32, true); @@ -2346,7 +2332,7 @@ mod tests { /// Test that the list of type `data_type` generates correct errors for negative offsets #[test] #[should_panic( - expected = "Offset invariant failure: Could not convert end_offset -1 to usize in slot 2" + expected = "Offset invariant failure: Could not convert offset -1 to usize at position 2" )] fn test_validate_list_negative_offsets() { let values: Int32Array = From cc8d82b364afcc81bced684be352b659c2036e6d Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Tue, 10 May 2022 16:38:26 +0800 Subject: [PATCH 2/9] update doc rename to offsets buffer Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/data.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index 3182b93f595d..3645a1e1b227 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -1033,15 +1033,15 @@ impl ArrayData { } /// Calls the `validate(item_index, range)` function for each of - /// the ranges specified in the arrow offset buffer of type + /// the ranges specified in the arrow offsets buffer of type /// `T`. Also validates that each offset is smaller than - /// `max_offset` + /// `offset_limit` /// - /// For example, the offset buffer contained `[1, 2, 4]`, this + /// For example, the offsets buffer contained `[1, 2, 4]`, this /// function would call `validate([1,2])`, and `validate([2,4])` fn validate_each_offset( &self, - offset_buffer: &Buffer, + offsets_buffer: &Buffer, offset_limit: usize, validate: V, ) -> Result<()> @@ -1049,7 +1049,7 @@ impl ArrayData { T: ArrowNativeType + std::convert::TryInto + num::Num + std::fmt::Display, V: Fn(usize, Range) -> Result<()>, { - let offsets = self.typed_offsets::(offset_buffer)? + let offsets = self.typed_offsets::(offsets_buffer)? .iter() .enumerate() .map(|(i, x)| { @@ -1059,7 +1059,7 @@ impl ArrayData { "Offset invariant failure: Could not convert offset {} to usize at position {}", x, i))} ); - // check if the offset exceed the limit + // check if the offset exceeds the limit match r { Ok(n) if n <= offset_limit => Ok(n), Ok(_) => Err(ArrowError::InvalidArgumentError(format!( From f502da397c202c7518c8be928ff5f57b04e945d2 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Tue, 10 May 2022 17:08:30 +0800 Subject: [PATCH 3/9] add more tests for empty binary-like array Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/data.rs | 84 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index 3645a1e1b227..34bd34572301 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -1807,6 +1807,90 @@ mod tests { .unwrap(); } + #[test] + fn test_empty_utf8_array_with_empty_offsets_buffer() { + let data_buffer = Buffer::from(&[]); + let offsets_buffer = Buffer::from(&[]); + ArrayData::try_new( + DataType::Utf8, + 0, + None, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + } + + #[test] + fn test_empty_utf8_array_with_single_zero_offset() { + let data_buffer = Buffer::from(&[]); + let offsets_buffer = Buffer::from_slice_ref(&[0i32]); + ArrayData::try_new( + DataType::Utf8, + 0, + None, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + } + + #[test] + #[should_panic(expected = "First offset 1 of Utf8 is larger than values length 0")] + fn test_empty_utf8_array_with_invalid_offset() { + let data_buffer = Buffer::from(&[]); + let offsets_buffer = Buffer::from_slice_ref(&[1i32]); + ArrayData::try_new( + DataType::Utf8, + 0, + None, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + } + + #[test] + fn test_empty_utf8_array_with_non_zero_offset() { + let data_buffer = Buffer::from_slice_ref(&"abcdef".as_bytes()); + let offsets_buffer = Buffer::from_slice_ref(&[0i32, 2, 6, 0]); + ArrayData::try_new( + DataType::Utf8, + 0, + None, + None, + 3, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + } + + #[test] + #[should_panic( + expected = "Offsets buffer size (bytes): 4 isn't large enough for LargeUtf8. Length 0 needs 1" + )] + fn test_empty_large_utf8_array_with_wrong_type_offsets() { + let data_buffer = Buffer::from(&[]); + let offsets_buffer = Buffer::from_slice_ref(&[0i32]); + ArrayData::try_new( + DataType::LargeUtf8, + 0, + None, + None, + 0, + vec![offsets_buffer, data_buffer], + vec![], + ) + .unwrap(); + } + #[test] #[should_panic( expected = "Offsets buffer size (bytes): 8 isn't large enough for Utf8. Length 2 needs 3" From ab203acfdaf9b83dfe5077286471269cccb1b574 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Tue, 10 May 2022 17:15:27 +0800 Subject: [PATCH 4/9] add docs for empty binary-like array Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/data.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index 34bd34572301..763a993af148 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -726,7 +726,9 @@ impl ArrayData { /// Returns a reference to the data in `buffer` as a typed slice /// (typically `&[i32]` or `&[i64]`) after validating. The /// returned slice is guaranteed to have at least `self.len + 1` - /// entries + /// entries. + /// + /// For an empty array, the `buffer` can also be empty`. fn typed_offsets<'a, T: ArrowNativeType + num::Num + std::fmt::Display>( &'a self, buffer: &'a Buffer, @@ -1037,6 +1039,9 @@ impl ArrayData { /// `T`. Also validates that each offset is smaller than /// `offset_limit` /// + /// For an empty array, the offsets buffer can either be empty + /// or contain a single `0`. + /// /// For example, the offsets buffer contained `[1, 2, 4]`, this /// function would call `validate([1,2])`, and `validate([2,4])` fn validate_each_offset( From 8f07d6429b0e268cfae7971f78cd065ad97e131e Mon Sep 17 00:00:00 2001 From: Remzi Yang <59198230+HaoYang670@users.noreply.github.com> Date: Wed, 11 May 2022 10:52:23 +0800 Subject: [PATCH 5/9] Update arrow/src/array/data.rs fix a nit in docs. Co-authored-by: Liang-Chi Hsieh --- arrow/src/array/data.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index 763a993af148..8cb2550171ab 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -728,7 +728,7 @@ impl ArrayData { /// returned slice is guaranteed to have at least `self.len + 1` /// entries. /// - /// For an empty array, the `buffer` can also be empty`. + /// For an empty array, the `buffer` can also be empty. fn typed_offsets<'a, T: ArrowNativeType + num::Num + std::fmt::Display>( &'a self, buffer: &'a Buffer, From 25d3935c1668a9c276bb6d039e619db81f2c3f38 Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Wed, 11 May 2022 11:32:17 +0800 Subject: [PATCH 6/9] add benchmark Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/Cargo.toml | 4 +++ arrow/benches/array_data_validate.rs | 48 ++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 arrow/benches/array_data_validate.rs diff --git a/arrow/Cargo.toml b/arrow/Cargo.toml index 2e734e4acc78..61776d8f0407 100644 --- a/arrow/Cargo.toml +++ b/arrow/Cargo.toml @@ -175,3 +175,7 @@ harness = false [[bench]] name = "string_kernels" harness = false + +[[bench]] +name = "array_data_validate" +harness = false diff --git a/arrow/benches/array_data_validate.rs b/arrow/benches/array_data_validate.rs new file mode 100644 index 000000000000..32e548a29195 --- /dev/null +++ b/arrow/benches/array_data_validate.rs @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#[macro_use] +extern crate criterion; +use criterion::Criterion; + +extern crate arrow; + +use arrow::{array::*, buffer::Buffer, datatypes::DataType}; + +fn create_binary_array_data(length: i32) -> ArrayData { + let value_buffer = Buffer::from_iter(0_i32..length); + let offsets_buffer = Buffer::from_iter(0_i32..length + 1); + ArrayData::try_new( + DataType::Binary, + length as usize, + None, + None, + 0, + vec![offsets_buffer, value_buffer], + vec![], + ) + .unwrap() +} + +fn array_slice_benchmark(c: &mut Criterion) { + c.bench_function("validate_binary_array_data 20000", |b| { + b.iter(|| create_binary_array_data(20000)) + }); +} + +criterion_group!(benches, array_slice_benchmark); +criterion_main!(benches); From 0e61afa631590f826deb172a7cb122bf2a7d6e2c Mon Sep 17 00:00:00 2001 From: remzi <13716567376yh@gmail.com> Date: Wed, 11 May 2022 14:02:31 +0800 Subject: [PATCH 7/9] replace windows with scan Signed-off-by: remzi <13716567376yh@gmail.com> --- arrow/src/array/data.rs | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index 8cb2550171ab..2954c0c5aea5 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -1054,7 +1054,7 @@ impl ArrayData { T: ArrowNativeType + std::convert::TryInto + num::Num + std::fmt::Display, V: Fn(usize, Range) -> Result<()>, { - let offsets = self.typed_offsets::(offsets_buffer)? + self.typed_offsets::(offsets_buffer)? .iter() .enumerate() .map(|(i, x)| { @@ -1066,34 +1066,33 @@ impl ArrayData { ); // check if the offset exceeds the limit match r { - Ok(n) if n <= offset_limit => Ok(n), + Ok(n) if n <= offset_limit => Ok((i, n)), Ok(_) => Err(ArrowError::InvalidArgumentError(format!( "Offset invariant failure: offset at position {} out of bounds: {} > {}", i, x, offset_limit)) ), - err => err, + Err(e) => Err(e), } }) - .collect::>>()?; - - offsets - .windows(2) - .enumerate() - .map(|(i, pair)| { + .scan(0_usize, |start, end| { // check offsets are monotonically increasing - let (start, end) = (pair[0], pair[1]); - if start > end { - Err(ArrowError::InvalidArgumentError(format!( + match end { + Ok((i, end)) if *start <= end => { + let range = Some(Ok((i, *start..end))); + *start = end; + range + } + Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!( "Offset invariant failure: non-monotonic offset at slot {}: {} > {}", - i, start, end)) - ) - } else { - Ok((i, start..end)) + i-1, start, end)) + )), + Err(err) => Some(Err(err)), } }) + .skip(1)// the first element is meaningless .try_for_each(|res: Result<(usize, Range)>| { let (item_index, range) = res?; - validate(item_index, range) + validate(item_index-1, range) }) } From 9e1602fb6cebef73e3bbee7673300670d61c03fc Mon Sep 17 00:00:00 2001 From: Remzi Yang <59198230+HaoYang670@users.noreply.github.com> Date: Wed, 11 May 2022 15:19:51 +0800 Subject: [PATCH 8/9] Update arrow/src/array/data.rs Co-authored-by: Liang-Chi Hsieh --- arrow/src/array/data.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index 2954c0c5aea5..a4905ef7ce36 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -1089,7 +1089,7 @@ impl ArrayData { Err(err) => Some(Err(err)), } }) - .skip(1)// the first element is meaningless + .skip(1) // the first element is meaningless .try_for_each(|res: Result<(usize, Range)>| { let (item_index, range) = res?; validate(item_index-1, range) From d86902b8839c304376310068f5fcfa02994f2d32 Mon Sep 17 00:00:00 2001 From: Remzi Yang <59198230+HaoYang670@users.noreply.github.com> Date: Wed, 11 May 2022 15:20:37 +0800 Subject: [PATCH 9/9] Update arrow/src/array/data.rs Co-authored-by: Liang-Chi Hsieh --- arrow/src/array/data.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/array/data.rs b/arrow/src/array/data.rs index a4905ef7ce36..f7eacb0a409c 100644 --- a/arrow/src/array/data.rs +++ b/arrow/src/array/data.rs @@ -1084,7 +1084,7 @@ impl ArrayData { } Ok((i, end)) => Some(Err(ArrowError::InvalidArgumentError(format!( "Offset invariant failure: non-monotonic offset at slot {}: {} > {}", - i-1, start, end)) + i - 1, start, end)) )), Err(err) => Some(Err(err)), }