From 5cd4cd878667e44d4b153f7816e58bfa1e9b6c29 Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Wed, 17 Apr 2024 16:03:38 +1200 Subject: [PATCH] GH-41164: [C#] Fix concatenation of sliced arrays (#41245) ### Rationale for this change Makes array concatenation work correctly when the input arrays have been sliced. ### What changes are included in this PR? * Updates the array concatenation tests so that the `TestDataGenerator` can generate test cases with sliced input arrays. To avoid too much duplicated logic, I've added a new `GenerateTestData` method that works with builders that are not `IArrowArrayBuilder`, and simplified a lot of the data generation by using this new method. Only struct and union array test data generation still needs to duplicate the logic in `GenerateTestData`. * Fixes `ArrayDataConcatenator` logic to handle sliced input arrays ### Are these changes tested? Yes, I've added a new test for this. ### Are there any user-facing changes? Yes, this is a user-facing bug fix. * GitHub Issue: #41164 Authored-by: Adam Reeve Signed-off-by: Curt Hagenlocher --- .../Arrays/ArrayDataConcatenator.cs | 197 +++++- .../Arrays/ArrowArrayBuilderFactory.cs | 13 +- .../ArrowArrayBuilderFactoryReflector.cs | 32 - .../ArrowArrayConcatenatorTests.cs | 653 +++++++----------- 4 files changed, 430 insertions(+), 465 deletions(-) delete mode 100644 csharp/test/Apache.Arrow.Tests/ArrowArrayBuilderFactoryReflector.cs diff --git a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs index 347d0d76bac64..fe2543b70a108 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs @@ -107,38 +107,76 @@ public void Visit(ListViewType type) { CheckData(type, 3); ArrowBuffer validityBuffer = ConcatenateValidityBuffer(); + ArrowBuffer sizesBuffer = ConcatenateFixedWidthTypeValueBuffer(2, Int32Type.Default); + var children = new List(_arrayDataList.Count); var offsetsBuilder = new ArrowBuffer.Builder(_totalLength); int baseOffset = 0; foreach (ArrayData arrayData in _arrayDataList) { - if (arrayData.Length > 0) + if (arrayData.Length == 0) { - ReadOnlySpan span = arrayData.Buffers[1].Span.CastTo().Slice(0, arrayData.Length); - foreach (int offset in span) - { - offsetsBuilder.Append(baseOffset + offset); - } + continue; + } + + var child = arrayData.Children[0]; + ReadOnlySpan offsets = arrayData.Buffers[1].Span.CastTo().Slice(arrayData.Offset, arrayData.Length); + ReadOnlySpan sizes = arrayData.Buffers[2].Span.CastTo().Slice(arrayData.Offset, arrayData.Length); + var minOffset = offsets[0]; + var maxEnd = 0; + + for (int i = 0; i < arrayData.Length; ++i) + { + minOffset = Math.Min(minOffset, offsets[i]); + maxEnd = Math.Max(maxEnd, offsets[i] + sizes[i]); } - baseOffset += arrayData.Children[0].Length; + foreach (int offset in offsets) + { + offsetsBuilder.Append(baseOffset + offset - minOffset); + } + + var childLength = maxEnd - minOffset; + if (minOffset != 0 || childLength != child.Length) + { + child = child.Slice(minOffset, childLength); + } + + baseOffset += childLength; + children.Add(child); } ArrowBuffer offsetBuffer = offsetsBuilder.Build(_allocator); - ArrowBuffer sizesBuffer = ConcatenateFixedWidthTypeValueBuffer(2, Int32Type.Default); - ArrayData child = Concatenate(SelectChildren(0), _allocator); + ArrayData combinedChild = Concatenate(children, _allocator); - Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer, sizesBuffer }, new[] { child }); + Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer, sizesBuffer }, new[] { combinedChild }); } public void Visit(FixedSizeListType type) { CheckData(type, 1); + var listSize = type.ListSize; ArrowBuffer validityBuffer = ConcatenateValidityBuffer(); - ArrayData child = Concatenate(SelectChildren(0), _allocator); - Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer }, new[] { child }); + var children = new List(_arrayDataList.Count); + + foreach (ArrayData arrayData in _arrayDataList) + { + var offset = arrayData.Offset; + var length = arrayData.Length; + var child = arrayData.Children[0]; + if (offset != 0 || child.Length != length * listSize) + { + child = child.Slice(offset * listSize, length * listSize); + } + + children.Add(child); + } + + ArrayData combinedChild = Concatenate(children, _allocator); + + Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer }, new[] { combinedChild }); } public void Visit(StructType type) @@ -149,7 +187,7 @@ public void Visit(StructType type) for (int i = 0; i < type.Fields.Count; i++) { - children.Add(Concatenate(SelectChildren(i), _allocator)); + children.Add(Concatenate(SelectSlicedChildren(i), _allocator)); } Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer }, children); @@ -169,7 +207,11 @@ public void Visit(UnionType type) for (int i = 0; i < type.Fields.Count; i++) { - children.Add(Concatenate(SelectChildren(i), _allocator)); + // For dense mode, the offsets aren't adjusted so are into the non-sliced child arrays + var fieldChildren = type.Mode == UnionMode.Sparse + ? SelectSlicedChildren(i) + : SelectChildren(i); + children.Add(Concatenate(fieldChildren, _allocator)); } ArrowBuffer[] buffers = new ArrowBuffer[bufferCount]; @@ -242,9 +284,30 @@ private void ConcatenateLists(NestedType type) CheckData(type, 2); ArrowBuffer validityBuffer = ConcatenateValidityBuffer(); ArrowBuffer offsetBuffer = ConcatenateOffsetBuffer(); - ArrayData child = Concatenate(SelectChildren(0), _allocator); - Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer }, new[] { child }); + var children = new List(_arrayDataList.Count); + foreach (ArrayData arrayData in _arrayDataList) + { + if (arrayData.Length == 0) + { + continue; + } + + var child = arrayData.Children[0]; + ReadOnlySpan offsets = arrayData.Buffers[1].Span.CastTo().Slice(arrayData.Offset, arrayData.Length + 1); + var firstOffset = offsets[0]; + var lastOffset = offsets[arrayData.Length]; + if (firstOffset != 0 || lastOffset != child.Length) + { + child = child.Slice(firstOffset, lastOffset - firstOffset); + } + + children.Add(child); + } + + ArrayData combinedChild = Concatenate(children, _allocator); + + Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer }, new[] { combinedChild }); } private ArrowBuffer ConcatenateValidityBuffer() @@ -254,7 +317,43 @@ private ArrowBuffer ConcatenateValidityBuffer() return ArrowBuffer.Empty; } - return ConcatenateBitmapBuffer(0); + var builder = new ArrowBuffer.BitmapBuilder(_totalLength); + + foreach (ArrayData arrayData in _arrayDataList) + { + int length = arrayData.Length; + int offset = arrayData.Offset; + ReadOnlySpan span = arrayData.Buffers[0].Span; + + if (length > 0 && span.Length == 0) + { + if (arrayData.NullCount == 0) + { + builder.AppendRange(true , length); + } + else if (arrayData.NullCount == length) + { + builder.AppendRange(false , length); + } + else + { + throw new Exception("Array has no validity buffer and null count != 0 or length"); + } + } + else if (offset == 0) + { + builder.Append(span, length); + } + else + { + for (int i = 0; i < length; ++i) + { + builder.Append(BitUtility.GetBit(span, offset + i)); + } + } + } + + return builder.Build(_allocator); } private ArrowBuffer ConcatenateBitmapBuffer(int bufferIndex) @@ -264,9 +363,20 @@ private ArrowBuffer ConcatenateBitmapBuffer(int bufferIndex) foreach (ArrayData arrayData in _arrayDataList) { int length = arrayData.Length; + int offset = arrayData.Offset; ReadOnlySpan span = arrayData.Buffers[bufferIndex].Span; - builder.Append(span, length); + if (offset == 0) + { + builder.Append(span, length); + } + else + { + for (int i = 0; i < length; ++i) + { + builder.Append(BitUtility.GetBit(span, offset + i)); + } + } } return builder.Build(_allocator); @@ -279,10 +389,10 @@ private ArrowBuffer ConcatenateFixedWidthTypeValueBuffer(int bufferIndex, FixedW foreach (ArrayData arrayData in _arrayDataList) { - int length = arrayData.Length; - int byteLength = length * typeByteWidth; + int byteLength = arrayData.Length * typeByteWidth; + int byteOffset = arrayData.Offset * typeByteWidth; - builder.Append(arrayData.Buffers[bufferIndex].Span.Slice(0, byteLength)); + builder.Append(arrayData.Buffers[bufferIndex].Span.Slice(byteOffset, byteLength)); } return builder.Build(_allocator); @@ -294,8 +404,10 @@ private ArrowBuffer ConcatenateVariableBinaryValueBuffer() foreach (ArrayData arrayData in _arrayDataList) { - int lastOffset = arrayData.Buffers[1].Span.CastTo()[arrayData.Length]; - builder.Append(arrayData.Buffers[2].Span.Slice(0, lastOffset)); + var offsets = arrayData.Buffers[1].Span.CastTo().Slice(arrayData.Offset, arrayData.Length + 1); + var firstOffset = offsets[0]; + var lastOffset = offsets[arrayData.Length]; + builder.Append(arrayData.Buffers[2].Span.Slice(firstOffset, lastOffset - firstOffset)); } return builder.Build(_allocator); @@ -306,8 +418,6 @@ private ArrowBuffer ConcatenateOffsetBuffer() var builder = new ArrowBuffer.Builder(_totalLength + 1); int baseOffset = 0; - builder.Append(0); - foreach (ArrayData arrayData in _arrayDataList) { if (arrayData.Length == 0) @@ -315,19 +425,20 @@ private ArrowBuffer ConcatenateOffsetBuffer() continue; } - // The first offset is always 0. - // It should be skipped because it duplicate to the last offset of builder. - ReadOnlySpan span = arrayData.Buffers[1].Span.CastTo().Slice(1, arrayData.Length); + ReadOnlySpan span = arrayData.Buffers[1].Span.CastTo().Slice(arrayData.Offset, arrayData.Length + 1); + // First offset may be non-zero for sliced arrays + var firstOffset = span[0]; - foreach (int offset in span) + foreach (int offset in span.Slice(0, arrayData.Length)) { - builder.Append(baseOffset + offset); + builder.Append(baseOffset + offset - firstOffset); } - // The next offset must start from the current last offset. - baseOffset += span[arrayData.Length - 1]; + baseOffset += span[arrayData.Length] - firstOffset; } + builder.Append(baseOffset); + return builder.Build(_allocator); } @@ -342,7 +453,7 @@ private ArrowBuffer ConcatenateViewBuffer(out int variadicBufferCount) continue; } - ReadOnlySpan span = arrayData.Buffers[1].Span.CastTo().Slice(0, arrayData.Length); + ReadOnlySpan span = arrayData.Buffers[1].Span.CastTo().Slice(arrayData.Offset, arrayData.Length); foreach (BinaryView view in span) { if (view.Length > BinaryView.MaxInlineLength) @@ -412,6 +523,26 @@ private List SelectChildren(int index) return children; } + + private List SelectSlicedChildren(int index) + { + var children = new List(_arrayDataList.Count); + + foreach (ArrayData arrayData in _arrayDataList) + { + var offset = arrayData.Offset; + var length = arrayData.Length; + var child = arrayData.Children[index]; + if (offset != 0 || child.Length != length) + { + child = child.Slice(offset, length); + } + + children.Add(child); + } + + return children; + } } } } diff --git a/csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs b/csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs index f8367102082f5..cb7164aa146c6 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs @@ -82,12 +82,21 @@ internal static IArrowArrayBuilder> return new Decimal128Array.Builder(dataType as Decimal128Type); case ArrowTypeId.Decimal256: return new Decimal256Array.Builder(dataType as Decimal256Type); + case ArrowTypeId.Interval: + var intervalType = (IntervalType)dataType; + return intervalType.Unit switch + { + IntervalUnit.YearMonth => new YearMonthIntervalArray.Builder(), + IntervalUnit.DayTime => new DayTimeIntervalArray.Builder(), + IntervalUnit.MonthDayNanosecond => new MonthDayNanosecondIntervalArray.Builder(), + _ => throw new ArgumentOutOfRangeException($"unsupported interval unit <{intervalType.Unit}>") + }; + case ArrowTypeId.Map: + return new MapArray.Builder(dataType as MapType); case ArrowTypeId.Struct: case ArrowTypeId.Union: case ArrowTypeId.Dictionary: case ArrowTypeId.FixedSizedBinary: - case ArrowTypeId.Interval: - case ArrowTypeId.Map: default: throw new NotSupportedException($"An ArrowArrayBuilder cannot be built for type {dataType.TypeId}."); } diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayBuilderFactoryReflector.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayBuilderFactoryReflector.cs deleted file mode 100644 index 69894ab3cd325..0000000000000 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayBuilderFactoryReflector.cs +++ /dev/null @@ -1,32 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one or more -// contributor license agreements. See the NOTICE file distributed with -// this work for additional information regarding copyright ownership. -// The ASF licenses this file to You under the Apache License, Version 2.0 -// (the "License"); you may not use this file except in compliance with -// the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -using System; -using System.Reflection; -using Apache.Arrow.Types; - -namespace Apache.Arrow.Tests -{ - static class ArrayArrayBuilderFactoryReflector - { - private static readonly MethodInfo s_buildInfo = typeof(ArrayData).Assembly.GetType("Apache.Arrow.ArrowArrayBuilderFactory") - .GetMethod("Build", BindingFlags.Static | BindingFlags.NonPublic); - - internal static IArrowArrayBuilder> InvokeBuild(IArrowType dataType) - { - return s_buildInfo.Invoke(null, new object[] { dataType }) as IArrowArrayBuilder>; - } - } -} diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs index a1f6b1b8d80a0..2437d3d94c446 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayConcatenatorTests.cs @@ -34,6 +34,16 @@ public void TestStandardCases() } } + [Fact] + public void TestConcatenateSlices() + { + foreach ((List testTargetArrayList, IArrowArray expectedArray) in GenerateTestData(slicedArrays: true)) + { + IArrowArray actualArray = ArrowArrayConcatenator.Concatenate(testTargetArrayList); + ArrowReaderVerifier.CompareArrays(expectedArray, actualArray, strictCompare: false); + } + } + [Fact] public void TestNullOrEmpty() { @@ -49,7 +59,7 @@ public void TestSingleElement() ArrowReaderVerifier.CompareArrays(array, actualArray); } - private static IEnumerable, IArrowArray>> GenerateTestData() + private static IEnumerable, IArrowArray>> GenerateTestData(bool slicedArrays = false) { var targetTypes = new List() { BooleanType.Default, @@ -78,7 +88,7 @@ private static IEnumerable, IArrowArray>> GenerateTestDa new Field.Builder().Name("Strings").DataType(StringType.Default).Nullable(true).Build(), new Field.Builder().Name("Ints").DataType(Int32Type.Default).Nullable(true).Build() }), - new FixedSizeListType(Int32Type.Default, 1), + new FixedSizeListType(Int32Type.Default, 2), new UnionType( new List{ new Field.Builder().Name("Strings").DataType(StringType.Default).Nullable(true).Build(), @@ -106,7 +116,7 @@ private static IEnumerable, IArrowArray>> GenerateTestDa foreach (IArrowType type in targetTypes) { - var creator = new TestDataGenerator(); + var creator = new TestDataGenerator(slicedArrays); type.Accept(creator); yield return Tuple.Create(creator.TestTargetArrayList, creator.ExpectedArray); } @@ -142,26 +152,44 @@ private class TestDataGenerator : IArrowTypeVisitor, IArrowTypeVisitor { + private readonly List> _baseData; - private List> _baseData; + private readonly int _baseDataListCount; - private int _baseDataListCount; + private readonly int _resultTotalElementCount; - private int _baseDataTotalElementCount; + private readonly List<(int Offset, int Length)> _sliceParameters = null; public List TestTargetArrayList { get; } public IArrowArray ExpectedArray { get; private set; } - public TestDataGenerator() + public TestDataGenerator(bool slicedArrays) { _baseData = new List> { - new List { 1, 2, 3 }, - new List { 100, 101, null }, - new List { 11, null, 12 }, + new List { 1, 2, 3, 4, 5, 6 }, + new List { 100, 101, null, 102, null, 103 }, + new List { null, null }, + new List { }, + new List { 11, null, 12, 13, 14 }, }; + if (slicedArrays) + { + _sliceParameters = new List<(int, int)> + { + (2, 3), + (0, 5), + (0, 2), + (0, 0), + (1, 4), + }; + } + _baseDataListCount = _baseData.Count; - _baseDataTotalElementCount = _baseData.Sum(_ => _.Count); + _resultTotalElementCount = slicedArrays + ? _sliceParameters.Sum(p => p.Length) + : _baseData.Sum(baseList => baseList.Count); + TestTargetArrayList = new List(_baseDataListCount); } @@ -179,166 +207,35 @@ public TestDataGenerator() public void Visit(Date32Type type) => GenerateTestData(type, x => DateTime.MinValue.AddDays(x)); public void Visit(Date64Type type) => GenerateTestData(type, x => DateTime.MinValue.AddDays(x)); - public void Visit(Decimal128Type type) - { - Decimal128Array.Builder resultBuilder = new Decimal128Array.Builder(type).Reserve(_baseDataTotalElementCount); + public void Visit(Decimal128Type type) => GenerateTestData(type, (builder, x) => builder.Append(x)); - for (int i = 0; i < _baseDataListCount; i++) - { - List dataList = _baseData[i]; - Decimal128Array.Builder builder = new Decimal128Array.Builder(type).Reserve(dataList.Count); - foreach (decimal? value in dataList) - { - if (value.HasValue) - { - builder.Append(value.Value); - resultBuilder.Append(value.Value); - } - else - { - builder.AppendNull(); - resultBuilder.AppendNull(); - } - } - TestTargetArrayList.Add(builder.Build()); - } - - ExpectedArray = resultBuilder.Build(); - } - - public void Visit(Decimal256Type type) - { - Decimal256Array.Builder resultBuilder = new Decimal256Array.Builder(type).Reserve(_baseDataTotalElementCount); - - for (int i = 0; i < _baseDataListCount; i++) - { - List dataList = _baseData[i]; - Decimal256Array.Builder builder = new Decimal256Array.Builder(type).Reserve(dataList.Count); - foreach (decimal? value in dataList) - { - if (value.HasValue) - { - builder.Append(value.Value); - resultBuilder.Append(value.Value); - } - else - { - builder.AppendNull(); - resultBuilder.AppendNull(); - } - } - TestTargetArrayList.Add(builder.Build()); - } - - ExpectedArray = resultBuilder.Build(); - } + public void Visit(Decimal256Type type) => GenerateTestData(type, (builder, x) => builder.Append(x)); public void Visit(TimestampType type) { - TimestampArray.Builder resultBuilder = new TimestampArray.Builder().Reserve(_baseDataTotalElementCount); DateTimeOffset basis = DateTimeOffset.UtcNow; - - for (int i = 0; i < _baseDataListCount; i++) - { - List dataList = _baseData[i]; - TimestampArray.Builder builder = new TimestampArray.Builder().Reserve(dataList.Count); - foreach (int? value in dataList) - { - if (value.HasValue) - { - DateTimeOffset dateValue = basis.AddMilliseconds(value.Value); - builder.Append(dateValue); - resultBuilder.Append(dateValue); - } - else - { - builder.AppendNull(); - resultBuilder.AppendNull(); - } - } - TestTargetArrayList.Add(builder.Build()); - } - - ExpectedArray = resultBuilder.Build(); + GenerateTestData(type, (builder, x) => builder.Append(basis.AddMilliseconds(x))); } - public void Visit(DurationType type) - { - DurationArray.Builder resultBuilder = new DurationArray.Builder(type).Reserve(_baseDataTotalElementCount); - - for (int i = 0; i < _baseDataListCount; i++) - { - List dataList = _baseData[i]; - DurationArray.Builder builder = new DurationArray.Builder(type).Reserve(dataList.Count); - foreach (int? value in dataList) - { - if (value.HasValue) - { - builder.Append(value.Value); - resultBuilder.Append(value.Value); - } - else - { - builder.AppendNull(); - resultBuilder.AppendNull(); - } - } - TestTargetArrayList.Add(builder.Build()); - } - - ExpectedArray = resultBuilder.Build(); - } + public void Visit(DurationType type) => GenerateTestData(type, x => (long)x); public void Visit(IntervalType type) { switch (type.Unit) { case IntervalUnit.YearMonth: - YearMonthIntervalArray.Builder yearMonthBuilder = new YearMonthIntervalArray.Builder().Reserve(_baseDataTotalElementCount); - foreach (List dataList in _baseData) - { - YearMonthIntervalArray.Builder yearMonthBuilder1 = new YearMonthIntervalArray.Builder().Reserve(dataList.Count); - foreach (int? value in dataList) - { - YearMonthInterval? ymi = value != null ? new YearMonthInterval(value.Value) : null; - yearMonthBuilder.Append(ymi); - yearMonthBuilder1.Append(ymi); - } - TestTargetArrayList.Add(yearMonthBuilder1.Build()); - } - ExpectedArray = yearMonthBuilder.Build(); + GenerateTestData( + type, x => new YearMonthInterval(x)); break; case IntervalUnit.DayTime: - DayTimeIntervalArray.Builder dayTimeBuilder = new DayTimeIntervalArray.Builder().Reserve(_baseDataTotalElementCount); - foreach (List dataList in _baseData) - { - DayTimeIntervalArray.Builder dayTimeBuilder1 = new DayTimeIntervalArray.Builder().Reserve(dataList.Count); - foreach (int? value in dataList) - { - DayTimeInterval? dti = value != null ? new DayTimeInterval(100 - 50 * value.Value, 100 * value.Value) : null; - dayTimeBuilder.Append(dti); - dayTimeBuilder1.Append(dti); - } - TestTargetArrayList.Add(dayTimeBuilder1.Build()); - } - ExpectedArray = dayTimeBuilder.Build(); + GenerateTestData( + type, x => new DayTimeInterval(100 - 50 * x, 100 * x)); break; case IntervalUnit.MonthDayNanosecond: - MonthDayNanosecondIntervalArray.Builder monthDayNanoBuilder = new MonthDayNanosecondIntervalArray.Builder().Reserve(_baseDataTotalElementCount); - foreach (List dataList in _baseData) - { - MonthDayNanosecondIntervalArray.Builder monthDayNanoBuilder1 = new MonthDayNanosecondIntervalArray.Builder().Reserve(dataList.Count); - foreach (int? value in dataList) - { - MonthDayNanosecondInterval? mdni = value != null ? new MonthDayNanosecondInterval(value.Value, 5 - value.Value, 100 * value.Value) : null; - monthDayNanoBuilder.Append(mdni); - monthDayNanoBuilder1.Append(mdni); - } - TestTargetArrayList.Add(monthDayNanoBuilder1.Build()); - } - ExpectedArray = monthDayNanoBuilder.Build(); + GenerateTestData( + type, x => new MonthDayNanosecondInterval(x, 5 - x, 100 * x)); break; default: @@ -346,245 +243,154 @@ public void Visit(IntervalType type) } } - public void Visit(BinaryType type) - { - BinaryArray.Builder resultBuilder = new BinaryArray.Builder().Reserve(_baseDataTotalElementCount); - - for (int i = 0; i < _baseDataListCount; i++) + public void Visit(BinaryType type) => + GenerateTestData(type, (builder, x) => { - List dataList = _baseData[i]; - BinaryArray.Builder builder = new BinaryArray.Builder().Reserve(dataList.Count); - - foreach (byte? value in dataList) + if (x % 2 == 0) { - if (value.HasValue) - { - builder.Append(value.Value); - resultBuilder.Append(value.Value); - } - else - { - builder.AppendNull(); - resultBuilder.AppendNull(); - } + builder.Append((byte)x); } - TestTargetArrayList.Add(builder.Build()); - } - - ExpectedArray = resultBuilder.Build(); - } - - public void Visit(BinaryViewType type) - { - BinaryViewArray.Builder resultBuilder = new BinaryViewArray.Builder().Reserve(_baseDataTotalElementCount); - - for (int i = 0; i < _baseDataListCount; i++) - { - List dataList = _baseData[i]; - BinaryViewArray.Builder builder = new BinaryViewArray.Builder().Reserve(dataList.Count); - - foreach (byte? value in dataList) + else { - if (value.HasValue) - { - builder.Append(value.Value); - resultBuilder.Append(value.Value); - } - else - { - builder.AppendNull(); - resultBuilder.AppendNull(); - } + builder.Append(new byte[] {(byte)x, (byte)(x + 1)}.AsSpan()); } - TestTargetArrayList.Add(builder.Build()); - } + }); - ExpectedArray = resultBuilder.Build(); - } - - public void Visit(StringType type) - { - StringArray.Builder resultBuilder = new StringArray.Builder().Reserve(_baseDataTotalElementCount); - - for (int i = 0; i < _baseDataListCount; i++) + public void Visit(BinaryViewType type) => + GenerateTestData(type, (builder, x) => { - List dataList = _baseData[i]; - StringArray.Builder builder = new StringArray.Builder().Reserve(dataList.Count); - - foreach (string value in dataList.Select(_ => _.ToString() ?? null)) + if (x % 2 == 0) { - builder.Append(value); - resultBuilder.Append(value); + builder.Append((byte)x); } - TestTargetArrayList.Add(builder.Build()); - } - - ExpectedArray = resultBuilder.Build(); - } - - public void Visit(StringViewType type) - { - StringViewArray.Builder resultBuilder = new StringViewArray.Builder().Reserve(_baseDataTotalElementCount); - - for (int i = 0; i < _baseDataListCount; i++) - { - List dataList = _baseData[i]; - StringViewArray.Builder builder = new StringViewArray.Builder().Reserve(dataList.Count); - - foreach (string value in dataList.Select(_ => _.ToString() ?? null)) + else { - builder.Append(value); - resultBuilder.Append(value); + builder.Append(new byte[] {(byte)x, (byte)(x + 1)}.AsSpan()); } - TestTargetArrayList.Add(builder.Build()); - } + }); - ExpectedArray = resultBuilder.Build(); - } + public void Visit(StringType type) => + GenerateTestData(type, (builder, x) => builder.Append(x.ToString())); - public void Visit(ListType type) - { - ListArray.Builder resultBuilder = new ListArray.Builder(type.ValueDataType).Reserve(_baseDataTotalElementCount); - Int64Array.Builder resultValueBuilder = (Int64Array.Builder)resultBuilder.ValueBuilder.Reserve(_baseDataTotalElementCount); + public void Visit(StringViewType type) => + GenerateTestData(type, (builder, x) => builder.Append(x.ToString())); - for (int i = 0; i < _baseDataListCount; i++) + public void Visit(ListType type) => + GenerateTestData(type, (builder, x) => { - List dataList = _baseData[i]; - - ListArray.Builder builder = new ListArray.Builder(type.ValueField).Reserve(dataList.Count); - Int64Array.Builder valueBuilder = (Int64Array.Builder)builder.ValueBuilder.Reserve(dataList.Count); - - foreach (long? value in dataList) - { - if (value.HasValue) - { - builder.Append(); - resultBuilder.Append(); - - valueBuilder.Append(value.Value); - resultValueBuilder.Append(value.Value); - } - else - { - builder.AppendNull(); - resultBuilder.AppendNull(); - } - } - - TestTargetArrayList.Add(builder.Build()); - } - - ExpectedArray = resultBuilder.Build(); - } - - public void Visit(ListViewType type) - { - ListViewArray.Builder resultBuilder = new ListViewArray.Builder(type.ValueDataType).Reserve(_baseDataTotalElementCount); - Int64Array.Builder resultValueBuilder = (Int64Array.Builder)resultBuilder.ValueBuilder.Reserve(_baseDataTotalElementCount); - - for (int i = 0; i < _baseDataListCount; i++) + builder.Append(); + ((Int64Array.Builder)builder.ValueBuilder).Append(x); + }, initAction: (builder, length) => { - List dataList = _baseData[i]; + builder.Reserve(length); + builder.ValueBuilder.Reserve(length); + }); - ListViewArray.Builder builder = new ListViewArray.Builder(type.ValueField).Reserve(dataList.Count); - Int64Array.Builder valueBuilder = (Int64Array.Builder)builder.ValueBuilder.Reserve(dataList.Count); + public void Visit(ListViewType type) => + GenerateTestData(type, (builder, x) => + { + builder.Append(); + ((Int64Array.Builder)builder.ValueBuilder).Append(x); + }, initAction: (builder, length) => + { + builder.Reserve(length); + builder.ValueBuilder.Reserve(length); + }); - foreach (long? value in dataList) + public void Visit(FixedSizeListType type) => + GenerateTestData(type, (builder, x) => + { + builder.Append(); + var valueBuilder = (Int32Array.Builder)builder.ValueBuilder; + for (int i = 0; i < type.ListSize; ++i) { - if (value.HasValue) - { - builder.Append(); - resultBuilder.Append(); - - valueBuilder.Append(value.Value); - resultValueBuilder.Append(value.Value); - } - else - { - builder.AppendNull(); - resultBuilder.AppendNull(); - } + valueBuilder.Append(x); } + }, initAction: (builder, length) => + { + builder.Reserve(length); + builder.ValueBuilder.Reserve(length * type.ListSize); + }); - TestTargetArrayList.Add(builder.Build()); - } - - ExpectedArray = resultBuilder.Build(); - } - - public void Visit(FixedSizeListType type) + public void Visit(StructType type) { - FixedSizeListArray.Builder resultBuilder = new FixedSizeListArray.Builder(type.ValueDataType, type.ListSize).Reserve(_baseDataTotalElementCount); - Int32Array.Builder resultValueBuilder = (Int32Array.Builder)resultBuilder.ValueBuilder.Reserve(_baseDataTotalElementCount); + // TODO: Make data from type fields. - for (int i = 0; i < _baseDataListCount; i++) + // The following can be improved with a Builder class for StructArray. + StringArray.Builder resultStringBuilder = new StringArray.Builder().Reserve(_resultTotalElementCount); + Int32Array.Builder resultInt32Builder = new Int32Array.Builder().Reserve(_resultTotalElementCount); + ArrowBuffer.BitmapBuilder resultNullBitmapBuilder = new ArrowBuffer.BitmapBuilder().Reserve(_resultTotalElementCount); + int resultNullCount = 0; + + for (int i = 0; i < _baseData.Count; i++) { List dataList = _baseData[i]; + StringArray.Builder stringBuilder = new StringArray.Builder().Reserve(dataList.Count); + Int32Array.Builder int32Builder = new Int32Array.Builder().Reserve(dataList.Count); + ArrowBuffer.BitmapBuilder nullBitmapBuilder = new ArrowBuffer.BitmapBuilder().Reserve(dataList.Count); + int nullCount = 0; - FixedSizeListArray.Builder builder = new FixedSizeListArray.Builder(type.ValueField, type.ListSize).Reserve(dataList.Count); - Int32Array.Builder valueBuilder = (Int32Array.Builder)builder.ValueBuilder.Reserve(dataList.Count); - - foreach (int? value in dataList) + for (int j = 0; j < dataList.Count; ++j) { + var value = dataList[j]; if (value.HasValue) { - builder.Append(); - resultBuilder.Append(); + nullBitmapBuilder.Append(true); + stringBuilder.Append(value.Value.ToString()); + int32Builder.Append(value.Value); - valueBuilder.Append(value.Value); - resultValueBuilder.Append(value.Value); + if (IncludeInResult(i, j)) + { + resultNullBitmapBuilder.Append(true); + resultStringBuilder.Append(value.Value.ToString()); + resultInt32Builder.Append(value.Value); + } } else { - builder.AppendNull(); - resultBuilder.AppendNull(); + nullCount++; + nullBitmapBuilder.Append(false); + stringBuilder.Append(""); + int32Builder.Append(0); + + if (IncludeInResult(i, j)) + { + resultNullCount++; + resultNullBitmapBuilder.Append(false); + resultStringBuilder.Append(""); + resultInt32Builder.Append(0); + } } } - TestTargetArrayList.Add(builder.Build()); - } - - ExpectedArray = resultBuilder.Build(); - } - - public void Visit(StructType type) - { - // TODO: Make data from type fields. - - // The following can be improved with a Builder class for StructArray. - StringArray.Builder resultStringBuilder = new StringArray.Builder(); - Int32Array.Builder resultInt32Builder = new Int32Array.Builder(); - ArrowBuffer nullBitmapBuffer = new ArrowBuffer.BitmapBuilder().Append(true).Append(true).Append(false).Build(); - - for (int i = 0; i < 3; i++) - { - resultStringBuilder.Append("joe").AppendNull().AppendNull().Append("mark"); - resultInt32Builder.Append(1).Append(2).AppendNull().Append(4); - StringArray stringArray = new StringArray.Builder().Append("joe").AppendNull().AppendNull().Append("mark").Build(); - Int32Array intArray = new Int32Array.Builder().Append(1).Append(2).AppendNull().Append(4).Build(); - List arrays = new List + var arrays = new List { - stringArray, - intArray + stringBuilder.Build(), + int32Builder.Build(), }; - TestTargetArrayList.Add(new StructArray(type, 3, arrays, nullBitmapBuffer, 1)); + TestTargetArrayList.Add(SliceTargetArray( + new StructArray(type, dataList.Count, arrays, nullBitmapBuilder.Build(), nullCount), i)); } - StringArray resultStringArray = resultStringBuilder.Build(); - Int32Array resultInt32Array = resultInt32Builder.Build(); + var resultArrays = new List + { + resultStringBuilder.Build(), + resultInt32Builder.Build(), + }; - ExpectedArray = new StructArray(type, 9, new List { resultStringArray, resultInt32Array }, nullBitmapBuffer, 3); + ExpectedArray = new StructArray( + type, _resultTotalElementCount, resultArrays, resultNullBitmapBuilder.Build(), resultNullCount); } public void Visit(UnionType type) { bool isDense = type.Mode == UnionMode.Dense; - StringArray.Builder stringResultBuilder = new StringArray.Builder().Reserve(_baseDataTotalElementCount); - Int32Array.Builder intResultBuilder = new Int32Array.Builder().Reserve(_baseDataTotalElementCount); - ArrowBuffer.Builder typeResultBuilder = new ArrowBuffer.Builder().Reserve(_baseDataTotalElementCount); - ArrowBuffer.Builder offsetResultBuilder = new ArrowBuffer.Builder().Reserve(_baseDataTotalElementCount); + StringArray.Builder stringResultBuilder = new StringArray.Builder().Reserve(_resultTotalElementCount); + Int32Array.Builder intResultBuilder = new Int32Array.Builder().Reserve(_resultTotalElementCount); + ArrowBuffer.Builder typeResultBuilder = new ArrowBuffer.Builder().Reserve(_resultTotalElementCount); + ArrowBuffer.Builder offsetResultBuilder = new ArrowBuffer.Builder().Reserve(_resultTotalElementCount); int resultNullCount = 0; for (int i = 0; i < _baseDataListCount; i++) @@ -598,41 +404,59 @@ public void Visit(UnionType type) for (int j = 0; j < dataList.Count; j++) { + bool includeInResult = IncludeInResult(i, j); byte index = (byte)Math.Min(j % 3, 1); int? intValue = (index == 1) ? dataList[j] : null; string stringValue = (index == 1) ? null : dataList[j]?.ToString(); typeBuilder.Append(index); - typeResultBuilder.Append(index); + if (includeInResult) + { + typeResultBuilder.Append(index); + } if (isDense) { if (index == 0) { offsetBuilder.Append(stringBuilder.Length); - offsetResultBuilder.Append(stringResultBuilder.Length); stringBuilder.Append(stringValue); + if (includeInResult) + { + offsetResultBuilder.Append(stringResultBuilder.Length); + } + // For dense mode, concatenation doesn't slice the child arrays, so always + // add the value to the result. stringResultBuilder.Append(stringValue); } else { offsetBuilder.Append(intBuilder.Length); - offsetResultBuilder.Append(intResultBuilder.Length); intBuilder.Append(intValue); + if (includeInResult) + { + offsetResultBuilder.Append(intResultBuilder.Length); + } intResultBuilder.Append(intValue); } } else { stringBuilder.Append(stringValue); - stringResultBuilder.Append(stringValue); intBuilder.Append(intValue); - intResultBuilder.Append(intValue); + if (includeInResult) + { + stringResultBuilder.Append(stringValue); + intResultBuilder.Append(intValue); + } } if (dataList[j] == null) { nullCount++; - resultNullCount++; + if (includeInResult) + { + resultNullCount++; + } } } @@ -645,9 +469,11 @@ public void Visit(UnionType type) { buffers = new[] { typeBuilder.Build() }; } - TestTargetArrayList.Add(UnionArray.Create(new ArrayData( + + var unionArray = UnionArray.Create(new ArrayData( type, dataList.Count, nullCount, 0, buffers, - new[] { stringBuilder.Build().Data, intBuilder.Build().Data }))); + new[] { stringBuilder.Build().Data, intBuilder.Build().Data })); + TestTargetArrayList.Add(SliceTargetArray(unionArray, i)); } ArrowBuffer[] resultBuffers; @@ -659,50 +485,27 @@ public void Visit(UnionType type) { resultBuffers = new[] { typeResultBuilder.Build() }; } + ExpectedArray = UnionArray.Create(new ArrayData( - type, _baseDataTotalElementCount, resultNullCount, 0, resultBuffers, + type, _resultTotalElementCount, resultNullCount, 0, resultBuffers, new[] { stringResultBuilder.Build().Data, intResultBuilder.Build().Data })); } - public void Visit(MapType type) - { - MapArray.Builder resultBuilder = new MapArray.Builder(type).Reserve(_baseDataTotalElementCount); - StringArray.Builder resultKeyBuilder = (StringArray.Builder)resultBuilder.KeyBuilder.Reserve(_baseDataTotalElementCount); - Int32Array.Builder resultValueBuilder = (Int32Array.Builder)resultBuilder.ValueBuilder.Reserve(_baseDataTotalElementCount); - ArrowBuffer nullBitmapBuilder = new ArrowBuffer.BitmapBuilder().Append(true).Append(true).Append(false).Build(); - - for (int i = 0; i < _baseData.Count; i++) + public void Visit(MapType type) => + GenerateTestData(type, (builder, x) => { - List dataList = _baseData[i]; - - MapArray.Builder builder = new MapArray.Builder(type).Reserve(dataList.Count); - StringArray.Builder keyBuilder = (StringArray.Builder)builder.KeyBuilder.Reserve(dataList.Count); - Int32Array.Builder valueBuilder = (Int32Array.Builder)builder.ValueBuilder.Reserve(dataList.Count); - - foreach (int? value in dataList) - { - if (value.HasValue) - { - builder.Append(); - resultBuilder.Append(); - - keyBuilder.Append(value.Value.ToString()); - valueBuilder.Append(value.Value); - resultKeyBuilder.Append(value.Value.ToString()); - resultValueBuilder.Append(value.Value); - } - else - { - builder.AppendNull(); - resultBuilder.AppendNull(); - } - } - - TestTargetArrayList.Add(builder.Build()); - } + var keyBuilder = (StringArray.Builder)builder.KeyBuilder; + var valueBuilder = (Int32Array.Builder)builder.ValueBuilder; - ExpectedArray = resultBuilder.Build(); - } + builder.Append(); + keyBuilder.Append(x.ToString()); + valueBuilder.Append(x); + }, initAction: (builder, length) => + { + builder.Reserve(length); + builder.KeyBuilder.Reserve(length); + builder.ValueBuilder.Reserve(length); + }); public void Visit(IArrowType type) { @@ -713,33 +516,87 @@ private void GenerateTestData(IArrowType type, Func where TArray : IArrowArray { - var resultBuilder = (IArrowArrayBuilder)ArrayArrayBuilderFactoryReflector.InvokeBuild(type); - resultBuilder.Reserve(_baseDataTotalElementCount); + GenerateTestData(type, (builder, x) => builder.Append(generator(x))); + } + + private void GenerateTestData( + IArrowType type, Action buildAction, Action initAction=null) + where TArrayBuilder : IArrowArrayBuilder + where TArray : IArrowArray + { + var resultBuilder = (TArrayBuilder)ArrowArrayBuilderFactory.Build(type); + if (initAction != null) + { + initAction(resultBuilder, _resultTotalElementCount); + } + else + { + resultBuilder.Reserve(_resultTotalElementCount); + } for (int i = 0; i < _baseDataListCount; i++) { List dataList = _baseData[i]; - var builder = (IArrowArrayBuilder)ArrayArrayBuilderFactoryReflector.InvokeBuild(type); - builder.Reserve(dataList.Count); + var builder = (TArrayBuilder)ArrowArrayBuilderFactory.Build(type); + if (initAction != null) + { + initAction(builder, dataList.Count); + } + else + { + builder.Reserve(dataList.Count); + } - foreach (int? value in dataList) + for (int j = 0; j < dataList.Count; ++j) { + var value = dataList[j]; if (value.HasValue) { - builder.Append(generator(value.Value)); - resultBuilder.Append(generator(value.Value)); + buildAction(builder, value.Value); + if (IncludeInResult(i, j)) + { + buildAction(resultBuilder, value.Value); + } } else { builder.AppendNull(); - resultBuilder.AppendNull(); + if (IncludeInResult(i, j)) + { + resultBuilder.AppendNull(); + } } } - TestTargetArrayList.Add(builder.Build(default)); + + TestTargetArrayList.Add(SliceTargetArray(builder.Build(default), i)); } ExpectedArray = resultBuilder.Build(default); } + + private bool IncludeInResult(int listIndex, int itemIndex) + { + if (_sliceParameters == null) + { + // Unsliced arrays, all values are expected in the result + return true; + } + + var sliceParameters = _sliceParameters[listIndex]; + return itemIndex >= sliceParameters.Offset && + itemIndex < (sliceParameters.Offset + sliceParameters.Length); + } + + private IArrowArray SliceTargetArray(IArrowArray array, int targetIndex) + { + if (_sliceParameters == null) + { + return array; + } + + return ArrowArrayFactory.Slice( + array, _sliceParameters[targetIndex].Offset, _sliceParameters[targetIndex].Length); + } } } }