Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-41164: [C#] Fix concatenation of sliced arrays #41245

Merged
merged 11 commits into from
Apr 17, 2024
197 changes: 164 additions & 33 deletions csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -107,38 +107,76 @@ public void Visit(ListViewType type)
{
CheckData(type, 3);
ArrowBuffer validityBuffer = ConcatenateValidityBuffer();
ArrowBuffer sizesBuffer = ConcatenateFixedWidthTypeValueBuffer(2, Int32Type.Default);

var children = new List<ArrayData>(_arrayDataList.Count);
var offsetsBuilder = new ArrowBuffer.Builder<int>(_totalLength);
int baseOffset = 0;

foreach (ArrayData arrayData in _arrayDataList)
{
if (arrayData.Length > 0)
if (arrayData.Length == 0)
{
ReadOnlySpan<int> span = arrayData.Buffers[1].Span.CastTo<int>().Slice(0, arrayData.Length);
foreach (int offset in span)
{
offsetsBuilder.Append(baseOffset + offset);
}
continue;
}

var child = arrayData.Children[0];
ReadOnlySpan<int> offsets = arrayData.Buffers[1].Span.CastTo<int>().Slice(arrayData.Offset, arrayData.Length);
ReadOnlySpan<int> sizes = arrayData.Buffers[2].Span.CastTo<int>().Slice(arrayData.Offset, arrayData.Length);
var minOffset = offsets[0];
var maxEnd = 0;

for (int i = 0; i < arrayData.Length; ++i)
{
minOffset = Math.Min(minOffset, offsets[i]);
maxEnd = Math.Max(maxEnd, offsets[i] + sizes[i]);
}

baseOffset += arrayData.Children[0].Length;
foreach (int offset in offsets)
{
offsetsBuilder.Append(baseOffset + offset - minOffset);
}

var childLength = maxEnd - minOffset;
if (minOffset != 0 || childLength != child.Length)
{
child = child.Slice(minOffset, childLength);
}

baseOffset += childLength;
children.Add(child);
}

ArrowBuffer offsetBuffer = offsetsBuilder.Build(_allocator);
ArrowBuffer sizesBuffer = ConcatenateFixedWidthTypeValueBuffer(2, Int32Type.Default);
ArrayData child = Concatenate(SelectChildren(0), _allocator);
ArrayData combinedChild = Concatenate(children, _allocator);

Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer, sizesBuffer }, new[] { child });
Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer, sizesBuffer }, new[] { combinedChild });
}

public void Visit(FixedSizeListType type)
{
CheckData(type, 1);
var listSize = type.ListSize;
ArrowBuffer validityBuffer = ConcatenateValidityBuffer();
ArrayData child = Concatenate(SelectChildren(0), _allocator);

Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer }, new[] { child });
var children = new List<ArrayData>(_arrayDataList.Count);

foreach (ArrayData arrayData in _arrayDataList)
{
var offset = arrayData.Offset;
var length = arrayData.Length;
var child = arrayData.Children[0];
if (offset != 0 || child.Length != length * listSize)
{
child = child.Slice(offset * listSize, length * listSize);
}

children.Add(child);
}

ArrayData combinedChild = Concatenate(children, _allocator);

Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer }, new[] { combinedChild });
}

public void Visit(StructType type)
Expand All @@ -149,7 +187,7 @@ public void Visit(StructType type)

for (int i = 0; i < type.Fields.Count; i++)
{
children.Add(Concatenate(SelectChildren(i), _allocator));
children.Add(Concatenate(SelectSlicedChildren(i), _allocator));
}

Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer }, children);
Expand All @@ -169,7 +207,11 @@ public void Visit(UnionType type)

for (int i = 0; i < type.Fields.Count; i++)
{
children.Add(Concatenate(SelectChildren(i), _allocator));
// For dense mode, the offsets aren't adjusted so are into the non-sliced child arrays
var fieldChildren = type.Mode == UnionMode.Sparse
? SelectSlicedChildren(i)
: SelectChildren(i);
children.Add(Concatenate(fieldChildren, _allocator));
}

ArrowBuffer[] buffers = new ArrowBuffer[bufferCount];
Expand Down Expand Up @@ -242,9 +284,30 @@ private void ConcatenateLists(NestedType type)
CheckData(type, 2);
ArrowBuffer validityBuffer = ConcatenateValidityBuffer();
ArrowBuffer offsetBuffer = ConcatenateOffsetBuffer();
ArrayData child = Concatenate(SelectChildren(0), _allocator);

Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer }, new[] { child });
var children = new List<ArrayData>(_arrayDataList.Count);
foreach (ArrayData arrayData in _arrayDataList)
{
if (arrayData.Length == 0)
{
continue;
}

var child = arrayData.Children[0];
ReadOnlySpan<int> offsets = arrayData.Buffers[1].Span.CastTo<int>().Slice(arrayData.Offset, arrayData.Length + 1);
var firstOffset = offsets[0];
var lastOffset = offsets[arrayData.Length];
if (firstOffset != 0 || lastOffset != child.Length)
{
child = child.Slice(firstOffset, lastOffset - firstOffset);
}

children.Add(child);
}

ArrayData combinedChild = Concatenate(children, _allocator);

Result = new ArrayData(type, _totalLength, _totalNullCount, 0, new ArrowBuffer[] { validityBuffer, offsetBuffer }, new[] { combinedChild });
}

private ArrowBuffer ConcatenateValidityBuffer()
Expand All @@ -254,7 +317,43 @@ private ArrowBuffer ConcatenateValidityBuffer()
return ArrowBuffer.Empty;
}

return ConcatenateBitmapBuffer(0);
var builder = new ArrowBuffer.BitmapBuilder(_totalLength);

foreach (ArrayData arrayData in _arrayDataList)
{
int length = arrayData.Length;
int offset = arrayData.Offset;
ReadOnlySpan<byte> span = arrayData.Buffers[0].Span;

if (length > 0 && span.Length == 0)
{
if (arrayData.NullCount == 0)
{
builder.AppendRange(true , length);
}
else if (arrayData.NullCount == length)
{
builder.AppendRange(false , length);
}
else
{
throw new Exception("Array has no validity buffer and null count != 0 or length");
}
}
else if (offset == 0)
{
builder.Append(span, length);
}
else
{
for (int i = 0; i < length; ++i)
{
builder.Append(BitUtility.GetBit(span, offset + i));
}
}
}

return builder.Build(_allocator);
}

private ArrowBuffer ConcatenateBitmapBuffer(int bufferIndex)
Expand All @@ -264,9 +363,20 @@ private ArrowBuffer ConcatenateBitmapBuffer(int bufferIndex)
foreach (ArrayData arrayData in _arrayDataList)
{
int length = arrayData.Length;
int offset = arrayData.Offset;
ReadOnlySpan<byte> span = arrayData.Buffers[bufferIndex].Span;

builder.Append(span, length);
if (offset == 0)
{
builder.Append(span, length);
}
else
{
for (int i = 0; i < length; ++i)
{
builder.Append(BitUtility.GetBit(span, offset + i));
}
}
}

return builder.Build(_allocator);
Expand All @@ -279,10 +389,10 @@ private ArrowBuffer ConcatenateFixedWidthTypeValueBuffer(int bufferIndex, FixedW

foreach (ArrayData arrayData in _arrayDataList)
{
int length = arrayData.Length;
int byteLength = length * typeByteWidth;
int byteLength = arrayData.Length * typeByteWidth;
int byteOffset = arrayData.Offset * typeByteWidth;

builder.Append(arrayData.Buffers[bufferIndex].Span.Slice(0, byteLength));
builder.Append(arrayData.Buffers[bufferIndex].Span.Slice(byteOffset, byteLength));
}

return builder.Build(_allocator);
Expand All @@ -294,8 +404,10 @@ private ArrowBuffer ConcatenateVariableBinaryValueBuffer()

foreach (ArrayData arrayData in _arrayDataList)
{
int lastOffset = arrayData.Buffers[1].Span.CastTo<int>()[arrayData.Length];
builder.Append(arrayData.Buffers[2].Span.Slice(0, lastOffset));
var offsets = arrayData.Buffers[1].Span.CastTo<int>().Slice(arrayData.Offset, arrayData.Length + 1);
var firstOffset = offsets[0];
var lastOffset = offsets[arrayData.Length];
builder.Append(arrayData.Buffers[2].Span.Slice(firstOffset, lastOffset - firstOffset));
}

return builder.Build(_allocator);
Expand All @@ -306,28 +418,27 @@ private ArrowBuffer ConcatenateOffsetBuffer()
var builder = new ArrowBuffer.Builder<int>(_totalLength + 1);
int baseOffset = 0;

builder.Append(0);

foreach (ArrayData arrayData in _arrayDataList)
{
if (arrayData.Length == 0)
{
continue;
}

// The first offset is always 0.
// It should be skipped because it duplicate to the last offset of builder.
ReadOnlySpan<int> span = arrayData.Buffers[1].Span.CastTo<int>().Slice(1, arrayData.Length);
ReadOnlySpan<int> span = arrayData.Buffers[1].Span.CastTo<int>().Slice(arrayData.Offset, arrayData.Length + 1);
// First offset may be non-zero for sliced arrays
var firstOffset = span[0];

foreach (int offset in span)
foreach (int offset in span.Slice(0, arrayData.Length))
{
builder.Append(baseOffset + offset);
builder.Append(baseOffset + offset - firstOffset);
}

// The next offset must start from the current last offset.
baseOffset += span[arrayData.Length - 1];
baseOffset += span[arrayData.Length] - firstOffset;
}

builder.Append(baseOffset);

return builder.Build(_allocator);
}

Expand All @@ -342,7 +453,7 @@ private ArrowBuffer ConcatenateViewBuffer(out int variadicBufferCount)
continue;
}

ReadOnlySpan<BinaryView> span = arrayData.Buffers[1].Span.CastTo<BinaryView>().Slice(0, arrayData.Length);
ReadOnlySpan<BinaryView> span = arrayData.Buffers[1].Span.CastTo<BinaryView>().Slice(arrayData.Offset, arrayData.Length);
foreach (BinaryView view in span)
{
if (view.Length > BinaryView.MaxInlineLength)
Expand Down Expand Up @@ -412,6 +523,26 @@ private List<ArrayData> SelectChildren(int index)

return children;
}

private List<ArrayData> SelectSlicedChildren(int index)
{
var children = new List<ArrayData>(_arrayDataList.Count);

foreach (ArrayData arrayData in _arrayDataList)
{
var offset = arrayData.Offset;
var length = arrayData.Length;
var child = arrayData.Children[index];
if (offset != 0 || child.Length != length)
{
child = child.Slice(offset, length);
}

children.Add(child);
}

return children;
}
}
}
}
13 changes: 11 additions & 2 deletions csharp/src/Apache.Arrow/Arrays/ArrowArrayBuilderFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,21 @@ internal static IArrowArrayBuilder<IArrowArray, IArrowArrayBuilder<IArrowArray>>
return new Decimal128Array.Builder(dataType as Decimal128Type);
case ArrowTypeId.Decimal256:
return new Decimal256Array.Builder(dataType as Decimal256Type);
case ArrowTypeId.Interval:
var intervalType = (IntervalType)dataType;
return intervalType.Unit switch
{
IntervalUnit.YearMonth => new YearMonthIntervalArray.Builder(),
IntervalUnit.DayTime => new DayTimeIntervalArray.Builder(),
IntervalUnit.MonthDayNanosecond => new MonthDayNanosecondIntervalArray.Builder(),
_ => throw new ArgumentOutOfRangeException($"unsupported interval unit <{intervalType.Unit}>")
};
case ArrowTypeId.Map:
return new MapArray.Builder(dataType as MapType);
case ArrowTypeId.Struct:
case ArrowTypeId.Union:
case ArrowTypeId.Dictionary:
case ArrowTypeId.FixedSizedBinary:
case ArrowTypeId.Interval:
case ArrowTypeId.Map:
default:
throw new NotSupportedException($"An ArrowArrayBuilder cannot be built for type {dataType.TypeId}.");
}
Expand Down

This file was deleted.

Loading
Loading