Skip to content

Commit

Permalink
apacheGH-41136: [C#] Recompute null count for sliced arrays on demand (
Browse files Browse the repository at this point in the history
…apache#41144)

### Rationale for this change

See apache#41136. This is required for writing sliced arrays to IPC files and streams. It should also be generally useful too by allowing users to be able to rely on valid null count values for sliced arrays.

### What changes are included in this PR?

Adds a new `ArrayData.GetNullCount()` method which computes the actual null count if it is unknown, and changes most uses of `ArrayData.NullCount` to `GetNullCount()`.

### Are these changes tested?

Yes, I've extended the existing slicing tests to also verify the null count, and added new tests specifically for union arrays, which needed special handling.

### Are there any user-facing changes?

Yes, this is a user-facing behaviour change. I don't expect that it should break any existing code as users are unlikely to rely on the NullCount being -1.

* GitHub Issue: apache#41136

Authored-by: Adam Reeve <[email protected]>
Signed-off-by: Curt Hagenlocher <[email protected]>
  • Loading branch information
adamreeve authored and rok committed May 8, 2024
1 parent 3968adc commit db10ac8
Show file tree
Hide file tree
Showing 4 changed files with 109 additions and 7 deletions.
23 changes: 23 additions & 0 deletions csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs
Original file line number Diff line number Diff line change
Expand Up @@ -52,5 +52,28 @@ protected override bool FieldIsValid(IArrowArray fieldArray, int index)
{
return fieldArray.IsValid(ValueOffsets[index]);
}

internal new static int ComputeNullCount(ArrayData data)
{
var offset = data.Offset;
var length = data.Length;
var typeIds = data.Buffers[0].Span.Slice(offset, length);
var valueOffsets = data.Buffers[1].Span.CastTo<int>().Slice(offset, length);
var childArrays = new IArrowArray[data.Children.Length];
for (var childIdx = 0; childIdx < data.Children.Length; ++childIdx)
{
childArrays[childIdx] = ArrowArrayFactory.BuildArray(data.Children[childIdx]);
}

var nullCount = 0;
for (var i = 0; i < length; ++i)
{
var typeId = typeIds[i];
var valueOffset = valueOffsets[i];
nullCount += childArrays[typeId].IsNull(valueOffset) ? 1 : 0;
}

return nullCount;
}
}
}
21 changes: 21 additions & 0 deletions csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,26 @@ protected override bool FieldIsValid(IArrowArray fieldArray, int index)
{
return fieldArray.IsValid(index);
}

internal new static int ComputeNullCount(ArrayData data)
{
var offset = data.Offset;
var length = data.Length;
var typeIds = data.Buffers[0].Span.Slice(offset, length);
var childArrays = new IArrowArray[data.Children.Length];
for (var childIdx = 0; childIdx < data.Children.Length; ++childIdx)
{
childArrays[childIdx] = ArrowArrayFactory.BuildArray(data.Children[childIdx]);
}

var nullCount = 0;
for (var i = 0; i < data.Length; ++i)
{
var typeId = typeIds[i];
nullCount += childArrays[typeId].IsNull(offset + i) ? 1 : 0;
}

return nullCount;
}
}
}
23 changes: 23 additions & 0 deletions csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,29 @@ public ArrowStreamWriter(Stream baseStream, Schema schema, bool leaveOpen, IpcOp
}
}

private void CreateSelfAndChildrenFieldNodes(ArrayData data)
{
if (data.DataType is NestedType)
{
// flatbuffer struct vectors have to be created in reverse order
for (int i = data.Children.Length - 1; i >= 0; i--)
{
CreateSelfAndChildrenFieldNodes(data.Children[i]);
}
}
Flatbuf.FieldNode.CreateFieldNode(Builder, data.Length, data.GetNullCount());
}

private static int CountAllNodes(IReadOnlyList<Field> fields)
{
int count = 0;
foreach (Field arrowArray in fields)
{
CountSelfAndChildrenNodes(arrowArray.DataType, ref count);
}
return count;
}

private Offset<Flatbuf.BodyCompression> GetBodyCompression()
{
if (_options.CompressionCodec == null)
Expand Down
49 changes: 42 additions & 7 deletions csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,46 @@ public class UnionArrayTests
[InlineData(UnionMode.Sparse)]
[InlineData(UnionMode.Dense)]
public void UnionArray_IsNull(UnionMode mode)
{
var (array, expectedNull) = BuildUnionArray(mode, 100);

for (var i = 0; i < array.Length; ++i)
{
Assert.Equal(expectedNull[i], array.IsNull(i));
Assert.Equal(!expectedNull[i], array.IsValid(i));
}
}

[Theory]
[InlineData(UnionMode.Sparse)]
[InlineData(UnionMode.Dense)]
public void UnionArray_Slice(UnionMode mode)
{
var (array, expectedNull) = BuildUnionArray(mode, 10);

for (var offset = 0; offset < array.Length; ++offset)
{
for (var length = 0; length < array.Length - offset; ++length)
{
var slicedArray = ArrowArrayFactory.Slice(array, offset, length);

var nullCount = 0;
for (var i = 0; i < slicedArray.Length; ++i)
{
// TODO: Shouldn't need to add offset in IsNull/IsValid calls,
// see https://github.com/apache/arrow/issues/41140
Assert.Equal(expectedNull[offset + i], slicedArray.IsNull(offset + i));
Assert.Equal(!expectedNull[offset + i], slicedArray.IsValid(offset + i));
nullCount += expectedNull[offset + i] ? 1 : 0;
}

Assert.True(nullCount == slicedArray.NullCount, $"offset = {offset}, length = {length}");
Assert.Equal(nullCount, slicedArray.NullCount);
}
}
}

private static (UnionArray array, bool[] isNull) BuildUnionArray(UnionMode mode, int length)
{
var fields = new Field[]
{
Expand All @@ -34,7 +74,6 @@ public void UnionArray_IsNull(UnionMode mode)
var typeIds = fields.Select(f => (int) f.DataType.TypeId).ToArray();
var type = new UnionType(fields, typeIds, mode);

const int length = 100;
var nullCount = 0;
var field0Builder = new Int32Array.Builder();
var field1Builder = new FloatArray.Builder();
Expand All @@ -44,7 +83,7 @@ public void UnionArray_IsNull(UnionMode mode)

for (var i = 0; i < length; ++i)
{
var isNull = i % 5 == 0;
var isNull = i % 3 == 0;
expectedNull[i] = isNull;
nullCount += isNull ? 1 : 0;

Expand Down Expand Up @@ -104,10 +143,6 @@ public void UnionArray_IsNull(UnionMode mode)
? new DenseUnionArray(type, length, children, typeIdsBuffer, valuesOffsetBuffer, nullCount)
: new SparseUnionArray(type, length, children, typeIdsBuffer, nullCount);

for (var i = 0; i < length; ++i)
{
Assert.Equal(expectedNull[i], array.IsNull(i));
Assert.Equal(!expectedNull[i], array.IsValid(i));
}
return (array, expectedNull);
}
}

0 comments on commit db10ac8

Please sign in to comment.