From f20c1e815089f9113273393634cf925fde59958a Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Fri, 12 Apr 2024 15:00:02 +1200 Subject: [PATCH] GH-41136: [C#] Recompute null count for sliced arrays on demand (#41144) ### Rationale for this change See #41136. This is required for writing sliced arrays to IPC files and streams. It should also be generally useful too by allowing users to be able to rely on valid null count values for sliced arrays. ### What changes are included in this PR? Adds a new `ArrayData.GetNullCount()` method which computes the actual null count if it is unknown, and changes most uses of `ArrayData.NullCount` to `GetNullCount()`. ### Are these changes tested? Yes, I've extended the existing slicing tests to also verify the null count, and added new tests specifically for union arrays, which needed special handling. ### Are there any user-facing changes? Yes, this is a user-facing behaviour change. I don't expect that it should break any existing code as users are unlikely to rely on the NullCount being -1. * GitHub Issue: #41136 Authored-by: Adam Reeve Signed-off-by: Curt Hagenlocher --- csharp/src/Apache.Arrow/Arrays/Array.cs | 2 +- csharp/src/Apache.Arrow/Arrays/ArrayData.cs | 60 ++++++++++++++++++- .../Arrays/ArrayDataConcatenator.cs | 2 +- .../Apache.Arrow/Arrays/DenseUnionArray.cs | 23 +++++++ csharp/src/Apache.Arrow/Arrays/NullArray.cs | 2 +- .../Apache.Arrow/Arrays/SparseUnionArray.cs | 21 +++++++ csharp/src/Apache.Arrow/Arrays/UnionArray.cs | 12 +++- .../src/Apache.Arrow/C/CArrowArrayExporter.cs | 2 +- .../src/Apache.Arrow/Ipc/ArrowStreamWriter.cs | 2 +- .../Apache.Arrow.Tests/ArrowArrayTests.cs | 15 +++++ .../Apache.Arrow.Tests/UnionArrayTests.cs | 49 ++++++++++++--- 11 files changed, 174 insertions(+), 16 deletions(-) diff --git a/csharp/src/Apache.Arrow/Arrays/Array.cs b/csharp/src/Apache.Arrow/Arrays/Array.cs index 0838134b19c6d..4abe63e05ad83 100644 --- a/csharp/src/Apache.Arrow/Arrays/Array.cs +++ b/csharp/src/Apache.Arrow/Arrays/Array.cs @@ -31,7 +31,7 @@ protected Array(ArrayData data) public int Offset => Data.Offset; - public int NullCount => Data.NullCount; + public int NullCount => Data.GetNullCount(); public ArrowBuffer NullBitmapBuffer => Data.Buffers[0]; diff --git a/csharp/src/Apache.Arrow/Arrays/ArrayData.cs b/csharp/src/Apache.Arrow/Arrays/ArrayData.cs index 55d77f598c4e4..cdb6ed6b39418 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrayData.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrayData.cs @@ -15,7 +15,6 @@ using Apache.Arrow.Memory; using Apache.Arrow.Types; -using Google.FlatBuffers; using System; using System.Collections.Generic; using System.Linq; @@ -28,12 +27,30 @@ public sealed class ArrayData : IDisposable public readonly IArrowType DataType; public readonly int Length; - public readonly int NullCount; + + /// + /// The number of null values in the Array. May be -1 if the null count has not been computed. + /// + public int NullCount; + public readonly int Offset; public readonly ArrowBuffer[] Buffers; public readonly ArrayData[] Children; public readonly ArrayData Dictionary; // Only used for dictionary type + /// + /// Get the number of null values in the Array, computing the count if required. + /// + public int GetNullCount() + { + if (NullCount == RecalculateNullCount) + { + NullCount = ComputeNullCount(); + } + + return NullCount; + } + // This is left for compatibility with lower version binaries // before the dictionary type was supported. public ArrayData( @@ -111,7 +128,25 @@ public ArrayData Slice(int offset, int length) length = Math.Min(Length - offset, length); offset += Offset; - return new ArrayData(DataType, length, RecalculateNullCount, offset, Buffers, Children, Dictionary); + int nullCount; + if (NullCount == 0) + { + nullCount = 0; + } + else if (NullCount == Length) + { + nullCount = length; + } + else if (offset == Offset && length == Length) + { + nullCount = NullCount; + } + else + { + nullCount = RecalculateNullCount; + } + + return new ArrayData(DataType, length, nullCount, offset, Buffers, Children, Dictionary); } public ArrayData Clone(MemoryAllocator allocator = default) @@ -125,5 +160,24 @@ public ArrayData Clone(MemoryAllocator allocator = default) Children?.Select(b => b.Clone(allocator))?.ToArray(), Dictionary?.Clone(allocator)); } + + private int ComputeNullCount() + { + if (DataType.TypeId == ArrowTypeId.Union) + { + return UnionArray.ComputeNullCount(this); + } + + if (Buffers == null || Buffers.Length == 0 || Buffers[0].IsEmpty) + { + return 0; + } + + // Note: Dictionary arrays may be logically null if there is a null in the dictionary values, + // but this isn't accounted for by the IArrowArray.IsNull implementation, + // so we maintain consistency with that behaviour here. + + return Length - BitUtility.CountBits(Buffers[0].Span, Offset, Length); + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs index 698d74e4bac84..84658a5fab812 100644 --- a/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs +++ b/csharp/src/Apache.Arrow/Arrays/ArrayDataConcatenator.cs @@ -71,7 +71,7 @@ public ArrayDataConcatenationVisitor(IReadOnlyList arrayDataList, Mem foreach (ArrayData arrayData in _arrayDataList) { _totalLength += arrayData.Length; - _totalNullCount += arrayData.NullCount; + _totalNullCount += arrayData.GetNullCount(); } } diff --git a/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs b/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs index b6b61c560e482..459a30e22115f 100644 --- a/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/DenseUnionArray.cs @@ -53,5 +53,28 @@ protected override bool FieldIsValid(IArrowArray fieldArray, int index) { return fieldArray.IsValid(ValueOffsets[index]); } + + internal new static int ComputeNullCount(ArrayData data) + { + var offset = data.Offset; + var length = data.Length; + var typeIds = data.Buffers[0].Span.Slice(offset, length); + var valueOffsets = data.Buffers[1].Span.CastTo().Slice(offset, length); + var childArrays = new IArrowArray[data.Children.Length]; + for (var childIdx = 0; childIdx < data.Children.Length; ++childIdx) + { + childArrays[childIdx] = ArrowArrayFactory.BuildArray(data.Children[childIdx]); + } + + var nullCount = 0; + for (var i = 0; i < length; ++i) + { + var typeId = typeIds[i]; + var valueOffset = valueOffsets[i]; + nullCount += childArrays[typeId].IsNull(valueOffset) ? 1 : 0; + } + + return nullCount; + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/NullArray.cs b/csharp/src/Apache.Arrow/Arrays/NullArray.cs index 762540065c929..7f3e183829243 100644 --- a/csharp/src/Apache.Arrow/Arrays/NullArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/NullArray.cs @@ -95,7 +95,7 @@ public NullArray(int length) public int Offset => Data.Offset; - public int NullCount => Data.NullCount; + public int NullCount => Data.GetNullCount(); public void Dispose() { } public bool IsNull(int index) => true; diff --git a/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs b/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs index 07d36e25cfc23..ef55786f01a4a 100644 --- a/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/SparseUnionArray.cs @@ -47,5 +47,26 @@ protected override bool FieldIsValid(IArrowArray fieldArray, int index) { return fieldArray.IsValid(index); } + + internal new static int ComputeNullCount(ArrayData data) + { + var offset = data.Offset; + var length = data.Length; + var typeIds = data.Buffers[0].Span.Slice(offset, length); + var childArrays = new IArrowArray[data.Children.Length]; + for (var childIdx = 0; childIdx < data.Children.Length; ++childIdx) + { + childArrays[childIdx] = ArrowArrayFactory.BuildArray(data.Children[childIdx]); + } + + var nullCount = 0; + for (var i = 0; i < data.Length; ++i) + { + var typeId = typeIds[i]; + nullCount += childArrays[typeId].IsNull(offset + i) ? 1 : 0; + } + + return nullCount; + } } } diff --git a/csharp/src/Apache.Arrow/Arrays/UnionArray.cs b/csharp/src/Apache.Arrow/Arrays/UnionArray.cs index 5fcb276655162..f96f527135351 100644 --- a/csharp/src/Apache.Arrow/Arrays/UnionArray.cs +++ b/csharp/src/Apache.Arrow/Arrays/UnionArray.cs @@ -41,7 +41,7 @@ public abstract class UnionArray : IArrowArray public int Offset => Data.Offset; - public int NullCount => Data.NullCount; + public int NullCount => Data.GetNullCount(); public bool IsValid(int index) => NullCount == 0 || FieldIsValid(Fields[TypeIds[index]], index); @@ -91,6 +91,16 @@ protected static void ValidateMode(UnionMode expected, UnionMode actual) } } + internal static int ComputeNullCount(ArrayData data) + { + return ((UnionType)data.DataType).Mode switch + { + UnionMode.Sparse => SparseUnionArray.ComputeNullCount(data), + UnionMode.Dense => DenseUnionArray.ComputeNullCount(data), + _ => throw new InvalidOperationException("unknown union mode in null count computation") + }; + } + private IReadOnlyList InitializeFields() { IArrowArray[] result = new IArrowArray[Data.Children.Length]; diff --git a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs index 03059eaf5d4df..b241fdfea3bda 100644 --- a/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs +++ b/csharp/src/Apache.Arrow/C/CArrowArrayExporter.cs @@ -115,7 +115,7 @@ private unsafe static void ConvertArray(ExportedAllocationOwner sharedOwner, Arr { cArray->length = array.Length; cArray->offset = array.Offset; - cArray->null_count = array.NullCount; + cArray->null_count = array.NullCount; // The C Data interface allows the null count to be -1 cArray->release = ReleaseArrayPtr; cArray->private_data = MakePrivateData(sharedOwner); diff --git a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs index b002f8c8b1578..7b319b03d790c 100644 --- a/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs +++ b/csharp/src/Apache.Arrow/Ipc/ArrowStreamWriter.cs @@ -376,7 +376,7 @@ private void CreateSelfAndChildrenFieldNodes(ArrayData data) CreateSelfAndChildrenFieldNodes(data.Children[i]); } } - Flatbuf.FieldNode.CreateFieldNode(Builder, data.Length, data.NullCount); + Flatbuf.FieldNode.CreateFieldNode(Builder, data.Length, data.GetNullCount()); } private static int CountAllNodes(IReadOnlyList fields) diff --git a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs index a0e90cbbc7c61..682ebec323dc0 100644 --- a/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/ArrowArrayTests.cs @@ -185,6 +185,7 @@ public void SlicePrimitiveArrayWithNulls() TestSlice(x => x.Append(new DateTime(2019, 1, 1)).Append(new DateTime(2019, 1, 2)).AppendNull().Append(new DateTime(2019, 1, 3))); TestSlice(x => x.Append(10).Append(20).AppendNull().Append(30)); TestSlice(x => x.Append(10).Append(20).AppendNull().Append(30)); + TestSlice(x => x.AppendNull().AppendNull().AppendNull()); // All nulls static void TestNumberSlice() where T : struct, INumber @@ -314,6 +315,8 @@ private void ValidateArrays(PrimitiveArray slicedArray) .SequenceEqual(slicedArray.Values)); Assert.Equal(baseArray.GetValue(slicedArray.Offset), slicedArray.GetValue(0)); + + ValidateNullCount(slicedArray); } private void ValidateArrays(BooleanArray slicedArray) @@ -333,6 +336,8 @@ private void ValidateArrays(BooleanArray slicedArray) #pragma warning disable CS0618 Assert.Equal(baseArray.GetBoolean(slicedArray.Offset), slicedArray.GetBoolean(0)); #pragma warning restore CS0618 + + ValidateNullCount(slicedArray); } private void ValidateArrays(BinaryArray slicedArray) @@ -347,6 +352,16 @@ private void ValidateArrays(BinaryArray slicedArray) .SequenceEqual(slicedArray.ValueOffsets)); Assert.True(baseArray.GetBytes(slicedArray.Offset).SequenceEqual(slicedArray.GetBytes(0))); + + ValidateNullCount(slicedArray); + } + + private static void ValidateNullCount(IArrowArray slicedArray) + { + var expectedNullCount = Enumerable.Range(0, slicedArray.Length) + .Select(i => slicedArray.IsNull(i) ? 1 : 0) + .Sum(); + Assert.Equal(expectedNullCount, slicedArray.NullCount); } } } diff --git a/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs b/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs index 1fb5cf2415c68..45fed722a745c 100644 --- a/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs +++ b/csharp/test/Apache.Arrow.Tests/UnionArrayTests.cs @@ -25,6 +25,46 @@ public class UnionArrayTests [InlineData(UnionMode.Sparse)] [InlineData(UnionMode.Dense)] public void UnionArray_IsNull(UnionMode mode) + { + var (array, expectedNull) = BuildUnionArray(mode, 100); + + for (var i = 0; i < array.Length; ++i) + { + Assert.Equal(expectedNull[i], array.IsNull(i)); + Assert.Equal(!expectedNull[i], array.IsValid(i)); + } + } + + [Theory] + [InlineData(UnionMode.Sparse)] + [InlineData(UnionMode.Dense)] + public void UnionArray_Slice(UnionMode mode) + { + var (array, expectedNull) = BuildUnionArray(mode, 10); + + for (var offset = 0; offset < array.Length; ++offset) + { + for (var length = 0; length < array.Length - offset; ++length) + { + var slicedArray = ArrowArrayFactory.Slice(array, offset, length); + + var nullCount = 0; + for (var i = 0; i < slicedArray.Length; ++i) + { + // TODO: Shouldn't need to add offset in IsNull/IsValid calls, + // see https://github.com/apache/arrow/issues/41140 + Assert.Equal(expectedNull[offset + i], slicedArray.IsNull(offset + i)); + Assert.Equal(!expectedNull[offset + i], slicedArray.IsValid(offset + i)); + nullCount += expectedNull[offset + i] ? 1 : 0; + } + + Assert.True(nullCount == slicedArray.NullCount, $"offset = {offset}, length = {length}"); + Assert.Equal(nullCount, slicedArray.NullCount); + } + } + } + + private static (UnionArray array, bool[] isNull) BuildUnionArray(UnionMode mode, int length) { var fields = new Field[] { @@ -34,7 +74,6 @@ public void UnionArray_IsNull(UnionMode mode) var typeIds = fields.Select(f => (int) f.DataType.TypeId).ToArray(); var type = new UnionType(fields, typeIds, mode); - const int length = 100; var nullCount = 0; var field0Builder = new Int32Array.Builder(); var field1Builder = new FloatArray.Builder(); @@ -44,7 +83,7 @@ public void UnionArray_IsNull(UnionMode mode) for (var i = 0; i < length; ++i) { - var isNull = i % 5 == 0; + var isNull = i % 3 == 0; expectedNull[i] = isNull; nullCount += isNull ? 1 : 0; @@ -104,10 +143,6 @@ public void UnionArray_IsNull(UnionMode mode) ? new DenseUnionArray(type, length, children, typeIdsBuffer, valuesOffsetBuffer, nullCount) : new SparseUnionArray(type, length, children, typeIdsBuffer, nullCount); - for (var i = 0; i < length; ++i) - { - Assert.Equal(expectedNull[i], array.IsNull(i)); - Assert.Equal(!expectedNull[i], array.IsValid(i)); - } + return (array, expectedNull); } }