-
Notifications
You must be signed in to change notification settings - Fork 3.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
GH-41136: [C#] Recompute null count for sliced arrays on demand #41144
Changes from all commits
55ce751
9e7bb01
3781080
e18162a
3c9d8b8
8d3e21f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,7 +15,6 @@ | |
|
||
using Apache.Arrow.Memory; | ||
using Apache.Arrow.Types; | ||
using Google.FlatBuffers; | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
|
@@ -28,12 +27,30 @@ public sealed class ArrayData : IDisposable | |
|
||
public readonly IArrowType DataType; | ||
public readonly int Length; | ||
public readonly int NullCount; | ||
|
||
/// <summary> | ||
/// The number of null values in the Array. May be -1 if the null count has not been computed. | ||
/// </summary> | ||
public int NullCount; | ||
|
||
public readonly int Offset; | ||
public readonly ArrowBuffer[] Buffers; | ||
public readonly ArrayData[] Children; | ||
public readonly ArrayData Dictionary; // Only used for dictionary type | ||
|
||
/// <summary> | ||
/// Get the number of null values in the Array, computing the count if required. | ||
/// </summary> | ||
public int GetNullCount() | ||
{ | ||
if (NullCount == RecalculateNullCount) | ||
{ | ||
NullCount = ComputeNullCount(); | ||
} | ||
|
||
return NullCount; | ||
} | ||
|
||
// This is left for compatibility with lower version binaries | ||
// before the dictionary type was supported. | ||
public ArrayData( | ||
|
@@ -111,7 +128,25 @@ public ArrayData Slice(int offset, int length) | |
length = Math.Min(Length - offset, length); | ||
offset += Offset; | ||
|
||
return new ArrayData(DataType, length, RecalculateNullCount, offset, Buffers, Children, Dictionary); | ||
int nullCount; | ||
if (NullCount == 0) | ||
{ | ||
nullCount = 0; | ||
} | ||
else if (NullCount == Length) | ||
{ | ||
nullCount = length; | ||
} | ||
else if (offset == Offset && length == Length) | ||
{ | ||
nullCount = NullCount; | ||
} | ||
else | ||
{ | ||
nullCount = RecalculateNullCount; | ||
} | ||
|
||
return new ArrayData(DataType, length, nullCount, offset, Buffers, Children, Dictionary); | ||
} | ||
|
||
public ArrayData Clone(MemoryAllocator allocator = default) | ||
|
@@ -125,5 +160,24 @@ public ArrayData Clone(MemoryAllocator allocator = default) | |
Children?.Select(b => b.Clone(allocator))?.ToArray(), | ||
Dictionary?.Clone(allocator)); | ||
} | ||
|
||
private int ComputeNullCount() | ||
{ | ||
if (DataType.TypeId == ArrowTypeId.Union) | ||
{ | ||
return UnionArray.ComputeNullCount(this); | ||
} | ||
|
||
if (Buffers == null || Buffers.Length == 0 || Buffers[0].IsEmpty) | ||
{ | ||
return 0; | ||
} | ||
|
||
// Note: Dictionary arrays may be logically null if there is a null in the dictionary values, | ||
// but this isn't accounted for by the IArrowArray.IsNull implementation, | ||
// so we maintain consistency with that behaviour here. | ||
Comment on lines
+176
to
+178
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The C++ implementation actually has a separate |
||
|
||
return Length - BitUtility.CountBits(Buffers[0].Span, Offset, Length); | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -71,7 +71,7 @@ public ArrayDataConcatenationVisitor(IReadOnlyList<ArrayData> arrayDataList, Mem | |
foreach (ArrayData arrayData in _arrayDataList) | ||
{ | ||
_totalLength += arrayData.Length; | ||
_totalNullCount += arrayData.NullCount; | ||
_totalNullCount += arrayData.GetNullCount(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It could make sense to check There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you file a separate bug for this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep sure, I've made #41164 |
||
} | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This would force calculation of the null count on the original array. Is it worth preserving laziness and saying "if we haven't calculated the original null count then mark the sliced null count as requiring calculation"?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good catch, yes definitely
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This was fixed as a side effect of reverting
NullCount
back to being a normal int member.