Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

C++ refactoring: unique and is_unique #1111

Merged
merged 19 commits into from
Nov 9, 2021
447 changes: 447 additions & 0 deletions kernel-specification.yml

Large diffs are not rendered by default.

16 changes: 16 additions & 0 deletions src/awkward/_v2/contents/bitmaskedarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,22 @@ def mergemany(self, others):
def _localindex(self, axis, depth):
return self.toByteMaskedArray()._localindex(axis, depth)

def _is_unique(self, negaxis, starts, parents, outlength):
if len(self._mask) == 0:
return True
return self.toIndexedOptionArray64()._is_unique(
negaxis, starts, parents, outlength
)

def _unique(self, negaxis, starts, parents, outlength):
if len(self._mask) == 0:
return self
out = self.toIndexedOptionArray64()._unique(negaxis, starts, parents, outlength)
if negaxis is None:
return out
else:
return out._content

def _argsort_next(
self,
negaxis,
Expand Down
14 changes: 14 additions & 0 deletions src/awkward/_v2/contents/bytemaskedarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,20 @@ def _localindex(self, axis, depth):
)
return out2.simplify_optiontype()

def _is_unique(self, negaxis, starts, parents, outlength):
if len(self._mask) == 0:
return True
return self.toIndexedOptionArray64()._is_unique(
negaxis, starts, parents, outlength
)

def _unique(self, negaxis, starts, parents, outlength):
if len(self._mask) == 0:
return self
return self.toIndexedOptionArray64()._unique(
negaxis, starts, parents, outlength
)

def _argsort_next(
self,
negaxis,
Expand Down
46 changes: 46 additions & 0 deletions src/awkward/_v2/contents/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -924,6 +924,52 @@ def validityerror(self, path="layout"):
def purelist_parameter(self, key):
return self.Form.purelist_parameter(self, key)

def is_unique(self, axis=None):
    """Entry point for the uniqueness check; dispatches to ``_is_unique``.

    ``axis`` is negated before being passed on, with ``None`` passed
    through unchanged. The call is seeded with a single zero start, an
    all-zero parents index of ``len(self)`` entries, and an output
    length of 1.
    """
    if axis is None:
        negaxis = None
    else:
        negaxis = -axis

    starts = ak._v2.index.Index64.zeros(1, self.nplike)
    parents = ak._v2.index.Index64.zeros(len(self), self.nplike)
    return self._is_unique(negaxis, starts, parents, 1)

def unique(self, axis=None):
    """Entry point for computing unique values; dispatches to ``_unique``.

    Only ``axis=None`` and ``axis=-1`` are accepted; any other value
    raises ``np.AxisError``. A non-``None`` axis is negated and checked
    against ``self.branch_depth`` before the call. The call is seeded
    with a single zero start, an all-zero parents index of ``len(self)``
    entries, and an output length of 1.
    """
    # Reject unsupported axes up front.
    if not (axis == -1 or axis is None):
        raise np.AxisError(
            "unique expects axis 'None' or '-1', got axis={0} that is not supported yet".format(
                axis
            )
        )

    negaxis = None if axis is None else -axis

    if negaxis is not None:
        branch, depth = self.branch_depth
        if branch:
            if negaxis <= 0:
                raise np.AxisError(
                    "cannot use non-negative axis on a nested list structure "
                    "of variable depth (negative axis counts from the leaves "
                    "of the tree; non-negative from the root)"
                )
            if negaxis > depth:
                raise np.AxisError(
                    "cannot use axis={0} on a nested list structure that splits into "
                    "different depths, the minimum of which is depth={1} from the leaves".format(
                        axis, depth
                    )
                )
        else:
            if negaxis <= 0:
                negaxis += depth
            if negaxis <= 0 or negaxis > depth:
                raise np.AxisError(
                    "axis={0} exceeds the depth of this array ({1})".format(
                        axis, depth
                    )
                )

    starts = ak._v2.index.Index64.zeros(1, self.nplike)
    parents = ak._v2.index.Index64.zeros(len(self), self.nplike)

    return self._unique(negaxis, starts, parents, 1)

@property
def purelist_isregular(self):
return self.Form.purelist_isregular.__get__(self)
Expand Down
6 changes: 6 additions & 0 deletions src/awkward/_v2/contents/emptyarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,12 @@ def mergemany(self, others):
def _localindex(self, axis, depth):
return ak._v2.contents.numpyarray.NumpyArray(np.empty(0, np.int64))

def _is_unique(self, negaxis, starts, parents, outlength):
return True

def _unique(self, negaxis, starts, parents, outlength):
return self

def _argsort_next(
self,
negaxis,
Expand Down
190 changes: 190 additions & 0 deletions src/awkward/_v2/contents/indexedarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,196 @@ def _localindex(self, axis, depth):
else:
return self.project()._localindex(posaxis, depth)

def _unique_index(self, index, sorted=True):
    """Return the de-duplicated entries of ``self._index``.

    With ``sorted`` true, the ``awkward_unique_copy`` kernel writes the
    result from ``self._index`` into a freshly allocated buffer. With
    ``sorted`` false, ``self._index`` is first passed through the
    ``awkward_sort`` kernel and then through ``awkward_unique``, both
    operating on the same buffer. Either way a one-element index receives
    the number of entries kept, and the result is sliced to that length.

    NOTE(review): the ``index`` argument is never read; the method always
    works on ``self._index``. Confirm whether ``index`` should be used.
    """
    nplike = self.nplike

    # Allocated up front for both paths; the unsorted path rebinds it.
    deduped = ak._v2.index.Index64.empty(len(self), nplike)
    count = ak._v2.index.Index64.empty(1, nplike)

    if sorted:
        copy_kernel = nplike[
            "awkward_unique_copy",
            self._index.dtype.type,
            deduped.dtype.type,
            count.dtype.type,
        ]
        self._handle_error(
            copy_kernel(
                self._index.to(nplike),
                deduped.to(nplike),
                len(self._index),
                count.to(nplike),
            )
        )
    else:
        # NOTE(review): source and destination of the sort are the same
        # object as self._index, so this path appears to reorder
        # self._index in place -- confirm that is acceptable.
        deduped = self._index

        # Two-entry offsets [0, len] presenting the index as one segment.
        offsets = ak._v2.index.Index64.zeros(2, nplike)
        offsets[1] = len(deduped)

        sort_kernel = nplike[
            "awkward_sort",
            deduped.dtype.type,
            deduped.dtype.type,
            offsets.dtype.type,
        ]
        # The trailing True/False flags are presumably ascending/stable --
        # TODO confirm against the kernel specification.
        self._handle_error(
            sort_kernel(
                deduped.to(nplike),
                deduped.to(nplike),
                offsets[1],
                offsets.to(nplike),
                2,
                offsets[1],
                True,
                False,
            )
        )

        unique_kernel = nplike[
            "awkward_unique", deduped.dtype.type, count.dtype.type
        ]
        self._handle_error(
            unique_kernel(
                deduped.to(nplike),
                len(self._index),
                count.to(nplike),
            )
        )

    return deduped[0 : count[0]]

def _is_unique(self, negaxis, starts, parents, outlength):
if len(self._index) == 0:
return True

nextindex = self._unique_index(self._index)

next = self._content._carry(nextindex, False, NestedIndexError)
return next._is_unique(negaxis, starts, parents, outlength)

def _unique(self, negaxis, starts, parents, outlength):
if len(self._index) == 0:
return self

nplike = self.nplike
branch, depth = self.branch_depth

index_length = len(self._index)
parents_length = len(parents)
next_length = index_length

nextcarry = ak._v2.index.Index64.zeros(index_length, nplike)
nextparents = ak._v2.index.Index64.zeros(index_length, nplike)
outindex = ak._v2.index.Index64.zeros(index_length, nplike)
self._handle_error(
nplike[
"awkward_IndexedArray_reduce_next_64",
nextcarry.dtype.type,
nextparents.dtype.type,
outindex.dtype.type,
self._index.dtype.type,
parents.dtype.type,
](
nextcarry.to(nplike),
nextparents.to(nplike),
outindex.to(nplike),
self._index.to(nplike),
parents.to(nplike),
index_length,
)
)
next = self._content._carry(nextcarry, False, NestedIndexError)
unique = next._unique(
negaxis,
starts,
nextparents,
outlength,
)

if branch or (negaxis is not None and negaxis != depth):
nextoutindex = ak._v2.index.Index64.empty(parents_length, nplike)
self._handle_error(
nplike[
"awkward_IndexedArray_local_preparenext_64",
nextoutindex.dtype.type,
starts.dtype.type,
parents.dtype.type,
nextparents.dtype.type,
](
nextoutindex.to(nplike),
starts.to(nplike),
parents.to(nplike),
parents_length,
nextparents.to(nplike),
next_length,
)
)

out = ak._v2.contents.IndexedOptionArray(
nextoutindex,
unique,
None,
self._parameters,
).simplify_optiontype()

return out

if not branch and negaxis == depth:
return unique
else:

if isinstance(unique, ak._v2.contents.RegularArray):
unique = unique.toListOffsetArray64(True)

elif isinstance(unique, ak._v2.contents.ListOffsetArray):
if len(starts) > 0 and starts[0] != 0:
raise AssertionError(
"reduce_next with unbranching depth > negaxis expects a "
"ListOffsetArray64 whose offsets start at zero ({0})".format(
starts[0]
)
)

outoffsets = ak._v2.index.Index64.empty(len(starts) + 1, nplike)
self._handle_error(
nplike[
"awkward_IndexedArray_reduce_next_fix_offsets_64",
outoffsets.dtype.type,
starts.dtype.type,
](
outoffsets.to(nplike),
starts.to(nplike),
len(starts),
len(self._index),
)
)

tmp = ak._v2.contents.IndexedArray(
outindex,
unique._content,
None,
None,
)

return ak._v2.contents.ListOffsetArray(
outoffsets,
tmp,
None,
None,
)

elif isinstance(unique, ak._v2.contents.NumpyArray):
nextoutindex = ak._v2.index.Index64.empty(len(unique), nplike)
# FIXME: move to kernel
for i in range(len(unique)):
nextoutindex[i] = i

out = ak._v2.contents.IndexedOptionArray(
nextoutindex,
unique,
None,
self._parameters,
).simplify_optiontype()

return out

raise NotImplementedError

def _argsort_next(
self,
negaxis,
Expand Down
Loading