diff --git a/src/awkward/_v2/behaviors/string.py b/src/awkward/_v2/behaviors/string.py index 4486af1242..d4519a58be 100644 --- a/src/awkward/_v2/behaviors/string.py +++ b/src/awkward/_v2/behaviors/string.py @@ -100,44 +100,48 @@ def __str__(self): # yield x.__bytes__() -# class StringBehavior(ak._v2.highlevel.Array): -# __name__ = "Array" - -# def __iter__(self): -# for x in super(StringBehavior, self).__iter__(): -# yield x.__str__() +class StringBehavior(Array): + __name__ = "Array" + def __iter__(self): + for x in super().__iter__(): + yield x.__str__() -# def _string_equal(one, two): -# nplike = ak.nplike.of(one, two) -# behavior = ak._v2._util.behaviorof(one, two) -# one, two = ak.without_parameters(one).layout, ak.without_parameters(two).layout +def _string_equal(one, two): + nplike = ak.nplike.of(one, two) + behavior = ak._v2._util.behavior_of(one, two) -# # first condition: string lengths must be the same -# counts1 = nplike.asarray(one.count(axis=-1)) -# counts2 = nplike.asarray(two.count(axis=-1)) + one, two = ( + ak._v2.operations.structure.without_parameters(one).layout, + ak._v2.operations.structure.without_parameters(two).layout, + ) -# out = counts1 == counts2 + # first condition: string lengths must be the same + counts1 = nplike.asarray(one.count(axis=-1)) + counts2 = nplike.asarray(two.count(axis=-1)) -# # only compare characters in strings that are possibly equal (same length) -# possible = nplike.logical_and(out, counts1) -# possible_counts = counts1[possible] + out = counts1 == counts2 -# if len(possible_counts) > 0: -# onepossible = one[possible] -# twopossible = two[possible] + # only compare characters in strings that are possibly equal (same length) + possible = nplike.logical_and(out, counts1) + possible_counts = counts1[possible] -# reduced = ak.all(ak.Array(onepossible) == ak.Array(twopossible), axis=-1).layout + if len(possible_counts) > 0: + onepossible = one[possible] + twopossible = two[possible] -# # update same-length strings with a verdict about their characters -# out[possible] = reduced + reduced = ak._v2.operations.reducers.all( + ak._v2.Array(onepossible) == ak._v2.Array(twopossible), axis=-1 + ).layout + # update same-length strings with a verdict about their characters + out[possible] = reduced.data -# return ak._v2._util.wrap(ak._v2.contents.NumpyArray(out), behavior) + return ak._v2._util.wrap(ak._v2.contents.NumpyArray(out), behavior) -# def _string_notequal(one, two): -# return ~_string_equal(one, two) +def _string_notequal(one, two): + return ~_string_equal(one, two) # def _string_broadcast(layout, offsets): @@ -250,7 +254,7 @@ def register(behavior): # behavior[ak.nplike.numpy.equal, "bytestring", "bytestring"] = _string_equal # behavior[ak.nplike.numpy.equal, "string", "string"] = _string_equal # behavior[ak.nplike.numpy.not_equal, "bytestring", "bytestring"] = _string_notequal - # behavior[ak.nplike.numpy.not_equal, "string", "string"] = _string_notequal + behavior[ak.nplike.numpy.not_equal, "string", "string"] = _string_notequal # behavior["__broadcast__", "bytestring"] = _string_broadcast # behavior["__broadcast__", "string"] = _string_broadcast diff --git a/src/awkward/_v2/operations/structure/ak_run_lengths.py b/src/awkward/_v2/operations/structure/ak_run_lengths.py index 8095f01d06..e7c4d023a8 100644 --- a/src/awkward/_v2/operations/structure/ak_run_lengths.py +++ b/src/awkward/_v2/operations/structure/ak_run_lengths.py @@ -6,218 +6,203 @@ def run_lengths(array, highlevel=True, behavior=None): - raise ak._v2._util.error(NotImplementedError) - - -# """ -# Args: -# array: Data containing runs of numbers to count. -# highlevel (bool): If True, return an #ak.Array; otherwise, return -# a low-level #ak.layout.Content subclass. -# behavior (None or dict): Custom #ak.behavior for the output array, if -# high-level. - -# Computes the lengths of sequences of identical values at the deepest level -# of nesting, returning an array with the same structure but with `int64` type. - -# For example, - -# >>> array = ak.Array([1.1, 1.1, 1.1, 2.2, 3.3, 3.3, 4.4, 4.4, 5.5]) -# >>> ak.run_lengths(array) -# - -# There are 3 instances of 1.1, followed by 1 instance of 2.2, 2 instances of 3.3, -# 2 instances of 4.4, and 1 instance of 5.5. - -# The order and uniqueness of the input data doesn't matter, - -# >>> array = ak.Array([1.1, 1.1, 1.1, 5.5, 4.4, 4.4, 1.1, 1.1, 5.5]) -# >>> ak.run_lengths(array) -# - -# just the difference between each value and its neighbors. - -# The data can be nested, but runs don't cross list boundaries. - -# >>> array = ak.Array([[1.1, 1.1, 1.1, 2.2, 3.3], [3.3, 4.4], [4.4, 5.5]]) -# >>> ak.run_lengths(array) -# - -# This function recognizes strings as distinguishable values. - -# >>> array = ak.Array([["one", "one"], ["one", "two", "two"], ["three", "two", "two"]]) -# >>> ak.run_lengths(array) -# - -# Note that this can be combined with #ak.argsort and #ak.unflatten to compute -# a "group by" operation: - -# >>> array = ak.Array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 1, "y": 1.1}, -# ... {"x": 3, "y": 3.3}, {"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}]) -# >>> sorted = array[ak.argsort(array.x)] -# >>> sorted.x -# -# >>> ak.run_lengths(sorted.x) -# -# >>> ak.unflatten(sorted, ak.run_lengths(sorted.x)).tolist() -# [[{'x': 1, 'y': 1.1}, {'x': 1, 'y': 1.1}, {'x': 1, 'y': 1.1}], -# [{'x': 2, 'y': 2.2}, {'x': 2, 'y': 2.2}], -# [{'x': 3, 'y': 3.3}]] - -# Unlike a database "group by," this operation can be applied in bulk to many sublists -# (though the run lengths need to be fully flattened to be used as `counts` for -# #ak.unflatten, and you need to specify `axis=-1` as the depth). - -# >>> array = ak.Array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 1, "y": 1.1}], -# ... [{"x": 3, "y": 3.3}, {"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}]]) -# >>> sorted = array[ak.argsort(array.x)] -# >>> sorted.x -# -# >>> ak.run_lengths(sorted.x) -# -# >>> counts = ak.flatten(ak.run_lengths(sorted.x), axis=None) -# >>> ak.unflatten(sorted, counts, axis=-1).tolist() -# [[[{'x': 1, 'y': 1.1}, {'x': 1, 'y': 1.1}], -# [{'x': 2, 'y': 2.2}]], -# [[{'x': 1, 'y': 1.1}], -# [{'x': 2, 'y': 2.2}], -# [{'x': 3, 'y': 3.3}]]] - -# See also #ak.num, #ak.argsort, #ak.unflatten. -# """ -# nplike = ak.nplike.of(array) - -# def lengths_of(data, offsets): -# if len(data) == 0: -# return nplike.empty(0, np.int64), offsets -# else: -# diffs = data[1:] != data[:-1] -# if isinstance(diffs, ak._v2.highlevel.Array): -# diffs = nplike.asarray(diffs) -# if offsets is not None: -# diffs[offsets[1:-1] - 1] = True -# positions = nplike.nonzero(diffs)[0] -# full_positions = nplike.empty(len(positions) + 2, np.int64) -# full_positions[0] = 0 -# full_positions[-1] = len(data) -# full_positions[1:-1] = positions + 1 -# nextcontent = full_positions[1:] - full_positions[:-1] -# if offsets is None: -# nextoffsets = None -# else: -# nextoffsets = nplike.searchsorted(full_positions, offsets, side="left") -# return nextcontent, nextoffsets - -# def getfunction(layout): -# if layout.branch_depth == (False, 1): -# if isinstance(layout, ak._v2._util.indexedtypes): -# layout = layout.project() - -# if ( -# layout.parameter("__array__") == "string" -# or layout.parameter("__array__") == "bytestring" -# ): -# nextcontent, _ = lengths_of(ak._v2.highlevel.Array(layout), None) -# return lambda: ak._v2.contents.NumpyArray(nextcontent) - -# if not isinstance(layout, (ak._v2.contents.NumpyArray, ak._v2.contents.EmptyArray)): -# raise ak._v2._util.error(NotImplementedError( -# "run_lengths on " -# + type(layout).__name__ -# -# )) - -# nextcontent, _ = lengths_of(nplike.asarray(layout), None) -# return lambda: ak._v2.contents.NumpyArray(nextcontent) - -# elif layout.branch_depth == (False, 2): -# if isinstance(layout, ak._v2._util.indexedtypes): -# layout = layout.project() - -# if not isinstance(layout, ak._v2._util.listtypes): -# raise ak._v2._util.error(NotImplementedError( -# "run_lengths on " -# + type(layout).__name__ -# -# )) - -# if ( -# layout.content.parameter("__array__") == "string" -# or layout.content.parameter("__array__") == "bytestring" -# ): -# listoffsetarray = layout.toListOffsetArray64(False) -# offsets = nplike.asarray(listoffsetarray.offsets) -# content = listoffsetarray.content[offsets[0] : offsets[-1]] - -# if isinstance(content, ak._v2._util.indexedtypes): -# content = content.project() - -# nextcontent, nextoffsets = lengths_of( -# ak._v2.highlevel.Array(content), offsets - offsets[0] -# ) -# return lambda: ak._v2.contents.ListOffsetArray64( -# ak._v2.index.Index64(nextoffsets), ak._v2.contents.NumpyArray(nextcontent) -# ) - -# listoffsetarray = layout.toListOffsetArray64(False) -# offsets = nplike.asarray(listoffsetarray.offsets) -# content = listoffsetarray.content[offsets[0] : offsets[-1]] - -# if isinstance(content, ak._v2._util.indexedtypes): -# content = content.project() - -# if not isinstance(content, (ak._v2.contents.NumpyArray, ak._v2.contents.EmptyArray)): -# raise ak._v2._util.error(NotImplementedError( -# "run_lengths on " -# + type(layout).__name__ -# + " with content " -# + type(content).__name__ -# -# )) - -# nextcontent, nextoffsets = lengths_of( -# nplike.asarray(content), offsets - offsets[0] -# ) -# return lambda: ak._v2.contents.ListOffsetArray64( -# ak._v2.index.Index64(nextoffsets), ak._v2.contents.NumpyArray(nextcontent) -# ) - -# else: -# return None - -# layout = ak._v2.operations.convert.to_layout( -# array, allow_record=False, allow_other=False -# ) - -# if isinstance(layout, ak.partition.PartitionedArray): # NO PARTITIONED ARRAY -# if len(layout.partitions) != 0 and layout.partitions[0].branch_depth == ( -# False, -# 1, -# ): -# out = ak._v2._util.recursively_apply( -# layout.toContent(), -# getfunction, -# pass_depth=False, -# pass_user=False, -# ) -# else: -# outparts = [] -# for part in layout.partitions: -# outparts.append( -# ak._v2._util.recursively_apply( -# part, -# getfunction, -# pass_depth=False, -# pass_user=False, -# ) -# ) -# out = ak.partition.IrregularlyPartitionedArray(outparts) # NO PARTITIONED ARRAY -# else: -# out = ak._v2._util.recursively_apply( -# layout, -# getfunction, -# pass_depth=False, -# pass_user=False, -# ) - -# return ak._v2._util.maybe_wrap_like(out, array, behavior, highlevel) + + """ + Args: + array: Data containing runs of numbers to count. + highlevel (bool): If True, return an #ak.Array; otherwise, return + a low-level #ak.layout.Content subclass. + behavior (None or dict): Custom #ak.behavior for the output array, if + high-level. + + Computes the lengths of sequences of identical values at the deepest level + of nesting, returning an array with the same structure but with `int64` type. + + For example, + + >>> array = ak.Array([1.1, 1.1, 1.1, 2.2, 3.3, 3.3, 4.4, 4.4, 5.5]) + >>> ak.run_lengths(array) + + + There are 3 instances of 1.1, followed by 1 instance of 2.2, 2 instances of 3.3, + 2 instances of 4.4, and 1 instance of 5.5. + + The order and uniqueness of the input data doesn't matter, + + >>> array = ak.Array([1.1, 1.1, 1.1, 5.5, 4.4, 4.4, 1.1, 1.1, 5.5]) + >>> ak.run_lengths(array) + + + just the difference between each value and its neighbors. + + The data can be nested, but runs don't cross list boundaries. + + >>> array = ak.Array([[1.1, 1.1, 1.1, 2.2, 3.3], [3.3, 4.4], [4.4, 5.5]]) + >>> ak.run_lengths(array) + + + This function recognizes strings as distinguishable values. + + >>> array = ak.Array([["one", "one"], ["one", "two", "two"], ["three", "two", "two"]]) + >>> ak.run_lengths(array) + + + Note that this can be combined with #ak.argsort and #ak.unflatten to compute + a "group by" operation: + + >>> array = ak.Array([{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 1, "y": 1.1}, + ... {"x": 3, "y": 3.3}, {"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}]) + >>> sorted = array[ak.argsort(array.x)] + >>> sorted.x + + >>> ak.run_lengths(sorted.x) + + >>> ak.unflatten(sorted, ak.run_lengths(sorted.x)).tolist() + [[{'x': 1, 'y': 1.1}, {'x': 1, 'y': 1.1}, {'x': 1, 'y': 1.1}], + [{'x': 2, 'y': 2.2}, {'x': 2, 'y': 2.2}], + [{'x': 3, 'y': 3.3}]] + + Unlike a database "group by," this operation can be applied in bulk to many sublists + (though the run lengths need to be fully flattened to be used as `counts` for + #ak.unflatten, and you need to specify `axis=-1` as the depth). + + >>> array = ak.Array([[{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 1, "y": 1.1}], + ... [{"x": 3, "y": 3.3}, {"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}]]) + >>> sorted = array[ak.argsort(array.x)] + >>> sorted.x + + >>> ak.run_lengths(sorted.x) + + >>> counts = ak.flatten(ak.run_lengths(sorted.x), axis=None) + >>> ak.unflatten(sorted, counts, axis=-1).tolist() + [[[{'x': 1, 'y': 1.1}, {'x': 1, 'y': 1.1}], + [{'x': 2, 'y': 2.2}]], + [[{'x': 1, 'y': 1.1}], + [{'x': 2, 'y': 2.2}], + [{'x': 3, 'y': 3.3}]]] + + See also #ak.num, #ak.argsort, #ak.unflatten. + """ + with ak._v2._util.OperationErrorContext( + "ak._v2.run_lengths", + dict( + array=array, + highlevel=highlevel, + behavior=behavior, + ), + ): + return _impl(array, highlevel, behavior) + + +def _impl(array, highlevel, behavior): + nplike = ak.nplike.of(array) + + def lengths_of(data, offsets): + if len(data) == 0: + return nplike.empty(0, np.int64), offsets + else: + diffs = data[1:] != data[:-1] + + if isinstance(diffs, ak._v2.highlevel.Array): + diffs = nplike.asarray(diffs) + if offsets is not None: + diffs[offsets[1:-1] - 1] = True + positions = nplike.nonzero(diffs)[0] + full_positions = nplike.empty(len(positions) + 2, np.int64) + full_positions[0] = 0 + full_positions[-1] = len(data) + full_positions[1:-1] = positions + 1 + + nextcontent = full_positions[1:] - full_positions[:-1] + if offsets is None: + nextoffsets = None + else: + nextoffsets = nplike.searchsorted(full_positions, offsets, side="left") + return nextcontent, nextoffsets + + def action(layout, **kwargs): + if layout.branch_depth == (False, 1): + if layout.is_IndexedType: + layout = layout.project() + + if ( + layout.parameter("__array__") == "string" + or layout.parameter("__array__") == "bytestring" + ): + nextcontent, _ = lengths_of(ak._v2.highlevel.Array(layout), None) + return ak._v2.contents.NumpyArray(nextcontent) + + if not isinstance( + layout, (ak._v2.contents.NumpyArray, ak._v2.contents.EmptyArray) + ): + raise ak._v2._util.error( + NotImplementedError("run_lengths on " + type(layout).__name__) + ) + + nextcontent, _ = lengths_of(nplike.asarray(layout), None) + return ak._v2.contents.NumpyArray(nextcontent) + + elif layout.branch_depth == (False, 2): + if layout.is_IndexedType: + layout = layout.project() + + if not layout.is_ListType: + raise ak._v2._util.error( + NotImplementedError("run_lengths on " + type(layout).__name__) + ) + + if ( + layout.content.parameter("__array__") == "string" + or layout.content.parameter("__array__") == "bytestring" + ): + listoffsetarray = layout.toListOffsetArray64(False) + offsets = nplike.asarray(listoffsetarray.offsets) + content = listoffsetarray.content[offsets[0] : offsets[-1]] + + if content.is_IndexedType: + content = content.project() + + nextcontent, nextoffsets = lengths_of( + ak._v2.highlevel.Array(content), offsets - offsets[0] + ) + return ak._v2.contents.ListOffsetArray( + ak._v2.index.Index64(nextoffsets), + ak._v2.contents.NumpyArray(nextcontent), + ) + + listoffsetarray = layout.toListOffsetArray64(False) + offsets = nplike.asarray(listoffsetarray.offsets) + content = listoffsetarray.content[offsets[0] : offsets[-1]] + + if content.is_IndexedType: + content = content.project() + + if not isinstance( + content, (ak._v2.contents.NumpyArray, ak._v2.contents.EmptyArray) + ): + raise ak._v2._util.error( + NotImplementedError( + "run_lengths on " + + type(layout).__name__ + + " with content " + + type(content).__name__ + ) + ) + + nextcontent, nextoffsets = lengths_of( + nplike.asarray(content), offsets - offsets[0] + ) + return ak._v2.contents.ListOffsetArray( + ak._v2.index.Index64(nextoffsets), + ak._v2.contents.NumpyArray(nextcontent), + ) + else: + return None + + layout = ak._v2.operations.convert.to_layout( + array, allow_record=False, allow_other=False + ) + + out = layout.recursively_apply(action) + + return ak._v2._util.wrap(out, behavior, highlevel) diff --git a/tests/v2/test_0733-run_lengths.py b/tests/v2/test_0733-run_lengths.py new file mode 100644 index 0000000000..2c5883a3e4 --- /dev/null +++ b/tests/v2/test_0733-run_lengths.py @@ -0,0 +1,84 @@ +# BSD 3-Clause License; see https://github.com/scikit-hep/awkward-1.0/blob/main/LICENSE + + +import pytest # noqa: F401 +import numpy as np # noqa: F401 +import awkward as ak # noqa: F401 + + +def test(): + array = ak._v2.Array([3, 3, 3, 5, 5, 9, 9, 9, 9, 1, 3, 3]) + assert ak._v2.operations.structure.run_lengths(array).tolist() == [3, 2, 4, 1, 2] + + array = ak._v2.Array([[3, 3, 3, 5], [5], [], [9, 9], [9, 9], [1, 3, 3]]) + assert ak._v2.operations.structure.run_lengths(array).tolist() == [ + [3, 1], + [1], + [], + [2], + [2], + [1, 2], + ] + + +@pytest.mark.skip(reason="ak.unflatten unimplemented") +def test_groupby(): + array = ak._v2.Array( + [ + {"x": 1, "y": 1.1}, + {"x": 2, "y": 2.2}, + {"x": 1, "y": 1.1}, + {"x": 3, "y": 3.3}, + {"x": 1, "y": 1.1}, + {"x": 2, "y": 2.2}, + ] + ) + sorted = array[ak._v2.operations.structure.argsort(array.x)] + assert sorted.x.tolist() == [1, 1, 1, 2, 2, 3] + assert ak._v2.operations.structure.run_lengths(sorted.x).tolist() == [3, 2, 1] + assert ak.unflatten( + sorted, ak._v2.operations.structure.run_lengths(sorted.x) + ).tolist() == [ + [{"x": 1, "y": 1.1}, {"x": 1, "y": 1.1}, {"x": 1, "y": 1.1}], + [{"x": 2, "y": 2.2}, {"x": 2, "y": 2.2}], + [{"x": 3, "y": 3.3}], + ] + + array = ak._v2.Array( + [ + [{"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}, {"x": 1, "y": 1.1}], + [{"x": 3, "y": 3.3}, {"x": 1, "y": 1.1}, {"x": 2, "y": 2.2}], + ] + ) + sorted = array[ak._v2.operations.structure.argsort(array.x)] + assert sorted.x.tolist() == [[1, 1, 2], [1, 2, 3]] + assert ak._v2.operations.structure.run_lengths(sorted.x).tolist() == [ + [2, 1], + [1, 1, 1], + ] + counts = ak._v2.operations.structure.flatten( + ak._v2.operations.structure.run_lengths(sorted.x), axis=None + ) + assert ak.unflatten(sorted, counts, axis=-1).tolist() == [ + [[{"x": 1, "y": 1.1}, {"x": 1, "y": 1.1}], [{"x": 2, "y": 2.2}]], + [[{"x": 1, "y": 1.1}], [{"x": 2, "y": 2.2}], [{"x": 3, "y": 3.3}]], + ] + + +def test_onstrings1(): + data = ak.Array(["one", "one", "one", "two", "two", "three", "two", "two"]) + assert ak.run_lengths(data).tolist() == [3, 2, 1, 2] + + data = ak._v2.Array(["one", "one", "one", "two", "two", "three", "two", "two"]) + assert ak._v2.operations.structure.run_lengths(data).tolist() == [3, 2, 1, 2] + + +def test_onstrings2(): + data = ak._v2.Array( + [["one", "one"], ["one", "two", "two"], ["three", "two", "two"]] + ) + assert ak._v2.operations.structure.run_lengths(data).tolist() == [ + [2], + [1, 2], + [1, 2], + ]