feat: Use awkward pandas, instead of the existing code that explodes Pandas Dataframes (#734)

* Token change to get PR number
* Revert "Token change to get PR number"

  This reverts commit 5a631b3.
* Complete basic Awkward Pandas port, and start changing tests
* make some of the suggested changes
* Solve some tests
* Finalize tests
* Add awkward-pandas to dev dependencies
* awkward-pandas only supports Python 3.8+.
* Declare awkward-pandas requirement in affected tests.
* Spell it right.
* Get this PR up to date with #784.

Co-authored-by: Jim Pivarski <[email protected]>
scikit-hep · Nov 28, 2022 · e34f66a · e34f66a
1 parent b36a022
commit e34f66a
Show file tree

Hide file tree

Showing 8 changed files with 128 additions and 505 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -55,6 +55,7 @@ dev = [
     "dask[array];python_version >= \"3.8\"",
     "hist>=1.2",
     "pandas",
+    "awkward-pandas;python_version >= \"3.8\"",
 ]
 test = [
     "awkward>=2.0.0rc2",

diff --git a/src/uproot/extras.py b/src/uproot/extras.py
@@ -308,3 +308,18 @@ def dask_awkward():
         ) from err
     else:
         return dask_awkward
+
+
+def awkward_pandas():
+    """
+    Imports and returns ``awkward_pandas``.
+    """
+    try:
+        import awkward_pandas
+    except ModuleNotFoundError as err:
+        raise ModuleNotFoundError(
+            """install the 'awkward-pandas' package with:
+    pip install awkward-pandas # not on conda-forge yet"""
+        ) from err
+    else:
+        return awkward_pandas
diff --git a/src/uproot/interpretation/library.py b/src/uproot/interpretation/library.py
@@ -24,7 +24,6 @@
 
 
 import gc
-import itertools
 import json
 
 import numpy
@@ -792,21 +791,8 @@ def _pandas_only_series(pandas, original_arrays, expression_context):
     arrays = {}
     names = []
     for name, context in expression_context:
-        if isinstance(original_arrays[name], pandas.Series):
-            arrays[_rename(name, context)] = original_arrays[name]
-            names.append(_rename(name, context))
-        else:
-            df = original_arrays[name]
-            for subname in df.columns:
-                if df.leaflist:
-                    if isinstance(subname, tuple):
-                        path = (_rename(name, context),) + subname
-                    else:
-                        path = (_rename(name, context), subname)
-                else:
-                    path = _rename(name, context) + subname
-                arrays[path] = df[subname]
-                names.append(path)
+        arrays[_rename(name, context)] = original_arrays[name]
+        names.append(_rename(name, context))
     return arrays, names
 
 
@@ -858,106 +844,23 @@ def imported(self):
 
     def finalize(self, array, branch, interpretation, entry_start, entry_stop, options):
         pandas = self.imported
+        index = _pandas_basic_index(pandas, entry_start, entry_stop)
 
-        if isinstance(array, uproot.interpretation.objects.StridedObjectArray):
-            arrays = []
-            columns = []
-            _strided_to_pandas((), array.interpretation, array.array, arrays, columns)
-            maxlen = max(len(x) for x in columns)
-            if maxlen == 1:
-                columns = [x[0] for x in columns]
-            else:
-                columns = pandas.MultiIndex.from_tuples(
-                    [x + ("",) * (maxlen - len(x)) for x in columns]
-                )
-            index = _pandas_basic_index(pandas, entry_start, entry_stop)
-            out = pandas.DataFrame(
-                dict(zip(columns, arrays)), columns=columns, index=index
-            )
-            out.leaflist = maxlen != 1
-            return out
-
-        elif isinstance(array, uproot.interpretation.jagged.JaggedArray) and isinstance(
-            array.content, uproot.interpretation.objects.StridedObjectArray
+        if (
+            isinstance(array, numpy.ndarray)
+            and array.dtype.names is None
+            and len(array.shape) == 1
         ):
-            index = pandas.MultiIndex.from_arrays(
-                array.parents_localindex(entry_start, entry_stop),
-                names=["entry", "subentry"],
-            )
-            arrays = []
-            columns = []
-            _strided_to_pandas(
-                (), array.content.interpretation, array.content.array, arrays, columns
+            return pandas.Series(array, index=index)
+        else:
+            awkward_pandas = uproot.extras.awkward_pandas()
+            ak_lib = _libraries[Awkward.name]
+            ak_arr = ak_lib.finalize(
+                array, branch, interpretation, entry_start, entry_stop, options
             )
-            maxlen = max(len(x) for x in columns)
-            if maxlen == 1:
-                columns = [x[0] for x in columns]
-            else:
-                columns = pandas.MultiIndex.from_tuples(
-                    [x + ("",) * (maxlen - len(x)) for x in columns]
-                )
-            out = pandas.DataFrame(
-                dict(zip(columns, arrays)), columns=columns, index=index
+            return pandas.Series(
+                awkward_pandas.AwkwardExtensionArray(ak_arr), index=index
             )
-            out.leaflist = maxlen != 1
-            return out
-
-        elif isinstance(array, uproot.interpretation.jagged.JaggedArray):
-            index = pandas.MultiIndex.from_arrays(
-                array.parents_localindex(entry_start, entry_stop),
-                names=["entry", "subentry"],
-            )
-            return pandas.Series(array.content, index=index)
-
-        elif isinstance(
-            array,
-            (
-                uproot.interpretation.strings.StringArray,
-                uproot.interpretation.objects.ObjectArray,
-            ),
-        ):
-            out = numpy.zeros(len(array), dtype=object)
-            for i, x in enumerate(array):
-                out[i] = x
-            index = _pandas_basic_index(pandas, entry_start, entry_stop)
-            return pandas.Series(out, index=index)
-
-        elif array.dtype.names is not None and len(array.shape) != 1:
-            names = []
-            arrays = {}
-            for n in array.dtype.names:
-                for tup in itertools.product(*[range(d) for d in array.shape[1:]]):
-                    name = (n + "".join("[" + str(x) + "]" for x in tup),)
-                    names.append(name)
-                    arrays[name] = array[n][(slice(None),) + tup]
-            index = _pandas_basic_index(pandas, entry_start, entry_stop)
-            out = pandas.DataFrame(arrays, columns=names, index=index)
-            out.leaflist = True
-            return out
-
-        elif array.dtype.names is not None:
-            columns = pandas.MultiIndex.from_tuples([(x,) for x in array.dtype.names])
-            arrays = {y: array[x] for x, y in zip(array.dtype.names, columns)}
-            index = _pandas_basic_index(pandas, entry_start, entry_stop)
-            out = pandas.DataFrame(arrays, columns=columns, index=index)
-            out.leaflist = True
-            return out
-
-        elif len(array.shape) != 1:
-            names = []
-            arrays = {}
-            for tup in itertools.product(*[range(d) for d in array.shape[1:]]):
-                name = "".join("[" + str(x) + "]" for x in tup)
-                names.append(name)
-                arrays[name] = array[(slice(None),) + tup]
-            index = _pandas_basic_index(pandas, entry_start, entry_stop)
-            out = pandas.DataFrame(arrays, columns=names, index=index)
-            out.leaflist = False
-            return out
-
-        else:
-            index = _pandas_basic_index(pandas, entry_start, entry_stop)
-            return pandas.Series(array, index=index)
 
     def group(self, arrays, expression_context, how):
         pandas = self.imported
@@ -973,130 +876,7 @@ def group(self, arrays, expression_context, how):
 
         elif uproot._util.isstr(how) or how is None:
             arrays, names = _pandas_only_series(pandas, arrays, expression_context)
-
-            if any(isinstance(x, tuple) for x in names):
-                longest = max(len(x) for x in names if isinstance(x, tuple))
-                newarrays, newnames = {}, []
-                for x in names:
-                    if not isinstance(x, tuple):
-                        y = (x,) + ("",) * (longest - 1)
-                    else:
-                        y = x + ("",) * (longest - len(x))
-                    newarrays[y] = arrays[x]
-                    newnames.append(y)
-                arrays = newarrays
-                names = pandas.MultiIndex.from_tuples(newnames)
-
-            if all(_is_pandas_rangeindex(pandas, x.index) for x in arrays.values()):
-                return _pandas_memory_efficient(pandas, arrays, names)
-
-            indexes = []
-            groups = []
-            for name in names:
-                array = arrays[name]
-                if isinstance(array.index, pandas.MultiIndex):
-                    for index, group in zip(indexes, groups):
-                        if numpy.array_equal(array.index, index):
-                            group.append(name)
-                            break
-                    else:
-                        indexes.append(array.index)
-                        groups.append([name])
-            if how is None:
-                flat_index = None
-                dfs = [[] for x in indexes]
-                group_names = [[] for x in indexes]
-                for index, group, df, gn in zip(indexes, groups, dfs, group_names):
-                    for name in names:
-                        array = arrays[name]
-                        if _is_pandas_rangeindex(pandas, array.index):
-                            if flat_index is None or len(flat_index) != len(
-                                array.index
-                            ):
-                                flat_index = pandas.MultiIndex.from_arrays(
-                                    [array.index]
-                                )
-                            # Old versions of Pandas handle the following line poorly:
-                            # should we support them?
-                            #
-                            # >>> pandas.__version__
-                            # '0.22.0'
-                            # >>> from_index = pandas.MultiIndex.from_tuples([(0,), (1,)])
-                            # >>> to_index = pandas.MultiIndex.from_tuples([(0, 0), (0, 1), (0, 2), (1, 0), (1, 1)])
-                            # >>> pandas.Series([1.1, 4.4], index=from_index).reindex(to_index)
-                            # 0  0   NaN
-                            #    1   NaN
-                            #    2   NaN
-                            # 1  0   NaN
-                            #    1   NaN
-                            # dtype: float64
-                            #
-                            # >>> pandas.__version__
-                            # '1.3.2'
-                            # >>> from_index = pandas.MultiIndex.from_tuples([(0,), (1,)])
-                            # >>> to_index = pandas.MultiIndex.from_tuples([(0, 0), (0, 1), (0, 2), (1, 0), (1, 1)])
-                            # >>> pandas.Series([1.1, 4.4], index=from_index).reindex(to_index)
-                            # 0  0    1.1
-                            #    1    1.1
-                            #    2    1.1
-                            # 1  0    4.4
-                            #    1    4.4
-                            # dtype: float64
-                            df.append(
-                                pandas.Series(array.values, index=flat_index).reindex(
-                                    index
-                                )
-                            )
-                            gn.append(name)
-                        elif name in group:
-                            df.append(array)
-                            gn.append(name)
-                out = []
-                for index, df, gn in zip(indexes, dfs, group_names):
-                    out.append(
-                        pandas.DataFrame(
-                            data=dict(zip(gn, df)), index=index, columns=gn
-                        )
-                    )
-                if len(out) == 1:
-                    return out[0]
-                else:
-                    return tuple(out)
-            else:
-                out = None
-                for index, group in zip(indexes, groups):
-                    only = {name: arrays[name] for name in group}
-                    df = pandas.DataFrame(data=only, index=index, columns=group)
-                    if out is None:
-                        out = df
-                    else:
-                        out = pandas.merge(
-                            out, df, how=how, left_index=True, right_index=True
-                        )
-                flat_names = [
-                    name
-                    for name in names
-                    if _is_pandas_rangeindex(pandas, arrays[name].index)
-                ]
-                if len(flat_names) > 0:
-                    flat_index = pandas.MultiIndex.from_arrays(
-                        [arrays[flat_names[0]].index]
-                    )
-                    only = {
-                        name: pandas.Series(arrays[name].values, index=flat_index)
-                        for name in flat_names
-                    }
-                    df = pandas.DataFrame(
-                        data=only, index=flat_index, columns=flat_names
-                    )
-                    out = pandas.merge(
-                        df.reindex(out.index),
-                        out,
-                        how=how,
-                        left_index=True,
-                        right_index=True,
-                    )
-                return out
+            return _pandas_memory_efficient(pandas, arrays, names)
 
         else:
             raise TypeError(
@@ -1108,30 +888,18 @@ def group(self, arrays, expression_context, how):
     def global_index(self, arrays, global_offset):
         if isinstance(arrays, tuple):
             return tuple(self.global_index(x, global_offset) for x in arrays)
+        elif isinstance(arrays, list):
+            return list(self.global_index(x, global_offset) for x in arrays)
 
-        if type(arrays.index).__name__ == "MultiIndex":
-            if hasattr(arrays.index.levels[0], "arrays"):
-                index = arrays.index.levels[0].arrays  # pandas>=0.24.0
-            else:
-                index = arrays.index.levels[0].values  # pandas<0.24.0
-            numpy.add(index, global_offset, out=index)
-
-        elif type(arrays.index).__name__ == "RangeIndex":
-            if hasattr(arrays.index, "start") and hasattr(arrays.index, "stop"):
-                index_start = arrays.index.start  # pandas>=0.25.0
-                index_stop = arrays.index.stop
-            else:
-                index_start = arrays.index._start  # pandas<0.25.0
-                index_stop = arrays.index._stop
+        if type(arrays.index).__name__ == "RangeIndex":
+            index_start = arrays.index.start
+            index_stop = arrays.index.stop
             arrays.index = type(arrays.index)(
                 index_start + global_offset, index_stop + global_offset
             )
 
         else:
-            if hasattr(arrays.index, "arrays"):
-                index = arrays.index.arrays  # pandas>=0.24.0
-            else:
-                index = arrays.index.values  # pandas<0.24.0
+            index = arrays.index.arrays
             numpy.add(index, global_offset, out=index)
 
         return arrays
@@ -1217,45 +985,3 @@ def _regularize_library(library):
                 """try "np" (NumPy), "ak" (Awkward Array), or "pd" (Pandas) """
                 """instead""".format(repr(library))
             ) from err
-
-
-_libraries_lazy = {Awkward.name: _libraries[Awkward.name]}
-
-_libraries_lazy["awkward1"] = _libraries_lazy[Awkward.name]
-_libraries_lazy["Awkward1"] = _libraries_lazy[Awkward.name]
-_libraries_lazy["AWKWARD1"] = _libraries_lazy[Awkward.name]
-_libraries_lazy["awkward"] = _libraries_lazy[Awkward.name]
-_libraries_lazy["Awkward"] = _libraries_lazy[Awkward.name]
-_libraries_lazy["AWKWARD"] = _libraries_lazy[Awkward.name]
-
-
-def _regularize_library_lazy(library):
-    if isinstance(library, Library):
-        if library.name in _libraries_lazy:
-            return _libraries_lazy[library.name]
-        else:
-            raise ValueError(
-                "library {} ({}) cannot be used in this function".format(
-                    type(library).__name__, repr(library.name)
-                )
-            )
-
-    elif isinstance(library, type) and issubclass(library, Library):
-        if library().name in _libraries_lazy:
-            return _libraries_lazy[library().name]
-        else:
-            raise ValueError(
-                "library {} ({}) cannot be used in this function".format(
-                    library.__name__, repr(library().name)
-                )
-            )
-
-    else:
-        try:
-            return _libraries_lazy[library]
-        except KeyError as err:
-            raise ValueError(
-                """library {} not recognized (for this function); """
-                """try "ak" (Awkward Array) """
-                """instead""".format(repr(library))
-            ) from err