Skip to content

Commit

Permalink
feat: Use awkward pandas, instead of the existing code that explodes …
Browse files Browse the repository at this point in the history
…Pandas Dataframes (#734)

* Token change to get PR number

* Revert "Token change to get PR number"

This reverts commit 5a631b3.

* Complete basic Awkward Pandas port, and start changing tests

* make some of the suggested changes

* Solve some tests

* Finalize tests

* Add awkward-pandas to dev dependencies

* awkward-pandas only supports Python 3.8+.

* Declare awkward-pandas requirement in affected tests.

* Spell it right.

* Get this PR up to date with #784.

Co-authored-by: Jim Pivarski <[email protected]>
  • Loading branch information
kkothari2001 and jpivarski authored Nov 28, 2022
1 parent b36a022 commit e34f66a
Show file tree
Hide file tree
Showing 8 changed files with 128 additions and 505 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ dev = [
"dask[array];python_version >= \"3.8\"",
"hist>=1.2",
"pandas",
"awkward-pandas;python_version >= \"3.8\"",
]
test = [
"awkward>=2.0.0rc2",
Expand Down
15 changes: 15 additions & 0 deletions src/uproot/extras.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,3 +308,18 @@ def dask_awkward():
) from err
else:
return dask_awkward


def awkward_pandas():
    """
    Imports and returns ``awkward_pandas``.

    The import happens inside the function, so the ``awkward-pandas``
    package is only needed when this function is actually called.

    Returns:
        The imported ``awkward_pandas`` module.

    Raises:
        ModuleNotFoundError: If ``awkward_pandas`` cannot be imported. The
            message contains the ``pip`` command that installs it, and the
            original import error is chained as the cause.
    """
    try:
        import awkward_pandas
    except ModuleNotFoundError as err:
        # Re-raise with an actionable installation hint, keeping the
        # original failure attached via ``from err``.
        raise ModuleNotFoundError(
            "install the 'awkward-pandas' package with:\n"
            "pip install awkward-pandas # not on conda-forge yet"
        ) from err
    else:
        return awkward_pandas
318 changes: 22 additions & 296 deletions src/uproot/interpretation/library.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@


import gc
import itertools
import json

import numpy
Expand Down Expand Up @@ -792,21 +791,8 @@ def _pandas_only_series(pandas, original_arrays, expression_context):
arrays = {}
names = []
for name, context in expression_context:
if isinstance(original_arrays[name], pandas.Series):
arrays[_rename(name, context)] = original_arrays[name]
names.append(_rename(name, context))
else:
df = original_arrays[name]
for subname in df.columns:
if df.leaflist:
if isinstance(subname, tuple):
path = (_rename(name, context),) + subname
else:
path = (_rename(name, context), subname)
else:
path = _rename(name, context) + subname
arrays[path] = df[subname]
names.append(path)
arrays[_rename(name, context)] = original_arrays[name]
names.append(_rename(name, context))
return arrays, names


Expand Down Expand Up @@ -858,106 +844,23 @@ def imported(self):

def finalize(self, array, branch, interpretation, entry_start, entry_stop, options):
pandas = self.imported
index = _pandas_basic_index(pandas, entry_start, entry_stop)

if isinstance(array, uproot.interpretation.objects.StridedObjectArray):
arrays = []
columns = []
_strided_to_pandas((), array.interpretation, array.array, arrays, columns)
maxlen = max(len(x) for x in columns)
if maxlen == 1:
columns = [x[0] for x in columns]
else:
columns = pandas.MultiIndex.from_tuples(
[x + ("",) * (maxlen - len(x)) for x in columns]
)
index = _pandas_basic_index(pandas, entry_start, entry_stop)
out = pandas.DataFrame(
dict(zip(columns, arrays)), columns=columns, index=index
)
out.leaflist = maxlen != 1
return out

elif isinstance(array, uproot.interpretation.jagged.JaggedArray) and isinstance(
array.content, uproot.interpretation.objects.StridedObjectArray
if (
isinstance(array, numpy.ndarray)
and array.dtype.names is None
and len(array.shape) == 1
):
index = pandas.MultiIndex.from_arrays(
array.parents_localindex(entry_start, entry_stop),
names=["entry", "subentry"],
)
arrays = []
columns = []
_strided_to_pandas(
(), array.content.interpretation, array.content.array, arrays, columns
return pandas.Series(array, index=index)
else:
awkward_pandas = uproot.extras.awkward_pandas()
ak_lib = _libraries[Awkward.name]
ak_arr = ak_lib.finalize(
array, branch, interpretation, entry_start, entry_stop, options
)
maxlen = max(len(x) for x in columns)
if maxlen == 1:
columns = [x[0] for x in columns]
else:
columns = pandas.MultiIndex.from_tuples(
[x + ("",) * (maxlen - len(x)) for x in columns]
)
out = pandas.DataFrame(
dict(zip(columns, arrays)), columns=columns, index=index
return pandas.Series(
awkward_pandas.AwkwardExtensionArray(ak_arr), index=index
)
out.leaflist = maxlen != 1
return out

elif isinstance(array, uproot.interpretation.jagged.JaggedArray):
index = pandas.MultiIndex.from_arrays(
array.parents_localindex(entry_start, entry_stop),
names=["entry", "subentry"],
)
return pandas.Series(array.content, index=index)

elif isinstance(
array,
(
uproot.interpretation.strings.StringArray,
uproot.interpretation.objects.ObjectArray,
),
):
out = numpy.zeros(len(array), dtype=object)
for i, x in enumerate(array):
out[i] = x
index = _pandas_basic_index(pandas, entry_start, entry_stop)
return pandas.Series(out, index=index)

elif array.dtype.names is not None and len(array.shape) != 1:
names = []
arrays = {}
for n in array.dtype.names:
for tup in itertools.product(*[range(d) for d in array.shape[1:]]):
name = (n + "".join("[" + str(x) + "]" for x in tup),)
names.append(name)
arrays[name] = array[n][(slice(None),) + tup]
index = _pandas_basic_index(pandas, entry_start, entry_stop)
out = pandas.DataFrame(arrays, columns=names, index=index)
out.leaflist = True
return out

elif array.dtype.names is not None:
columns = pandas.MultiIndex.from_tuples([(x,) for x in array.dtype.names])
arrays = {y: array[x] for x, y in zip(array.dtype.names, columns)}
index = _pandas_basic_index(pandas, entry_start, entry_stop)
out = pandas.DataFrame(arrays, columns=columns, index=index)
out.leaflist = True
return out

elif len(array.shape) != 1:
names = []
arrays = {}
for tup in itertools.product(*[range(d) for d in array.shape[1:]]):
name = "".join("[" + str(x) + "]" for x in tup)
names.append(name)
arrays[name] = array[(slice(None),) + tup]
index = _pandas_basic_index(pandas, entry_start, entry_stop)
out = pandas.DataFrame(arrays, columns=names, index=index)
out.leaflist = False
return out

else:
index = _pandas_basic_index(pandas, entry_start, entry_stop)
return pandas.Series(array, index=index)

def group(self, arrays, expression_context, how):
pandas = self.imported
Expand All @@ -973,130 +876,7 @@ def group(self, arrays, expression_context, how):

elif uproot._util.isstr(how) or how is None:
arrays, names = _pandas_only_series(pandas, arrays, expression_context)

if any(isinstance(x, tuple) for x in names):
longest = max(len(x) for x in names if isinstance(x, tuple))
newarrays, newnames = {}, []
for x in names:
if not isinstance(x, tuple):
y = (x,) + ("",) * (longest - 1)
else:
y = x + ("",) * (longest - len(x))
newarrays[y] = arrays[x]
newnames.append(y)
arrays = newarrays
names = pandas.MultiIndex.from_tuples(newnames)

if all(_is_pandas_rangeindex(pandas, x.index) for x in arrays.values()):
return _pandas_memory_efficient(pandas, arrays, names)

indexes = []
groups = []
for name in names:
array = arrays[name]
if isinstance(array.index, pandas.MultiIndex):
for index, group in zip(indexes, groups):
if numpy.array_equal(array.index, index):
group.append(name)
break
else:
indexes.append(array.index)
groups.append([name])
if how is None:
flat_index = None
dfs = [[] for x in indexes]
group_names = [[] for x in indexes]
for index, group, df, gn in zip(indexes, groups, dfs, group_names):
for name in names:
array = arrays[name]
if _is_pandas_rangeindex(pandas, array.index):
if flat_index is None or len(flat_index) != len(
array.index
):
flat_index = pandas.MultiIndex.from_arrays(
[array.index]
)
# Old versions of Pandas handle the following line poorly:
# should we support them?
#
# >>> pandas.__version__
# '0.22.0'
# >>> from_index = pandas.MultiIndex.from_tuples([(0,), (1,)])
# >>> to_index = pandas.MultiIndex.from_tuples([(0, 0), (0, 1), (0, 2), (1, 0), (1, 1)])
# >>> pandas.Series([1.1, 4.4], index=from_index).reindex(to_index)
# 0 0 NaN
# 1 NaN
# 2 NaN
# 1 0 NaN
# 1 NaN
# dtype: float64
#
# >>> pandas.__version__
# '1.3.2'
# >>> from_index = pandas.MultiIndex.from_tuples([(0,), (1,)])
# >>> to_index = pandas.MultiIndex.from_tuples([(0, 0), (0, 1), (0, 2), (1, 0), (1, 1)])
# >>> pandas.Series([1.1, 4.4], index=from_index).reindex(to_index)
# 0 0 1.1
# 1 1.1
# 2 1.1
# 1 0 4.4
# 1 4.4
# dtype: float64
df.append(
pandas.Series(array.values, index=flat_index).reindex(
index
)
)
gn.append(name)
elif name in group:
df.append(array)
gn.append(name)
out = []
for index, df, gn in zip(indexes, dfs, group_names):
out.append(
pandas.DataFrame(
data=dict(zip(gn, df)), index=index, columns=gn
)
)
if len(out) == 1:
return out[0]
else:
return tuple(out)
else:
out = None
for index, group in zip(indexes, groups):
only = {name: arrays[name] for name in group}
df = pandas.DataFrame(data=only, index=index, columns=group)
if out is None:
out = df
else:
out = pandas.merge(
out, df, how=how, left_index=True, right_index=True
)
flat_names = [
name
for name in names
if _is_pandas_rangeindex(pandas, arrays[name].index)
]
if len(flat_names) > 0:
flat_index = pandas.MultiIndex.from_arrays(
[arrays[flat_names[0]].index]
)
only = {
name: pandas.Series(arrays[name].values, index=flat_index)
for name in flat_names
}
df = pandas.DataFrame(
data=only, index=flat_index, columns=flat_names
)
out = pandas.merge(
df.reindex(out.index),
out,
how=how,
left_index=True,
right_index=True,
)
return out
return _pandas_memory_efficient(pandas, arrays, names)

else:
raise TypeError(
Expand All @@ -1108,30 +888,18 @@ def group(self, arrays, expression_context, how):
def global_index(self, arrays, global_offset):
if isinstance(arrays, tuple):
return tuple(self.global_index(x, global_offset) for x in arrays)
elif isinstance(arrays, list):
return list(self.global_index(x, global_offset) for x in arrays)

if type(arrays.index).__name__ == "MultiIndex":
if hasattr(arrays.index.levels[0], "arrays"):
index = arrays.index.levels[0].arrays # pandas>=0.24.0
else:
index = arrays.index.levels[0].values # pandas<0.24.0
numpy.add(index, global_offset, out=index)

elif type(arrays.index).__name__ == "RangeIndex":
if hasattr(arrays.index, "start") and hasattr(arrays.index, "stop"):
index_start = arrays.index.start # pandas>=0.25.0
index_stop = arrays.index.stop
else:
index_start = arrays.index._start # pandas<0.25.0
index_stop = arrays.index._stop
if type(arrays.index).__name__ == "RangeIndex":
index_start = arrays.index.start
index_stop = arrays.index.stop
arrays.index = type(arrays.index)(
index_start + global_offset, index_stop + global_offset
)

else:
if hasattr(arrays.index, "arrays"):
index = arrays.index.arrays # pandas>=0.24.0
else:
index = arrays.index.values # pandas<0.24.0
index = arrays.index.arrays
numpy.add(index, global_offset, out=index)

return arrays
Expand Down Expand Up @@ -1217,45 +985,3 @@ def _regularize_library(library):
"""try "np" (NumPy), "ak" (Awkward Array), or "pd" (Pandas) """
"""instead""".format(repr(library))
) from err


# Libraries that ``_regularize_library_lazy`` will accept: the Awkward library
# object under its own name, plus the spelling aliases registered below.
_libraries_lazy = {Awkward.name: _libraries[Awkward.name]}

# Every alias maps to the very same library object as ``Awkward.name``.
_libraries_lazy.update(
    dict.fromkeys(
        (
            "awkward1",
            "Awkward1",
            "AWKWARD1",
            "awkward",
            "Awkward",
            "AWKWARD",
        ),
        _libraries_lazy[Awkward.name],
    )
)


def _regularize_library_lazy(library):
    """
    Resolves ``library`` to an entry of ``_libraries_lazy``.

    Args:
        library: One of

            * a ``Library`` instance, looked up by its ``name``;
            * a ``Library`` subclass, instantiated once and looked up by the
              instance's ``name``;
            * anything else, used directly as a key of ``_libraries_lazy``.

    Returns:
        The matching value stored in ``_libraries_lazy``.

    Raises:
        ValueError: If a ``Library`` instance or subclass has a name that is
            not registered in ``_libraries_lazy``, or if a key of any other
            kind is not found there (the ``KeyError`` is chained as cause).
    """
    if isinstance(library, Library):
        name = library.name
        type_name = type(library).__name__

    elif isinstance(library, type) and issubclass(library, Library):
        # Instantiate a single time; the name is needed for the membership
        # test, the lookup and (on failure) the error message.
        name = library().name
        type_name = library.__name__

    else:
        try:
            return _libraries_lazy[library]
        except KeyError as err:
            raise ValueError(
                """library {} not recognized (for this function); """
                """try "ak" (Awkward Array) """
                """instead""".format(repr(library))
            ) from err

    if name in _libraries_lazy:
        return _libraries_lazy[name]

    raise ValueError(
        "library {} ({}) cannot be used in this function".format(
            type_name, repr(name)
        )
    )
Loading

0 comments on commit e34f66a

Please sign in to comment.