Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add 'interp_options' mechanism and ak_add_doc. #784

Merged
merged 2 commits into from
Nov 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
67 changes: 53 additions & 14 deletions src/uproot/_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def dask(
full_paths=False,
step_size="100 MB",
library="ak",
ak_add_doc=False,
custom_classes=None,
allow_missing=False,
open_files=True,
Expand Down Expand Up @@ -47,6 +48,8 @@ def dask(
such as "100 MB".
library (str or :doc:`uproot.interpretation.library.Library`): The library
that is used to represent arrays.
ak_add_doc (bool): If True and ``library="ak"``, add the TBranch ``title``
to the Awkward ``__doc__`` parameter of the array.
custom_classes (None or dict): If a dict, override the classes from
the :doc:`uproot.reading.ReadOnlyFile` or ``uproot.classes``.
allow_missing (bool): If True, skip over any files that do not contain
Expand Down Expand Up @@ -110,6 +113,8 @@ def dask(

filter_branch = uproot._util.regularize_filter(filter_branch)

interp_options = {"ak_add_doc": ak_add_doc}

if library.name == "np":
if open_files:
return _get_dask_array(
Expand All @@ -123,6 +128,7 @@ def dask(
custom_classes,
allow_missing,
real_options,
interp_options,
)
else:
return _get_dask_array_delay_open(
Expand All @@ -135,6 +141,7 @@ def dask(
custom_classes,
allow_missing,
real_options,
interp_options,
)
elif library.name == "ak":
if open_files:
Expand All @@ -149,6 +156,7 @@ def dask(
custom_classes,
allow_missing,
real_options,
interp_options,
)
else:
return _get_dak_array_delay_open(
Expand All @@ -161,6 +169,7 @@ def dask(
custom_classes,
allow_missing,
real_options,
interp_options,
)
else:
raise NotImplementedError()
Expand Down Expand Up @@ -312,23 +321,30 @@ def _dask_array_from_map(


class _UprootReadNumpy:
def __init__(self, ttrees, key) -> None:
def __init__(self, ttrees, key, interp_options) -> None:
self.ttrees = ttrees
self.key = key
self.interp_options = interp_options

def __call__(self, i_start_stop):
i, start, stop = i_start_stop
return self.ttrees[i][self.key].array(
entry_start=start, entry_stop=stop, library="np"
entry_start=start,
entry_stop=stop,
library="np",
ak_add_doc=self.interp_options["ak_add_doc"],
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
ak_add_doc=self.interp_options["ak_add_doc"],
**self.interp_options,

?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it's better to be explicit, to control the list of options. Then adding a new one would be a matter of searching for all instances of ak_add_doc and adding the new one next to that.

**self.interp_options passes everything in the interp_options dict through, which might be right or it might silently overshadow arguments of TTree.arrays that aren't interp_options. We shouldn't create different types of arguments with the same names, but it would just be easier to catch such mistakes with an explicit pass-through.

)


class _UprootOpenAndReadNumpy:
def __init__(self, custom_classes, allow_missing, real_options, key):
def __init__(
self, custom_classes, allow_missing, real_options, key, interp_options
):
self.custom_classes = custom_classes
self.allow_missing = allow_missing
self.real_options = real_options
self.key = key
self.interp_options = interp_options

def __call__(self, file_path_object_path):
file_path, object_path = file_path_object_path
Expand All @@ -339,7 +355,9 @@ def __call__(self, file_path_object_path):
self.allow_missing,
self.real_options,
)
return ttree[self.key].array(library="np")
return ttree[self.key].array(
library="np", ak_add_doc=self.interp_options["ak_add_doc"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As above

)


def _get_dask_array(
Expand All @@ -353,6 +371,7 @@ def _get_dask_array(
custom_classes=None,
allow_missing=False,
real_options=None,
interp_options=None,
):
ttrees = []
common_keys = None
Expand Down Expand Up @@ -471,7 +490,7 @@ def foreach(start):
chunk_args.append((0, 0, 0))

dask_dict[key] = _dask_array_from_map(
_UprootReadNumpy(ttrees, key),
_UprootReadNumpy(ttrees, key, interp_options),
chunk_args,
chunks=(tuple(chunks),),
dtype=dt,
Expand All @@ -491,6 +510,7 @@ def _get_dask_array_delay_open(
custom_classes=None,
allow_missing=False,
real_options=None,
interp_options=None,
):
ffile_path, fobject_path = files[0]
obj = uproot._util.regularize_object_path(
Expand All @@ -514,7 +534,9 @@ def _get_dask_array_delay_open(
dt, inner_shape = dt.subdtype

dask_dict[key] = _dask_array_from_map(
_UprootOpenAndReadNumpy(custom_classes, allow_missing, real_options, key),
_UprootOpenAndReadNumpy(
custom_classes, allow_missing, real_options, key, interp_options
),
files,
chunks=((numpy.nan,) * len(files),),
dtype=dt,
Expand All @@ -524,26 +546,33 @@ def _get_dask_array_delay_open(


class _UprootRead:
def __init__(self, ttrees, branches) -> None:
def __init__(self, ttrees, branches, interp_options) -> None:
self.ttrees = ttrees
self.branches = branches
self.interp_options = interp_options

def __call__(self, i_start_stop):
i, start, stop = i_start_stop
return self.ttrees[i].arrays(self.branches, entry_start=start, entry_stop=stop)
return self.ttrees[i].arrays(
self.branches,
entry_start=start,
entry_stop=stop,
ak_add_doc=self.interp_options["ak_add_doc"],
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You know where this is going ;)

)

def project_columns(self, branches):
return _UprootRead(self.ttrees, branches)
return _UprootRead(self.ttrees, branches, self.interp_options)


class _UprootOpenAndRead:
def __init__(
self, custom_classes, allow_missing, real_options, common_keys
self, custom_classes, allow_missing, real_options, common_keys, interp_options
) -> None:
self.custom_classes = custom_classes
self.allow_missing = allow_missing
self.real_options = real_options
self.common_keys = common_keys
self.interp_options = interp_options

def __call__(self, file_path_object_path):
file_path, object_path = file_path_object_path
Expand All @@ -554,11 +583,17 @@ def __call__(self, file_path_object_path):
self.allow_missing,
self.real_options,
)
return ttree.arrays(self.common_keys)
return ttree.arrays(
self.common_keys, ak_add_doc=self.interp_options["ak_add_doc"]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

One more

)

def project_columns(self, common_keys):
return _UprootOpenAndRead(
self.custom_classes, self.allow_missing, self.real_options, common_keys
self.custom_classes,
self.allow_missing,
self.real_options,
common_keys,
self.interp_options,
)


Expand Down Expand Up @@ -589,6 +624,7 @@ def _get_dak_array(
custom_classes=None,
allow_missing=False,
real_options=None,
interp_options=None,
):
dask_awkward = uproot.extras.dask_awkward()
awkward = uproot.extras.awkward()
Expand Down Expand Up @@ -697,7 +733,7 @@ def foreach(start):
if len(partition_args) == 0:
partition_args.append((0, 0, 0))
return dask_awkward.from_map(
_UprootRead(ttrees, common_keys),
_UprootRead(ttrees, common_keys, interp_options),
partition_args,
label="from-uproot",
meta=meta,
Expand All @@ -714,6 +750,7 @@ def _get_dak_array_delay_open(
custom_classes=None,
allow_missing=False,
real_options=None,
interp_options=None,
):
dask_awkward = uproot.extras.dask_awkward()
awkward = uproot.extras.awkward()
Expand All @@ -733,7 +770,9 @@ def _get_dak_array_delay_open(
meta = _get_meta_array(awkward, dask_awkward, obj, common_keys)

return dask_awkward.from_map(
_UprootOpenAndRead(custom_classes, allow_missing, real_options, common_keys),
_UprootOpenAndRead(
custom_classes, allow_missing, real_options, common_keys, interp_options
),
files,
label="from-uproot",
meta=meta,
Expand Down
Loading