From 159acbbaee7b638242ea28a1bb96014f94155b0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?= Date: Tue, 2 Jul 2024 10:27:05 +0200 Subject: [PATCH 1/4] Small fixes for the linter and drop_branches --- src/hepconvert/_utils.py | 47 +++++++++++++----------------- src/hepconvert/copy_root.py | 9 +++--- src/hepconvert/histogram_adding.py | 6 ++-- src/hepconvert/merge.py | 3 +- tests/test_copy_root.py | 45 +++++++++++++--------------- 5 files changed, 49 insertions(+), 61 deletions(-) diff --git a/src/hepconvert/_utils.py b/src/hepconvert/_utils.py index d7f7928..4f806ca 100644 --- a/src/hepconvert/_utils.py +++ b/src/hepconvert/_utils.py @@ -54,38 +54,33 @@ def filter_branches(tree, keep_branches, drop_branches, count_branches): if drop_branches and keep_branches: msg = "Can specify either drop_branches or keep_branches, not both." raise ValueError(msg) from None - - if drop_branches: - if isinstance(drop_branches, dict): # noqa: SIM102 + branches = drop_branches if drop_branches else keep_branches + keys = [] + if branches: + if isinstance(branches, dict): # noqa: SIM102 if ( - len(drop_branches) > 1 - and tree.name in drop_branches - or tree.name == next(iter(drop_branches.keys())) + len(branches) > 1 + and tree.name in branches + or tree.name == next(iter(branches.keys())) ): - drop_branches = drop_branches.get(tree.name) - if isinstance(drop_branches, str) or len(drop_branches) == 1: - drop_branches = tree.keys(filter_name=drop_branches) - return [ - b.name - for b in tree.branches - if b.name not in count_branches and b.name not in drop_branches - ] - if keep_branches: - if isinstance(keep_branches, dict): # noqa: SIM102 - if ( - len(keep_branches) > 1 - and tree.name in keep_branches - or tree.name == next(iter(keep_branches.keys())) - ): - keep_branches = keep_branches.get(tree.name) - if isinstance(keep_branches, str) or len(keep_branches) == 1: - keep_branches = tree.keys(filter_name=keep_branches) + keys = branches.get(tree.name) + if isinstance(branches, str) or len(branches) == 1: + keys = tree.keys(filter_name=branches) + else: + for i in branches: + keys = np.union1d(keys, tree.keys(filter_name=i)) + if drop_branches: return [ b.name for b in tree.branches - if b.name not in count_branches and b.name in keep_branches + if b.name not in count_branches and b.name not in keys ] - return [b.name for b in tree.branches if b.name not in count_branches] + return [ + b.name + for b in tree.branches + if b.name not in count_branches and b.name in keys + ] + return [b.name for b in tree.branches] def check_tqdm(): diff --git a/src/hepconvert/copy_root.py b/src/hepconvert/copy_root.py index e59a7e3..250f4f9 100644 --- a/src/hepconvert/copy_root.py +++ b/src/hepconvert/copy_root.py @@ -18,7 +18,6 @@ def copy_root( *, keep_branches=None, drop_branches=None, - # add_branches=None, #TODO: add functionality for this, just specify about the counter issue? keep_trees=None, drop_trees=None, cut=None, @@ -32,7 +31,7 @@ def copy_root( initial_basket_capacity=10, resize_factor=10.0, counter_name=lambda counted: "n" + counted, - step_size=100, + step_size="100 MB", compression="ZLIB", compression_level=1, ): @@ -216,12 +215,12 @@ def copy_root( ) raise ValueError(msg) - if len(trees) > 1 and progress_bar is not False: + if len(trees) > 1 and progress_bar is not False and progress_bar is not None: number_of_items = len(trees) if progress_bar is True: tqdm = _utils.check_tqdm() progress_bar = tqdm.tqdm(desc="Trees copied") - progress_bar.reset(total=number_of_items) + progress_bar.reset(total=number_of_items) for t in trees: tree = f[t] count_branches = get_counter_branches(tree) @@ -280,6 +279,6 @@ def copy_root( out_file[tree.name].extend(chunk) except AssertionError: msg = "Are the branch-names correct?" - if len(trees) > 1 and progress_bar is not False: + if len(trees) > 1 and progress_bar is not False and progress_bar is not None: progress_bar.update(n=1) f.close() diff --git a/src/hepconvert/histogram_adding.py b/src/hepconvert/histogram_adding.py index 0d57450..cea11ae 100644 --- a/src/hepconvert/histogram_adding.py +++ b/src/hepconvert/histogram_adding.py @@ -418,11 +418,11 @@ def add_histograms( if not force and not append: raise FileExistsError if force and append: - msg = "Cannot append to a new file. Either force or append can be true." + msg = "Cannot append to a new file. Either force or append can be true, not both." raise ValueError(msg) if append: out_file = uproot.update(destination) - elif force: + else: out_file = uproot.recreate( destination, compression=uproot.compression.Compression.from_code_pair( @@ -490,7 +490,7 @@ def add_histograms( msg = f"File: {input_file} does not exist or is corrupt." raise FileNotFoundError(msg) from None if same_names: - if progress_bar: + if progress_bar and hist_bar: hist_bar.reset(len(keys)) for key in keys: try: diff --git a/src/hepconvert/merge.py b/src/hepconvert/merge.py index 66a0c70..f0e7b6e 100644 --- a/src/hepconvert/merge.py +++ b/src/hepconvert/merge.py @@ -246,10 +246,9 @@ def merge_root( ) raise ValueError(msg) if progress_bar is not False: + number_of_items = len(files) if progress_bar is True: tqdm = _utils.check_tqdm() - number_of_items = len(files) - progress_bar = tqdm.tqdm(desc="Files added") progress_bar.reset(number_of_items) for t in trees: diff --git a/tests/test_copy_root.py b/tests/test_copy_root.py index 5ebf19d..623f2e0 100644 --- a/tests/test_copy_root.py +++ b/tests/test_copy_root.py @@ -62,33 +62,25 @@ def test_keep_branch(tmp_path): assert key not in file["events"].keys() -def test_add_branch(tmp_path): +def test_keep_branches(tmp_path): hepconvert.copy_root( Path(tmp_path) / "drop_branches.root", skhep_testdata.data_path("uproot-HZZ.root"), - drop_branches=["MClepton_py", "Jet_Px"], + drop_branches=["Jet_*", "MClepton_*"], counter_name=lambda counted: "N" + counted, force=True, ) - arrays = file["events"].arrays() - branch_types = { - name: array.type - for name, array in zip(ak.fields(arrays), ak.unzip(arrays)) - if not name.startswith("n") and not name.startswith("N") - } - - branches = { - file["events"]["MClepton_py"].name: file["events"]["MClepton_py"].arrays(), - file["events"]["Jet_Px"].name: file["events"]["Jet_Px"].arrays(), - } - jet_px = {file["events"]["Jet_Px"].name: file["events"]["Jet_Px"].arrays()} - hepconvert.copy_root( - Path(tmp_path) / "add_branches.root", - Path(tmp_path) / "drop_branches.root", - force=True, - ) + original = uproot.open(skhep_testdata.data_path("uproot-HZZ.root")) - file = uproot.open(Path(tmp_path) / "add_branches.root") + file = uproot.open(Path(tmp_path) / "drop_branches.root") + for key in original["events"].keys(): + if key.startswith("MClepton_"): + assert key in file["events"].keys() + assert ak.all( + file["events"][key].array() == original["events"][key].array() + ) + else: + assert key not in file["events"].keys() def test_hepdata_example(tmp_path): @@ -100,7 +92,6 @@ def test_hepdata_example(tmp_path): ) hepconvert_file = uproot.open(Path(tmp_path) / "copy_hepdata.root") file = uproot.open(skhep_testdata.data_path("uproot-hepdata-example.root")) - for key in hepconvert_file.keys(cycle=False): assert key in file.keys(cycle=False) @@ -203,7 +194,7 @@ def test_drop_tree(tmp_path): with pytest.raises( ValueError, - match="Key 'tree5' does not match any TTree in ROOT file/Users/zobil/Desktop/directory/two_trees.root", + match=f"Key 'tree5' does not match any TTree in ROOT {tmp_path}/two_trees.root", ): hepconvert.copy_root( Path(tmp_path) / "copied.root", @@ -244,7 +235,11 @@ def test_keep_tree_and_branch(tmp_path): import numpy as np with uproot.recreate(Path(tmp_path) / "two_trees.root") as file: - file["tree"] = {"x": np.array([1, 2, 3, 4, 5]), "y": np.array([4, 5, 6, 7, 8])} + file["tree"] = { + "x": np.array([1, 2, 3, 4, 5]), + "y": np.array([4, 5, 6, 7, 8]), + "z": np.array([4, 5, 6, 7, 8]), + } file["tree1"] = { "x": np.array([1, 2, 3, 4, 5]), "y": np.array([4, 5, 6, 7, 8]), @@ -254,14 +249,14 @@ def test_keep_tree_and_branch(tmp_path): hepconvert.copy_root( Path(tmp_path) / "copied.root", Path(tmp_path) / "two_trees.root", - keep_branches={"tree": "x", "tree1": "y"}, + keep_branches={"tree": ["x", "z"], "tree1": "y"}, force=True, ) with uproot.open(Path(tmp_path) / "copied.root") as copy: assert copy.keys(cycle=False) == ["tree", "tree1"] assert copy["tree1"].keys() == ["y"] - assert copy["tree"].keys() == ["x"] + assert copy["tree"].keys() == ["x", "z"] for tree in copy.keys(cycle=False): for key in copy[tree].keys(): assert ak.all(copy[tree][key].array() == file[tree][key].array()) From 809405d49e3cdb20bfecfcfc10fd5e32448a0fc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?= Date: Tue, 2 Jul 2024 10:51:45 +0200 Subject: [PATCH 2/4] More little fixes --- src/hepconvert/copy_root.py | 44 +++++++++++++++---------------- src/hepconvert/root_to_parquet.py | 22 ++++++++++------ tests/test_root_to_parquet.py | 24 ++++++++++++++++- 3 files changed, 59 insertions(+), 31 deletions(-) diff --git a/src/hepconvert/copy_root.py b/src/hepconvert/copy_root.py index 250f4f9..b0e1bac 100644 --- a/src/hepconvert/copy_root.py +++ b/src/hepconvert/copy_root.py @@ -13,8 +13,8 @@ def copy_root( - destination, - file, + out_file, + in_file, *, keep_branches=None, drop_branches=None, @@ -36,10 +36,10 @@ def copy_root( compression_level=1, ): """ - :param destination: Name of the output file or file path. - :type destination: path-like - :param file: Local ROOT file to copy. - :type file: str + :param out_file: Name of the output file or file path. + :type out_file: path-like + :param in_file: Local ROOT file to copy. + :type in_file: str :param keep_branches: To keep only certain branches and remove all others. To remove certain branches from all TTrees in the file, pass a list of names of branches to keep, wildcarding accepted ("Jet_*"). If removing branches from one of multiple trees, pass a dict of structure: {tree: [branch1, branch2]} to keep only branch1 and branch2 in ttree "tree". Defaults to None. Command line option: ``--keep-branches``. @@ -113,7 +113,7 @@ def copy_root( .. code-block:: bash - hepconvert copy-root [options] [OUT_FILE] [IN_FILE] + hepconvert copy-root [options] [of] [IN_FILE] """ if compression in ("ZLIB", "zlib"): @@ -127,29 +127,29 @@ def copy_root( else: msg = f"unrecognized compression algorithm: {compression}. Only ZLIB, LZMA, LZ4, and ZSTD are accepted." raise ValueError(msg) - path = Path(destination) + path = Path(out_file) if Path.is_file(path): if not force: raise FileExistsError - out_file = uproot.recreate( - destination, + of = uproot.recreate( + out_file, compression=uproot.compression.Compression.from_code_pair( compression_code, compression_level ), ) first = (True,) else: - out_file = uproot.recreate( - destination, + of = uproot.recreate( + out_file, compression=uproot.compression.Compression.from_code_pair( compression_code, compression_level ), ) first = (True,) try: - f = uproot.open(file) + f = uproot.open(in_file) except FileNotFoundError: - msg = "File: ", file, " does not exist or is corrupt." + msg = "file: ", in_file, " does not exist or is corrupt." raise FileNotFoundError(msg) from None hist_keys = f.keys( @@ -158,11 +158,11 @@ def copy_root( for key in hist_keys: # just pass to hadd?? if len(f[key].axes) == 1: - out_file[key] = _hadd_1d(destination, f, key, True) + of[key] = _hadd_1d(out_file, f, key, True) elif len(f[key].axes) == 2: - out_file[key] = _hadd_2d(destination, f, key, True) + of[key] = _hadd_2d(out_file, f, key, True) else: - out_file[key] = _hadd_3d(destination, f, key, True) + of[key] = _hadd_3d(out_file, f, key, True) trees = f.keys(filter_classname="TTree", cycle=False, recursive=False) @@ -178,7 +178,7 @@ def copy_root( "Key '" + key + "' does not match any TTree in ROOT file" - + str(file) + + str(in_file) ) raise ValueError(msg) if isinstance(keep_trees, str): @@ -196,7 +196,7 @@ def copy_root( "Key '" + key + "' does not match any TTree in ROOT file" - + str(file) + + str(in_file) ) raise ValueError(msg) trees.remove(key) @@ -211,7 +211,7 @@ def copy_root( "TTree ", key, " does not match any TTree in ROOT file", - destination, + out_file, ) raise ValueError(msg) @@ -264,7 +264,7 @@ def copy_root( } else: branch_types = {name: array.type for name, array in chunk.items()} - out_file.mktree( + of.mktree( tree.name, branch_types, title=title, @@ -276,7 +276,7 @@ def copy_root( else: try: - out_file[tree.name].extend(chunk) + of[tree.name].extend(chunk) except AssertionError: msg = "Are the branch-names correct?" if len(trees) > 1 and progress_bar is not False and progress_bar is not None: diff --git a/src/hepconvert/root_to_parquet.py b/src/hepconvert/root_to_parquet.py index 324c867..e5d20e4 100644 --- a/src/hepconvert/root_to_parquet.py +++ b/src/hepconvert/root_to_parquet.py @@ -260,19 +260,25 @@ def _filter_branches(tree, keep_branches, drop_branches): if drop_branches and keep_branches: msg = "Can specify either drop_branches or keep_branches, not both." raise ValueError(msg) from None + keys = [] + from numpy import union1d if drop_branches: if isinstance(drop_branches, str): - drop_branches = tree.keys(filter_name=drop_branches) + keys = tree.keys(filter_name=drop_branches) if isinstance(drop_branches, dict) and tree.name in drop_branches: - drop_branches = drop_branches.get(tree.name) - return lambda b: b in [ - b.name for b in tree.branches if b.name not in drop_branches - ] + keys = drop_branches.get(tree.name) + else: + for i in drop_branches: + keys = union1d(keys, tree.keys(filter_name=i)) + return lambda b: b in [b.name for b in tree.branches if b.name not in keys] if keep_branches: if isinstance(keep_branches, str): - keep_branches = tree.keys(filter_name=keep_branches) + keys = tree.keys(filter_name=keep_branches) if isinstance(keep_branches, dict) and tree.name in keep_branches: - keep_branches = keep_branches.get(tree.name) - return lambda b: b in [b.name for b in tree.branches if b.name in keep_branches] + keys = keep_branches.get(tree.name) + else: + for i in keep_branches: + keys = union1d(keys, tree.keys(filter_name=i)) + return lambda b: b in [b.name for b in tree.branches if b.name in keys] return None diff --git a/tests/test_root_to_parquet.py b/tests/test_root_to_parquet.py index 7079a82..59d20d2 100644 --- a/tests/test_root_to_parquet.py +++ b/tests/test_root_to_parquet.py @@ -102,7 +102,7 @@ def drop_branches(tmp_path): ) -def keep_branches(tmp_path): +def keep_branch(tmp_path): f = uproot.open(skhep_testdata.data_path("uproot-HZZ.root")) original = f["events"].arrays() hepconvert.root_to_parquet( @@ -122,3 +122,25 @@ def keep_branches(tmp_path): assert ( ak.metadata_from_parquet(Path(tmp_path) / "test.parquet")["num_row_groups"] == 3 ) + + +def keep_branches(tmp_path): + f = uproot.open(skhep_testdata.data_path("uproot-HZZ.root")) + original = f["events"].arrays() + hepconvert.root_to_parquet( + in_file=skhep_testdata.data_path("uproot-HZZ.root"), + out_file=Path(tmp_path) / "test.parquet", + step_size=1000, + keep_branches=["Jet_*", "Muon_*"], + force=True, + ) + from_parquet = ak.from_parquet(Path(tmp_path) / "test.parquet") + for key in f["events"].keys(): + if not key.startswith("Jet") and not key.startswith("Muon"): + with pytest.raises(ak.errors.FieldNotFoundError): + from_parquet[key] + else: + assert ak.all(from_parquet[key] == original[key]) + assert ( + ak.metadata_from_parquet(Path(tmp_path) / "test.parquet")["num_row_groups"] == 3 + ) From 91472426d211ae8da0aff2ebcaa0d93b244c5a3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?= Date: Tue, 2 Jul 2024 11:04:52 +0200 Subject: [PATCH 3/4] linter fix --- src/hepconvert/histogram_adding.py | 3 ++- tests/test_add_histograms.py | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/hepconvert/histogram_adding.py b/src/hepconvert/histogram_adding.py index cea11ae..eddcc7e 100644 --- a/src/hepconvert/histogram_adding.py +++ b/src/hepconvert/histogram_adding.py @@ -459,7 +459,8 @@ def add_histograms( if progress_bar is True: file_bar = tqdm.tqdm(desc="Files added") hist_bar = tqdm.tqdm(desc="Histograms added") - + else: + hist_bar = None file_bar.reset(number_of_items) if same_names: if union: diff --git a/tests/test_add_histograms.py b/tests/test_add_histograms.py index 139e12f..b6888ab 100644 --- a/tests/test_add_histograms.py +++ b/tests/test_add_histograms.py @@ -266,6 +266,9 @@ def simple_1dim_F(tmp_path): ).all +simple_1dim_F("tests/samples") + + def mult_2D_hists(tmp_path): h1 = ROOT.TH2F("name", "", 10, 0.0, 10.0, 8, 0.0, 8.0) data1 = [ From 2630830393ee2a582f5a553113c9f68bdbc010b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zo=C3=AB=20Bilodeau?= Date: Tue, 2 Jul 2024 11:07:41 +0200 Subject: [PATCH 4/4] linter --- src/hepconvert/root_to_parquet.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/hepconvert/root_to_parquet.py b/src/hepconvert/root_to_parquet.py index e5d20e4..a8f8f74 100644 --- a/src/hepconvert/root_to_parquet.py +++ b/src/hepconvert/root_to_parquet.py @@ -4,6 +4,7 @@ import awkward as ak import uproot +from numpy import union1d def root_to_parquet( @@ -261,8 +262,6 @@ def _filter_branches(tree, keep_branches, drop_branches): msg = "Can specify either drop_branches or keep_branches, not both." raise ValueError(msg) from None keys = [] - from numpy import union1d - if drop_branches: if isinstance(drop_branches, str): keys = tree.keys(filter_name=drop_branches)