Merge pull request #322 from jasondet/main

tcm structure change
legend-exp · Jul 26, 2022 · efd34cc · efd34cc
2 parents d7fb068 + 3b6cdf3
commit efd34cc
Show file tree

Hide file tree

Showing 7 changed files with 222 additions and 5 deletions.
diff --git a/src/pygama/evt/build_tcm.py b/src/pygama/evt/build_tcm.py
@@ -69,7 +69,7 @@ def build_tcm(input_tables:list, coin_col:str, hash_func:str=r'\d+',
                                       window_ref=window_ref, array_ids=array_ids)
 
     for key in tcm_cols: tcm_cols[key] = lgdo.Array(nda=tcm_cols[key])
-    tcm = lgdo.Table(col_dict=tcm_cols, attrs={ 'tables':str(all_tables), 'hash_func':str(hash_func) })
+    tcm = lgdo.Struct(obj_dict=tcm_cols, attrs={ 'tables':str(all_tables), 'hash_func':str(hash_func) })
 
     if out_file is not None:
         store.write_object(tcm, out_name, out_file, wo_mode=wo_mode)

diff --git a/src/pygama/evt/tcm.py b/src/pygama/evt/tcm.py
@@ -78,7 +78,9 @@ def generate_tcm_cols(coin_data:list, coin_window:float=0, window_ref:str='last'
         raise NotImplementedError(f'window_ref {window_ref}')
 
     # now build the outputs
-    coin_idx = tcm.coin_idx.to_numpy()
+    cumulative_length = np.where(tcm.coin_idx.diff().to_numpy() != 0)[0]
+    cumulative_length[:-1] = cumulative_length[1:]
+    cumulative_length[-1] = len(tcm.coin_idx)
     array_id = tcm.array_id.to_numpy()
     array_idx = tcm.array_idx.to_numpy() if 'array_idx' in tcm else tcm.index.to_numpy() # beautiful!
-    return { 'coin_idx':coin_idx, 'array_id':array_id, 'array_idx':array_idx }
+    return { 'cumulative_length':cumulative_length, 'array_id':array_id, 'array_idx':array_idx }
diff --git a/src/pygama/lgdo/__init__.py b/src/pygama/lgdo/__init__.py
@@ -36,5 +36,11 @@
 from pygama.lgdo.scalar import Scalar
 from pygama.lgdo.struct import Struct
 from pygama.lgdo.table import Table
-from pygama.lgdo.vectorofvectors import VectorOfVectors
+from pygama.lgdo.vectorofvectors import (
+    VectorOfVectors,
+    build_cl,
+    explode,
+    explode_arrays,
+    explode_cl,
+)
 from pygama.lgdo.waveform_table import WaveformTable
diff --git a/src/pygama/lgdo/vectorofvectors.py b/src/pygama/lgdo/vectorofvectors.py
@@ -8,6 +8,7 @@
 from typing import Any
 
 import numpy as np
+from numba import jit, njit
 
 from pygama.lgdo.array import Array
 from pygama.lgdo.lgdo_utils import get_element_type
@@ -150,3 +151,172 @@ def __str__(self) -> str:
 
     def __repr__(self) -> str:
         return str(self)
+
+
+def build_cl(sorted_array_in : Array, cumulative_length_out : np.ndarray = None) -> np.ndarray:
+    """ build a cumulative_length array from an array of sorted data
+
+    So for example if sorted_array_in contains [ 3, 3, 3, 4 ], would return
+    [ 2, 3 ]
+
+    For a sorted_array_in of indices, this is the inverse of explode_cl() below,
+    in the sense that doing build_cl(explode_cl(cumulative_length)) would
+    recover the original cumulative_length.
+
+    Parameters
+    ----------
+    sorted_array_in
+        Array of data already sorted; each N matching contiguous entries will be
+        converted into a new row of cumulative_length_out
+    cumulative_length_out
+        This is an optional pre-allocated array for the output
+        cumulative_length. It will always have length <= sorted_array_in, so
+        giving them the same length is safe if there is not a better guess.
+
+    Returns
+    -------
+    cumulative_length_out
+        The output cumulative_length. If the user provides a
+        cumulative_length_out that is too long, this return value is sliced to
+        contain only the used portion of the allocated memory
+    """
+    if len(sorted_array_in) == 0: return None
+    sorted_array_in = np.asarray(sorted_array_in)
+    if cumulative_length_out is None:
+        cumulative_length_out = np.zeros(len(sorted_array_in), dtype=np.uint64)
+    else:
+        cumulative_length_out.fill(0)
+    if len(cumulative_length_out) == 0 and len(sorted_array_in) > 0:
+        raise ValueError("cumulative_length_out too short ({len(cumulative_length_out)})")
+    return nb_build_cl(sorted_array_in, cumulative_length_out)
+
+@njit
+def nb_build_cl(sorted_array_in : np.ndarray, cumulative_length_out : np.ndarray) -> np.ndarray:
+    """ numbified inner loop for build_cl """
+    ii = 0
+    last_val = sorted_array_in[0]
+    for val in sorted_array_in:
+        if val != last_val:
+            ii += 1
+            cumulative_length_out[ii] = cumulative_length_out[ii-1]
+            if ii >= len(cumulative_length_out):
+                raise RuntimeError("cumulative_length_out too short")
+            last_val = val
+        cumulative_length_out[ii] += 1
+    ii += 1
+    return cumulative_length_out[:ii]
+
+
+def explode_cl(cumulative_length : Array, array_out : np.ndarray = None) -> np.ndarray:
+    """ explode a cumulative_length array
+
+    So for example if cumulative_length is [ 2, 3 ], would return [ 0, 0, 0, 1]
+
+    This is the inverse of build_cl() above, in the sense that doing
+    build_cl(explode_cl(cumulative_length)) would recover the original
+    cumulative_length.
+
+    Parameters
+    ----------
+    cumulative_length
+        the cumulative_length array to be exploded
+    array_out
+        an optional pre-allocated array to hold the exploded cumulative_length.
+        The length should be equal to cumulative_length[-1]
+
+    Returns
+    -------
+    array_out
+        the exploded cumulative_length array
+    """
+    cumulative_length = np.asarray(cumulative_length)
+    out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
+    if array_out is None:
+        array_out = np.empty(int(out_len), dtype=np.uint64)
+    if len(array_out) != out_len:
+        raise ValueError(f"bad lengths: cl[-1] ({cumulative_length[-1]}) != out ({len(array_out)})")
+    return nb_explode_cl(cumulative_length, array_out)
+
+@njit
+def nb_explode_cl(cumulative_length : np.ndarray, array_out : np.ndarray) -> np.ndarray:
+    """ numbified inner loop for explode_cl"""
+    out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
+    if len(array_out) != out_len:
+        raise ValueError("bad lengths")
+    start = 0
+    for ii in range(len(cumulative_length)):
+        nn = int(cumulative_length[ii] - start)
+        for jj in range(nn):
+            array_out[int(start+jj)] = ii
+        start = cumulative_length[ii]
+    return array_out
+
+
+
+def explode(cumulative_length : Array, array_in : Array, array_out : np.ndarray = None) -> np.ndarray :
+    """ explode a data array using a cumulative_length array
+
+    This is identical to allocated_explode_cl, except array_in gets exploded
+    instead of cumulative_length. So for example, if array_in = [ 3, 4 ] and
+    cumulative_length = [ 2, 3 ], array_out would be [ 3, 3, 3, 4 ]
+
+    Parameters
+    ----------
+    cumulative_length
+        the cumulative_length array to use for exploding
+    array_in
+        the data to be exploded. Must have same length as cumulative_length
+    array_out
+        a pre-allocated array to hold the exploded data. The length should be
+        equal to cumulative_length[-1]
+    """
+    cumulative_length = np.asarray(cumulative_length)
+    array_in = np.asarray(array_in)
+    out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
+    if array_out is None:
+        array_out = np.empty(out_len, dtype=array_in.dtype)
+    if len(cumulative_length) != len(array_in) or len(array_out) != out_len:
+        raise ValueError(f"bad lengths: cl ({len(cumulative_length)}) != in ({len(array_in)}) and cl[-1] ({cumulative_length[-1]}) != out ({len(array_out)})")
+    return nb_explode(cumulative_length, array_in, array_out)
+
+@njit
+def nb_explode(cumulative_length : np.ndarray, array_in : np.ndarray, array_out : np.ndarray) -> np.ndarray :
+    """ numbified inner loop for explode"""
+    out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
+    if len(cumulative_length) != len(array_in) or len(array_out) != out_len:
+        raise ValueError("bad lengths")
+    ii = 0
+    for jj in range(len(array_out)):
+        while ii < len(cumulative_length) and jj >= cumulative_length[ii]:
+            ii += 1
+        array_out[jj] = array_in[ii]
+    return array_out
+
+
+def explode_arrays(cumulative_length : Array, arrays : list, out_arrays : list = None) -> list:
+    """ explode a set of arrays using a cumulative_length array
+
+    Parameters
+    ----------
+    cumulative_length
+        the cumulative_length array to use for exploding
+    arrays
+        the data arrays to be exploded. Each array must have same length as
+        cumulative_length
+    out_arrays
+        an optional list of pre-allocated arrays to hold the exploded data. The
+        length of the list should be equal to the number of "arrays", and each
+        entry in array_out should have length cumulative_length[-1]. If not
+        provided, output arrays are allocated for the user.
+    """
+    cumulative_length = np.asarray(cumulative_length)
+    for ii in range(len(arrays)):
+        arrays[ii] = np.asarray(arrays[ii])
+    out_len = cumulative_length[-1] if len(cumulative_length) > 0 else 0
+    if out_arrays is None:
+        out_arrays = []
+        for array in arrays:
+            out_arrays.append(np.empty(out_len, dtype=array.dtype))
+    for ii in range(len(arrays)):
+        explode(cumulative_length, arrays[ii], out_arrays[ii])
+    return out_arrays
diff --git a/src/pygama/math/histogram.py b/src/pygama/math/histogram.py
@@ -439,7 +439,7 @@ def plot_hist(hist, bins, var=None, show_stats=False, stats_hloc=0.75, stats_vlo
         dmean = stddev/np.sqrt(N)
 
         mean, dmean = pgu.get_formatted_stats(mean, dmean, 2)
-        stats = fr'$\mu={mean} \pm {dmean}$\n$\sigma={stddev:#.3g}$'
+        stats = f'$\\mu={mean} \\pm {dmean}$\n$\\sigma={stddev:#.3g}$'
         stats_fontsize = rcParams['legend.fontsize']
         plt.text(stats_hloc, stats_vloc, stats, transform=plt.gca().transAxes, fontsize = stats_fontsize)
 

diff --git a/src/pygama/math/utils.py b/src/pygama/math/utils.py
@@ -94,6 +94,10 @@ def get_formatted_stats(mean, sigma, ndigs=2):
     convenience function for formatting mean +/- sigma to the right number of
     significant figures.
     """
+    if sigma == 0:
+        fmt = '%d' % ndigs
+        fmt = '%#.' + fmt + 'g'
+        return fmt % mean, fmt % sigma
     sig_pos = int(np.floor(np.log10(abs(sigma))))
     sig_fmt = '%d' % ndigs
     sig_fmt = '%#.' + sig_fmt + 'g'

diff --git a/tests/lgdo/test_vectorofvectors.py b/tests/lgdo/test_vectorofvectors.py
@@ -64,3 +64,38 @@ def test_iter(lgdo_vov):
     for v in lgdo_vov:
         assert (v == desired[c]).all()
         c += 1
+
+
+def test_build_cl_and_explodes():
+    cl = np.array([3, 4], dtype=np.uint64)
+    exp = np.array([0, 0, 0, 1], dtype=np.uint64)
+    array = np.array([5, 7], dtype=np.uint64)
+    array_exp = np.array([5, 5, 5, 7], dtype=np.uint64)
+    # build_cl
+    assert (lgdo.build_cl(exp, cl)       == cl).all()
+    assert (lgdo.build_cl(exp)           == cl).all()
+    assert (lgdo.build_cl([0, 0, 0, 1])  == cl).all()
+    assert (lgdo.build_cl(array_exp, cl) == cl).all()
+    assert (lgdo.build_cl(array_exp)     == cl).all()
+    assert (lgdo.build_cl([5, 5, 5, 7])  == cl).all()
+    # explode_cl
+    assert (lgdo.explode_cl(cl, exp) == exp).all()
+    assert (lgdo.explode_cl(cl)      == exp).all()
+    assert (lgdo.explode_cl([3,4])   == exp).all()
+    # inverse functionality
+    assert (lgdo.build_cl(lgdo.explode_cl(cl)) == cl).all()
+    assert (lgdo.explode_cl(lgdo.build_cl(array_exp)) == exp).all()
+    # explode
+    assert (lgdo.explode(cl, array, array_exp) == array_exp).all()
+    assert (lgdo.explode(cl, array)            == array_exp).all()
+    assert (lgdo.explode([3, 4], [5, 7])       == array_exp).all()
+    assert (lgdo.explode(cl, range(len(cl)))   == exp).all()
+    # explode_arrays
+    out_arrays = lgdo.explode_arrays(cl, [array, range(len(cl))])
+    assert len(out_arrays) == 2
+    assert (out_arrays[0] == array_exp).all()
+    assert (out_arrays[1] == exp).all()
+    out_arrays = lgdo.explode_arrays(cl, [array, range(len(cl))], out_arrays=out_arrays)
+    assert len(out_arrays) == 2
+    assert (out_arrays[0] == array_exp).all()
+    assert (out_arrays[1] == exp).all()