Connect flags #202

Merged
merged 13 commits on Mar 25, 2022
2 changes: 1 addition & 1 deletion .pylintrc
@@ -11,4 +11,4 @@ generated-members=nc.*, # netCDF4
[BASIC]

# Good variable names which should always be accepted, separated by a comma.
good-names=_, a, b, c, d, dt, i, j, k, op, w, x, y, z
good-names=_, a, ax, b, c, d, dt, i, j, k, op, u, uc, v, w, x, y, z
5 changes: 5 additions & 0 deletions CHANGELOG
@@ -6,6 +6,9 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

## [Unreleased]
### Added:
- [fpavogt, 2022-03-21] Add new uc_budget diagnostic plot to dvas
- [fpavogt, 2022-03-16] Enable the cropping of descent data in the "cleanup" recipe step
- [fpavogt, 2022-03-15] Propagate flags through process_chunk(), weighted_means(), and delta()
- [fpavogt, 2022-03-08] Flag descent data in basic cleanup step. Add new 'descent' flag
- [fpavogt, 2022-03-03] Enable the resampling of profiles with a new cleanup recipe step
- [fpavogt, 2022-03-01] Add prf_summary recipe (fix #177)
@@ -16,6 +19,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
- [modolol, 2021-11-03] Add `mid` in DB search options
- [fpavogt, 2021-07-02] Add `has_tag()` convenience method to `Profile` and `MultiProfile` classes
### Fixed:
- [fpavogt, 2022-03-18] Fix #163
- [fpavogt, 2022-02-25] Fix #179
- [fpavogt, 2022-02-21] Fix #192
- [fpavogt, 2021-12-02] Fix issue #181
@@ -26,6 +30,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
- [modolol, 2021-07-12] Fix issue #185
- [modolol, 2021-07-12] Fix issue #160
### Changed:
- [fpavogt, 2022-03-18] Modified & improved gdp_vs_cws plots into a dedicated recipe step
- [fpavogt, 2022-03-01] Simplified the logging by removing the specific names
- [fpavogt, 2022-03-01] Add dev option to setup.py
- [modolol, 2022-02-28] Remove mandatory tag creation in config
1 change: 1 addition & 0 deletions src/dvas/config/definitions/csvorigmeta.py
@@ -14,6 +14,7 @@
PARAMETER_PATTERN_PROP = {
r"^\w+$": {
'oneOf': [
{"type": 'null'},
{"type": 'string'},
{"type": 'number'},
{"type": 'boolean'},
4 changes: 4 additions & 0 deletions src/dvas/data/strategy/resample.py
@@ -66,6 +66,10 @@ def execute(self, prfs, freq: str = '1s', chunk_size: int = 150, n_cpus: int = 1
# Very well, let's start looping and resampling each Profile
for (prf_ind, prf) in enumerate(prfs):

# Here, do something only if I actually have data to resample
if len(prf.data) <= 1:
continue

# Let's identify the min and max integer values, rounded to the nearest second.
t_0 = min(prf.data.index.get_level_values(PRF_REF_TDT_NAME)).ceil('1s')
t_1 = max(prf.data.index.get_level_values(PRF_REF_TDT_NAME)).floor('1s')
421 changes: 248 additions & 173 deletions src/dvas/plots/gdps.py

Large diffs are not rendered by default.

47 changes: 44 additions & 3 deletions src/dvas/plots/utils.py
@@ -120,6 +120,38 @@ def set_mplstyle(style='base'):
plt.style.use(str(Path(PKG_PATH, 'plots', 'mpl_styles', PLOT_STYLES[style])))


def add_sec_axis(ax, xvals, new_xvals, offset=-0.1, which='x'):
""" Adds a secondary x-axis to the figure.

Args:
ax (matplotlib axis): the axis to which to add the new x-axis
xvals (np.ndarray): current x-axis values, spanning the full range of the plot. Must not
contain NaNs.
new_xvals (np.ndarray): new axis values. Must map onto xvals via linear interpolation, and
contain no NaNs whatsoever.
offset (float, optional): offset by which to shift the new axis. Defaults to -0.1.
which (str, optional): 'x', or 'y', the axis to duplicate.

Returns:
matplotlib axis: the newly created axis.

"""

def forward(xxx):
return np.interp(xxx, xvals, new_xvals)

def inverse(xxx):
return np.interp(xxx, new_xvals, xvals)

if which == 'x':
new_ax = ax.secondary_xaxis(offset, functions=(forward, inverse))
elif which == 'y':
new_ax = ax.secondary_yaxis(offset, functions=(forward, inverse))
else:
raise DvasError('Ouch ! which should be "x" or "y", not: %s' % (which))
return new_ax
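A minimal usage sketch for `add_sec_axis` (illustrative only: the time and altitude arrays below are made up, NaN-free, and monotonically increasing so that the linear interpolation is well behaved):

import numpy as np
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
time_s = np.arange(0., 6000., 10.)    # primary x-axis: time since launch [s] (made-up)
alt_m = 5. * time_s                   # matching altitudes [m] (made-up)
ax.plot(time_s, np.cos(time_s / 500.))
# Draw a secondary altitude scale 15% below the primary time axis
sec_ax = add_sec_axis(ax, time_s, alt_m, offset=-0.15, which='x')
sec_ax.set_xlabel('altitude [m]')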


def fix_txt(txt):
""" Corrects any string for problematic characters before it gets added to a plot. Fixes vary
depending on the chosen plotting style's value of `usetex` in `rcparams`.
@@ -136,7 +168,16 @@ def fix_txt(txt):
# First deal with the cases when a proper LaTeX is being used
if usetex:
txt = txt.replace('%', r'{\%}')
txt = txt.replace('_', r'{\_}')
txt = [item.replace('_', r'\_') if ind % 2 == 0 else item
for (ind, item) in enumerate(txt.split('$'))]
txt = '$'.join(txt)

else:
txt = txt.replace(r'\smaller', '')
txt = txt.replace(r'\bf', r'')
txt = txt.replace(r'\it', r'')
txt = txt.replace(r'\flushleft', r'')
txt = txt.replace(r'\newline', r'')

return txt
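With `usetex` enabled, the even/odd split on `$` means that underscores are escaped only outside math mode. A sketch of the expected behavior on a hypothetical label (assuming no other substitution applies to it):

fix_txt(r'gph_ref bias: $T_{\rm ref}$ [K]')
# expected -> r'gph\_ref bias: $T_{\rm ref}$ [K]' (the underscore inside $...$ is left untouched)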

@@ -258,9 +299,9 @@ def add_source(fig):
fig (matplotlib.pyplot.figure): the figure to add the text to.

"""
msg = 'Created with dvas v{}'.format(VERSION)
msg = r'\it\flushleft Created with\newline dvas v{}'.format(VERSION)

fig.text(0.01, 0.02, msg, fontsize='xx-small',
fig.text(0.01, 0.02, fix_txt(msg), fontsize='xx-small',
horizontalalignment='left', verticalalignment='bottom')


7 changes: 1 addition & 6 deletions src/dvas/tools/gdps/gdps.py
@@ -189,11 +189,6 @@ def combine(gdp_prfs, binning=1, method='weighted mean', mask_flgs=None, chunk_s
# Re-assemble all the chunks into one DataFrame.
proc_chunk = pd.concat(proc_chunks, axis=0)

# Set the intiial flags of the combined profile.
# Should I actually be setting any ? E.g. to make the difference between True NaN's and
# Compatibility NaN's ? Can I even do that in here ?
proc_chunk.loc[:, 'flg'] = 0

# Almost there. Now we just need to package this into a clean MultiGDPProfile
# Let's first prepare the info dict
new_rig_tag = 'r:'+','.join([item.split(':')[1]
@@ -210,7 +205,7 @@
# It's no different from a GDP, from the perspective of the errors.
new_prf = CWSProfile(new_info, data=proc_chunk)

# And finally, package this into a MultiGDPProfile entity
# And finally, package this into a MultiCWSProfile entity
out = MultiCWSProfile()
out.update(gdp_prfs.db_variables, data=[new_prf])

77 changes: 58 additions & 19 deletions src/dvas/tools/gdps/utils.py
@@ -21,7 +21,7 @@
from ...errors import DvasError
from ...hardcoded import PRF_REF_TDT_NAME, PRF_REF_ALT_NAME, PRF_REF_VAL_NAME, PRF_REF_FLG_NAME
from ...hardcoded import PRF_REF_UCR_NAME, PRF_REF_UCS_NAME, PRF_REF_UCT_NAME, PRF_REF_UCU_NAME
from ..tools import fancy_nansum
from ..tools import fancy_nansum, fancy_bitwise_or
from .correlations import coeffs

# Setup local logger
@@ -73,7 +73,7 @@ def weighted_mean(df_chunk, binning=1):

# Let's make sure their ID is what I expect them to be.
if not np.array_equal(df_chunk.columns.unique(level=0), range(n_prf)):
raise DvasError('Ouch ! Profile values must be grouped usign MultiIndex with ids 0,1, ...')
raise DvasError('Ouch ! Profile values must be grouped using MultiIndex with ids 0,1, ...')

# Force the weights to be NaNs if the data is a NaN. Else, the normalization will be off.
mask = df_chunk.xs(PRF_REF_VAL_NAME, level=1, axis=1).isna()
@@ -160,6 +160,19 @@ def weighted_mean(df_chunk, binning=1):
mask=(np.isnan(wtot_mat) | np.isnan(w_mat)),
fill_value=np.nan)

# Before we end, let us compute the flags. We apply a general bitwise OR to them, such that
# they do not cancel each other or disappear: they get propagated all the way.
# First, we assemble them at high resolution. Note here that the flag for any weight that is
# NaN or 0 is set to 0 (but not pd.NA). This is on purpose, to show that the resulting point
# is not inheriting "no" flags.
flgs = pd.DataFrame(fancy_bitwise_or(
df_chunk.loc[:, (slice(None), 'flg')].mask((w_ps.isna() | (w_ps == 0)).values), axis=1))

# Then, only if warranted, apply the binning too
if binning > 1:
flgs = flgs.groupby(flgs.index//binning).aggregate(fancy_bitwise_or)
chunk_out[PRF_REF_FLG_NAME] = flgs

return chunk_out, jac_mat
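The `flgs.index // binning` trick above maps consecutive rows onto fixed-size bins; a toy sketch of that grouping alone, with made-up flag values and a plain bitwise OR as stand-in aggregator:

import numpy as np
import pandas as pd

flgs = pd.Series([1, 2, 4, 0, 8, 8])
binning = 2
flgs.groupby(flgs.index // binning).aggregate(lambda grp: np.bitwise_or.reduce(grp.to_numpy()))
# -> bin 0: 1|2 = 3, bin 1: 4|0 = 4, bin 2: 8|8 = 8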


@@ -227,8 +240,8 @@ def delta(df_chunk, binning=1):
# Keep track of how many valid rows I am summing in each bin.
# It would seem logical to store these as int. However, I will soon divide by those
# numbers, and will need NaN's to avoid RunTime Warnings. 'int' does not support these,
# and 'Int64' cannot be used to divide TiemDeltas ... so float it is ... sigh.
# WARNING: here, i look at the number of valid rows **for that specific column**.
# and 'Int64' cannot be used to divide TimeDeltas ... so float it is ... sigh.
# WARNING: here, I look at the number of valid rows **for that specific column**.
# This implies that, potentially, the delta action may combine different rows for
# different columns.
# TODO: fix this ... by using flags to select valid rows ?
@@ -284,6 +297,19 @@ def delta(df_chunk, binning=1):
# Adjust the mask accordingly.
jac_mat[valid_rows == 0, :] = np.ma.masked

# Before we end, let us compute the flags. We apply a general bitwise OR to them, such that
# they do not cancel each other or disappear: they get propagated all the way.
# First, we assemble them at high resolution. Note that we here mask any flags that belong to
# a NaN value, because these are not actually used in the delta.
flgs = pd.DataFrame(fancy_bitwise_or(
df_chunk.loc[:, (slice(None), 'flg')].mask(df_chunk.loc[:, (slice(None),
'val')].isna().values), axis=1))

# Then, only if warranted, apply the binning too
if binning > 1:
flgs = flgs.groupby(flgs.index//binning).aggregate(fancy_bitwise_or)
chunk_out[PRF_REF_FLG_NAME] = flgs

return chunk_out, jac_mat


@@ -337,25 +363,38 @@ def process_chunk(df_chunk, binning=1, method='weighted mean'):
n_prf = len(df_chunk.columns.unique(level=0))

# Data consistency check & cleanup: if val is NaN, all the errors should be NaNs
# (and vice-versa).
# (and vice-versa). Also, if val is NaN, then we should ignore its flags, because as a NaN
# it will not be taken into account (but this should not trigger a warning).
for prf_ind in range(n_prf):
if all(df_chunk.loc[:, (prf_ind, 'uc_tot')].isna() ==
df_chunk.loc[:, (prf_ind, 'val')].isna()):
continue

# If I reach this point, then the data is not making sense. Warn the user and clean it up.
logger.warning("GDP Profile %i: NaN mismatch for 'val' and 'uc_tot'", prf_ind)
df_chunk.loc[df_chunk.loc[:, (prf_ind, 'uc_tot')].isna().values, (prf_ind, 'val')] = np.nan
for col in [PRF_REF_UCR_NAME, PRF_REF_UCS_NAME, PRF_REF_UCT_NAME, PRF_REF_UCU_NAME,
'uc_tot']:
df_chunk.loc[df_chunk.loc[:, (prf_ind, 'val')].isna().values,
(prf_ind, col)] = np.nan
if not all(df_chunk.loc[:, (prf_ind, 'uc_tot')].isna() ==
df_chunk.loc[:, (prf_ind, PRF_REF_VAL_NAME)].isna()):
# If I reach this point, then the data is not making sense.
# Warn the user and clean it up.
logger.warning("GDP Profile %i: NaN mismatch for 'val' and 'uc_tot'", prf_ind)
df_chunk.loc[df_chunk.loc[:, (prf_ind, 'uc_tot')].isna().values,
(prf_ind, 'val')] = np.nan
for col in [PRF_REF_UCR_NAME, PRF_REF_UCS_NAME, PRF_REF_UCT_NAME, PRF_REF_UCU_NAME,
'uc_tot']:
df_chunk.loc[df_chunk.loc[:, (prf_ind, 'val')].isna().values,
(prf_ind, col)] = np.nan

if not all(df_chunk.loc[:, (prf_ind, PRF_REF_FLG_NAME)].isna() ==
df_chunk.loc[:, (prf_ind, PRF_REF_VAL_NAME)].isna()):
logger.debug("GDP profile %i: hiding some flags for the NaN data values.")
df_chunk.loc[df_chunk.loc[:, (prf_ind, PRF_REF_VAL_NAME)].isna().values,
(prf_ind, PRF_REF_FLG_NAME)] = np.nan

# Compute the weights for each point, if applicable
# First I need to add the new columns (one for each Profile).
df_chunk = df_chunk.stack(level=0)
df_chunk.loc[:, 'w_ps'] = np.nan
df_chunk = df_chunk.unstack().swaplevel(0, 1, axis=1).sort_index(axis=1)
# ----- What follows is problematic because .stack loses the dtype of the flg, which
# wreaks havoc down the line ---
# df_chunk = df_chunk.stack(level=0)
# df_chunk.loc[:, 'w_ps'] = np.nan
# df_chunk = df_chunk.unstack().swaplevel(0, 1, axis=1).sort_index(axis=1)
# ------------------------------
# Crappier (?) alternative, but one that doesn't mess up the column dtypes
for ind in range(n_prf):
df_chunk.loc[:, (ind, 'w_ps')] = np.nan

# Then I can fill it with the appropriate values
if method == 'weighted mean':
39 changes: 39 additions & 0 deletions src/dvas/tools/tools.py
@@ -14,6 +14,7 @@
# Import from Python packages
import logging
import numpy as np
import numpy.ma as ma
import pandas as pd

# Import from this package
@@ -89,6 +90,44 @@ def fancy_nansum(vals, axis=None):
return vals.sum(axis=axis, skipna=True).mask(vals.isna().all(axis=axis))
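For reference, a quick sketch of how `fancy_nansum` behaves on a toy frame (made-up values): NaNs are skipped in the sum, but an all-NaN row stays NaN instead of collapsing to 0.

import numpy as np
import pandas as pd

df = pd.DataFrame([[1., np.nan], [np.nan, np.nan]])
fancy_nansum(df, axis=1)
# -> 0    1.0
#    1    NaN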


@log_func_call(logger)
def fancy_bitwise_or(vals, axis=None):
""" A custom bitwise_or routine that ignores NaN unless only NaNs are provided, in which case
a NaN is returned.

Args:
vals (pandas.DataFrame): the data to combine.
axis (int, optional): on which axis to run the bitwise OR. Defaults to None
(=combine everything).

Returns:
pd.NA|int|pd.array: the result as a scalar if axis=None, and a pandas array with dtype Int64
if not.
"""

# Let's make sure I have been given integers, in order to run bitwise operations
if not all([pd.api.types.is_integer_dtype(item) for item in vals.dtypes]):
raise DvasError('Ouch ! I need ints to perform a bitwise OR, but I got:', vals.dtypes)

# Easy if all is nan and axis is None
if axis is None and vals.isna().all(axis=axis):
return pd.NA # Return the 'Int64' NaN value

# Let's compute a bitwise_or along the proper axis, not forgetting to mask any NA to not blow
# things up.
out = np.bitwise_or.reduce(vals.mask(vals.isna(), 0), axis=axis)

# If the axis is None, I'll be returning a scalar, so I do not need to do anything else
if axis is None:
return out

# If the axis is not None, then I need to mask the rows or columns that are full of NA
# Note here that out is a numpy array with dtype "object". We shall convert this to a pandas
# array with dtype='Int64'
out[vals.isna().all(axis=axis)] = pd.NA
return pd.array(out).astype('Int64')
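A small usage sketch of `fancy_bitwise_or` on a nullable-integer frame (illustrative values): NAs are ignored in the OR, unless a whole row is NA.

import pandas as pd

flgs = pd.DataFrame({'a': pd.array([1, 4, pd.NA], dtype='Int64'),
                     'b': pd.array([2, pd.NA, pd.NA], dtype='Int64')})
fancy_bitwise_or(flgs, axis=1)
# -> <IntegerArray> [3, 4, <NA>]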


@log_func_call(logger)
def df_to_chunks(df, chunk_size):
""" A utility function that breaks a Pandas dataframe into chunks of a specified size.
2 changes: 1 addition & 1 deletion src/dvas_demo/config/orig_data/RS41_orig_data_config.yml
@@ -70,7 +70,7 @@ _RS41:
csv_skiprows: 'lambda x: ((x<=45) or (x==47))'
csv_skipfooter: 0
csv_encoding: 'latin_1'
csv_na_values: '/'
csv_na_values: '//////'
csv_header: 0

_time:
15 changes: 9 additions & 6 deletions src/dvas_demo/demo_script.py
@@ -137,10 +137,10 @@
prf_df.columns.to_list()))
rs_prf_df = rs_prfs[0].data
print('\nRS profile dataframe:\n index.names={}, columns={}'.format(rs_prf_df.index.names,
rs_prf_df.columns.to_list()))
rs_prf_df.columns.to_list()))
gdp_prf_df = gdp_prfs[0].data
print('\nGDP profile dataframe:\n index.names={}, columns={}'.format(gdp_prf_df.index.names,
gdp_prf_df.columns.to_list()))
gdp_prf_df.columns.to_list()))

# MultiProfiles has a var_info property to link the DataFrame columns to the actual variable
print("\n Content of prfs.var_info['val']:\n")
@@ -208,12 +208,12 @@
# Synchronizing profiles is a 2-step process. First, the shifts must be identified.
# dvas contains several routines to do that under dvas.tools.sync
# For example, the most basic one is to compare the altitude arrays
gdp_prfs_1s.sort() # <- This helps keep the order of Profiles consistent between runs.
gdp_prfs_1s.sort() # <- This helps keep the order of Profiles consistent between runs.
sync_shifts = dts.get_sync_shifts_from_alt(gdp_prfs_1s)

# A fancier option is to look at the profile values, and minimize the mean of their absolute
# difference
#sync_shifts = dts.get_sync_shifts_from_val(gdp_prfs, max_shift=50, first_guess=sync_shifts)
# sync_shifts = dts.get_sync_shifts_from_val(gdp_prfs, max_shift=50, first_guess=sync_shifts)

# Given these shifts, let's compute the new length of the synchronized Profiles.
# Do it such that no data is actually cropped out, i.e. add NaN/NaT wherever needed.
@@ -250,7 +250,7 @@
# binning values "m".
start_time = datetime.now()
incompat = dtgs.gdp_incompatibilities(gdp_prfs, alpha=0.0027, m_vals=[1, 6],
do_plot=True, n_cpus=4)
do_plot=True, n_cpus=4)
print('GDP mismatch derived in: {}s'.format((datetime.now()-start_time).total_seconds()))

# Next, we derive "validities" given a specific strategy to assess the different GDP pair
@@ -271,7 +271,10 @@
print('CWS assembled in: {}s'.format((datetime.now()-start_time).total_seconds()))

# We can now inspect the result visually
dpg.gdps_vs_cws(gdp_prfs, cws, index_name='_idx', show=False, fn_prefix='03')
# First, by looking at the GDP vs CWS profiles
dpg.gdps_vs_cws(gdp_prfs, cws, show=True, fn_prefix='03')
# And then also by diving into the uncertainty budget
dpg.uc_budget(gdp_prfs, cws, show=True, fn_prefix='03')

# Save the CWS to the database.
# One should note here that we only save the columns of the CWS DataFrame, and not the 'alt' and
6 changes: 5 additions & 1 deletion src/dvas_recipes/recipe.py
@@ -301,7 +301,11 @@ def get_all_flights_from_db():
flights = np.array([[item[1]['eid'], item[1]['rid']] for item in global_view.iterrows()])

# Drop the duplicates
flights = np.atleast_2d(np.unique(flights))
flights = np.unique(flights, axis=0)

# Make some quick tests to make sure I did not mess anything up
assert np.ndim(flights) == 2
assert np.shape(flights)[1] == 2
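For context, `np.unique` without `axis` flattens the array and would destroy the (eid, rid) pairs, whereas `axis=0` drops duplicate rows (toy values):

import numpy as np

flights = np.array([[1, 1], [1, 2], [1, 1]])
np.unique(flights)          # -> array([1, 2]): the pairs are lost
np.unique(flights, axis=0)  # -> array([[1, 1], [1, 2]]): duplicate rows dropped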

logger.info('Found %i flight(s) in the database.', len(flights))
