Connect flags #202

Merged
merged 13 commits on Mar 25, 2022
2 changes: 1 addition & 1 deletion .pylintrc
@@ -11,4 +11,4 @@ generated-members=nc.*, # netCDF4
[BASIC]

# Good variable names which should always be accepted, separated by a comma.
good-names=_, a, b, c, d, dt, i, j, k, op, w, x, y, z
good-names=_, a, ax, b, c, d, dt, i, j, k, op, u, uc, v, w, x, y, z
5 changes: 5 additions & 0 deletions CHANGELOG
@@ -6,6 +6,9 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

## [Unreleased]
### Added:
- [fpavogt, 2022-03-21] Add new uc_budget diagnostic plot to dvas
- [fpavogt, 2022-03-16] Enable the cropping of descent data in the "cleanup" recipe step
- [fpavogt, 2022-03-15] Propagate flags through process_chunk(), weighted_means(), and delta()
- [fpavogt, 2022-03-08] Flag descent data in basic cleanup step. Add new 'descent' flag
- [fpavogt, 2022-03-03] Enable the resampling of profiles with a new cleanup recipe step
- [fpavogt, 2022-03-01] Add prf_summary recipe (fix #177)
@@ -16,6 +19,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
- [modolol, 2021-11-03] Add `mid` in DB search options
- [fpavogt, 2021-07-02] Add `has_tag()` convenience method to `Profile` and `MultiProfile` classes
### Fixed:
- [fpavogt, 2022-03-18] Fix #163
- [fpavogt, 2022-02-25] Fix #179
- [fpavogt, 2022-02-21] Fix #192
- [fpavogt, 2021-12-02] Fix issue #181
@@ -26,6 +30,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
- [modolol, 2021-07-12] Fix issue #185
- [modolol, 2021-07-12] Fix issue #160
### Changed:
- [fpavogt, 2022-03-18] Modified & improved gdp_vs_cws plots into a dedicated recipe step
- [fpavogt, 2022-03-01] Simplified the logging by removing the specific names
- [fpavogt, 2022-03-01] Add dev option to setup.py
- [modolol, 2022-02-28] Remove mandatory tag creation in config
1 change: 1 addition & 0 deletions src/dvas/config/definitions/csvorigmeta.py
@@ -14,6 +14,7 @@
PARAMETER_PATTERN_PROP = {
r"^\w+$": {
'oneOf': [
{"type": 'null'},
{"type": 'string'},
{"type": 'number'},
{"type": 'boolean'},
4 changes: 4 additions & 0 deletions src/dvas/data/strategy/resample.py
@@ -66,6 +66,10 @@ def execute(self, prfs, freq: str = '1s', chunk_size: int = 150, n_cpus: int = 1
# Very well, let's start looping and resampling each Profile
for (prf_ind, prf) in enumerate(prfs):

# Here, do something only if I actually have data to resample
if len(prf.data) <= 1:
continue

# Let's identify the min and max integer values, rounded to the nearest second.
t_0 = min(prf.data.index.get_level_values(PRF_REF_TDT_NAME)).ceil('1s')
t_1 = max(prf.data.index.get_level_values(PRF_REF_TDT_NAME)).floor('1s')
421 changes: 248 additions & 173 deletions src/dvas/plots/gdps.py

Large diffs are not rendered by default.

47 changes: 44 additions & 3 deletions src/dvas/plots/utils.py
@@ -120,6 +120,38 @@ def set_mplstyle(style='base'):
plt.style.use(str(Path(PKG_PATH, 'plots', 'mpl_styles', PLOT_STYLES[style])))


def add_sec_axis(ax, xvals, new_xvals, offset=-0.1, which='x'):
""" Adds a secondary x-axis to the figure.

Args:
ax (matplotlib axis): the axis to which to add the new x-axis
xvals (np.ndarray): current x-axis values, spanning the full range of the plot. Must not
contain NaNs.
new_xvals (np.ndarray): new axis values. Must map onto xvals via linear interpolation, and
contain no NaNs whatsoever.
offset (float, optional): offset by which to shift the new axis. Defaults to -0.1.
which (str, optional): 'x', or 'y', the axis to duplicate.

Returns:
matplotlib axis: the newly created axis.

"""

def forward(xxx):
return np.interp(xxx, xvals, new_xvals)

def inverse(xxx):
return np.interp(xxx, new_xvals, xvals)

if which == 'x':
new_ax = ax.secondary_xaxis(offset, functions=(forward, inverse))
elif which == 'y':
new_ax = ax.secondary_yaxis(offset, functions=(forward, inverse))
else:
raise DvasError('Ouch ! which should be "x" or "y", not: %s' % (which))
return new_ax
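A minimal usage sketch for `add_sec_axis` (illustrative only: the time and altitude arrays below are made up, NaN-free, and monotonically increasing so that the linear interpolation is well behaved):

import numpy as np
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
time_s = np.arange(0., 6000., 10.)    # primary x-axis: time since launch [s] (made-up)
alt_m = 5. * time_s                   # matching altitudes [m] (made-up)
ax.plot(time_s, np.cos(time_s / 500.))
# Draw a secondary altitude scale 15% below the primary time axis
sec_ax = add_sec_axis(ax, time_s, alt_m, offset=-0.15, which='x')
sec_ax.set_xlabel('altitude [m]')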


def fix_txt(txt):
""" Corrects any string for problematic characters before it gets added to a plot. Fixes vary
depending on the chosen plotting style's value of `usetex` in `rcparams`.
@@ -136,7 +168,16 @@ def fix_txt(txt):
# First deal with the cases when a proper LaTeX is being used
if usetex:
txt = txt.replace('%', r'{\%}')
txt = txt.replace('_', r'{\_}')
txt = [item.replace('_', r'\_') if ind % 2 == 0 else item
for (ind, item) in enumerate(txt.split('$'))]
txt = '$'.join(txt)

else:
txt = txt.replace(r'\smaller', '')
txt = txt.replace(r'\bf', r'')
txt = txt.replace(r'\it', r'')
txt = txt.replace(r'\flushleft', r'')
txt = txt.replace(r'\newline', r'')

return txt
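With `usetex` enabled, the even/odd split on `$` means that underscores are escaped only outside math mode. A sketch of the expected behavior on a hypothetical label (assuming no other substitution applies to it):

fix_txt(r'gph_ref bias: $T_{\rm ref}$ [K]')
# expected -> r'gph\_ref bias: $T_{\rm ref}$ [K]' (the underscore inside $...$ is left untouched)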

@@ -258,9 +299,9 @@ def add_source(fig):
fig (matplotlib.pyplot.figure): the figure to add the text to.

"""
msg = 'Created with dvas v{}'.format(VERSION)
msg = r'\it\flushleft Created with\newline dvas v{}'.format(VERSION)

fig.text(0.01, 0.02, msg, fontsize='xx-small',
fig.text(0.01, 0.02, fix_txt(msg), fontsize='xx-small',
horizontalalignment='left', verticalalignment='bottom')


7 changes: 1 addition & 6 deletions src/dvas/tools/gdps/gdps.py
@@ -189,11 +189,6 @@ def combine(gdp_prfs, binning=1, method='weighted mean', mask_flgs=None, chunk_s
# Re-assemble all the chunks into one DataFrame.
proc_chunk = pd.concat(proc_chunks, axis=0)

# Set the intiial flags of the combined profile.
# Should I actually be setting any ? E.g. to make the difference between True NaN's and
# Compatibility NaN's ? Can I even do that in here ?
proc_chunk.loc[:, 'flg'] = 0

# Almost there. Now we just need to package this into a clean MultiGDPProfile
# Let's first prepare the info dict
new_rig_tag = 'r:'+','.join([item.split(':')[1]
@@ -210,7 +205,7 @@
# It's no different from a GDP, from the perspective of the errors.
new_prf = CWSProfile(new_info, data=proc_chunk)

# And finally, package this into a MultiGDPProfile entity
# And finally, package this into a MultiCWSProfile entity
out = MultiCWSProfile()
out.update(gdp_prfs.db_variables, data=[new_prf])

77 changes: 58 additions & 19 deletions src/dvas/tools/gdps/utils.py
@@ -21,7 +21,7 @@
from ...errors import DvasError
from ...hardcoded import PRF_REF_TDT_NAME, PRF_REF_ALT_NAME, PRF_REF_VAL_NAME, PRF_REF_FLG_NAME
from ...hardcoded import PRF_REF_UCR_NAME, PRF_REF_UCS_NAME, PRF_REF_UCT_NAME, PRF_REF_UCU_NAME
from ..tools import fancy_nansum
from ..tools import fancy_nansum, fancy_bitwise_or
from .correlations import coeffs

# Setup local logger
@@ -73,7 +73,7 @@ def weighted_mean(df_chunk, binning=1):

# Let's make sure their ID is what I expect them to be.
if not np.array_equal(df_chunk.columns.unique(level=0), range(n_prf)):
raise DvasError('Ouch ! Profile values must be grouped usign MultiIndex with ids 0,1, ...')
raise DvasError('Ouch ! Profile values must be grouped using MultiIndex with ids 0,1, ...')

# Force the weights to be NaNs if the data is a NaN. Else, the normalization will be off.
mask = df_chunk.xs(PRF_REF_VAL_NAME, level=1, axis=1).isna()
@@ -160,6 +160,19 @@ def weighted_mean(df_chunk, binning=1):
mask=(np.isnan(wtot_mat) | np.isnan(w_mat)),
fill_value=np.nan)

# Before we end, let us compute the flags. We apply a general bitwise OR to them, such that
# they do not cancel each other or disappear: they get propagated all the way.
# First, we assemble them at high resolution. Note here that the flag for any weight that is
# NaN or 0 is set to 0 (but not pd.NA). This is on purpose, to show that the resulting point
# is not inheriting "no" flags.
flgs = pd.DataFrame(fancy_bitwise_or(
df_chunk.loc[:, (slice(None), 'flg')].mask((w_ps.isna() | (w_ps == 0)).values), axis=1))

# Then, only if warranted, apply the binning too
if binning > 1:
flgs = flgs.groupby(flgs.index//binning).aggregate(fancy_bitwise_or)
chunk_out[PRF_REF_FLG_NAME] = flgs

return chunk_out, jac_mat
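The `flgs.index // binning` trick above maps consecutive rows onto fixed-size bins; a toy sketch of that grouping alone, with made-up flag values and a plain bitwise OR as stand-in aggregator:

import numpy as np
import pandas as pd

flgs = pd.Series([1, 2, 4, 0, 8, 8])
binning = 2
flgs.groupby(flgs.index // binning).aggregate(lambda grp: np.bitwise_or.reduce(grp.to_numpy()))
# -> bin 0: 1|2 = 3, bin 1: 4|0 = 4, bin 2: 8|8 = 8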


@@ -227,8 +240,8 @@ def delta(df_chunk, binning=1):
# Keep track of how many valid rows I am summing in each bin.
# It would seem logical to store these as int. However, I will soon divide by those
# numbers, and will need NaN's to avoid RunTime Warnings. 'int' does not support these,
# and 'Int64' cannot be used to divide TiemDeltas ... so float it is ... sigh.
# WARNING: here, i look at the number of valid rows **for that specific column**.
# and 'Int64' cannot be used to divide TimeDeltas ... so float it is ... sigh.
# WARNING: here, I look at the number of valid rows **for that specific column**.
# This implies that, potentially, the delta action may combine different rows for
# different columns.
# TODO: fix this ... by using flags to select valid rows ?
@@ -284,6 +297,19 @@ def delta(df_chunk, binning=1):
# Adjust the mask accordingly.
jac_mat[valid_rows == 0, :] = np.ma.masked

# Before we end, let us compute the flags. We apply a general bitwise OR to them, such that
# they do not cancel each other or disappear: they get propagated all the way.
# First, we assemble them at high resolution. Note that we here mask any flags that belong to
# a NaN value, because these are not actually used in the delta.
flgs = pd.DataFrame(fancy_bitwise_or(
df_chunk.loc[:, (slice(None), 'flg')].mask(df_chunk.loc[:, (slice(None),
'val')].isna().values), axis=1))

# Then, only if warranted, apply the binning too
if binning > 1:
flgs = flgs.groupby(flgs.index//binning).aggregate(fancy_bitwise_or)
chunk_out[PRF_REF_FLG_NAME] = flgs

return chunk_out, jac_mat


@@ -337,25 +363,38 @@ def process_chunk(df_chunk, binning=1, method='weighted mean'):
n_prf = len(df_chunk.columns.unique(level=0))

# Data consistency check & cleanup: if val is NaN, all the errors should be NaNs
# (and vice-versa).
# (and vice-versa). Also, if val is NaN, then we should ignore its flags, because as a NaN
# it will not be taken into account (but this should not trigger a warning).
for prf_ind in range(n_prf):
if all(df_chunk.loc[:, (prf_ind, 'uc_tot')].isna() ==
df_chunk.loc[:, (prf_ind, 'val')].isna()):
continue

# If I reach this point, then the data is not making sense. Warn the user and clean it up.
logger.warning("GDP Profile %i: NaN mismatch for 'val' and 'uc_tot'", prf_ind)
df_chunk.loc[df_chunk.loc[:, (prf_ind, 'uc_tot')].isna().values, (prf_ind, 'val')] = np.nan
for col in [PRF_REF_UCR_NAME, PRF_REF_UCS_NAME, PRF_REF_UCT_NAME, PRF_REF_UCU_NAME,
'uc_tot']:
df_chunk.loc[df_chunk.loc[:, (prf_ind, 'val')].isna().values,
(prf_ind, col)] = np.nan
if not all(df_chunk.loc[:, (prf_ind, 'uc_tot')].isna() ==
df_chunk.loc[:, (prf_ind, PRF_REF_VAL_NAME)].isna()):
# If I reach this point, then the data is not making sense.
# Warn the user and clean it up.
logger.warning("GDP Profile %i: NaN mismatch for 'val' and 'uc_tot'", prf_ind)
df_chunk.loc[df_chunk.loc[:, (prf_ind, 'uc_tot')].isna().values,
(prf_ind, 'val')] = np.nan
for col in [PRF_REF_UCR_NAME, PRF_REF_UCS_NAME, PRF_REF_UCT_NAME, PRF_REF_UCU_NAME,
'uc_tot']:
df_chunk.loc[df_chunk.loc[:, (prf_ind, 'val')].isna().values,
(prf_ind, col)] = np.nan

if not all(df_chunk.loc[:, (prf_ind, PRF_REF_FLG_NAME)].isna() ==
df_chunk.loc[:, (prf_ind, PRF_REF_VAL_NAME)].isna()):
logger.debug("GDP profile %i: hiding some flags for the NaN data values.")
df_chunk.loc[df_chunk.loc[:, (prf_ind, PRF_REF_VAL_NAME)].isna().values,
(prf_ind, PRF_REF_FLG_NAME)] = np.nan

# Compute the weights for each point, if applicable
# First I need to add the new columns (one for each Profile).
df_chunk = df_chunk.stack(level=0)
df_chunk.loc[:, 'w_ps'] = np.nan
df_chunk = df_chunk.unstack().swaplevel(0, 1, axis=1).sort_index(axis=1)
# ----- What follows is problematic because .stack loses the dtype of the flg, which
# wreaks havoc down the line ---
# df_chunk = df_chunk.stack(level=0)
# df_chunk.loc[:, 'w_ps'] = np.nan
# df_chunk = df_chunk.unstack().swaplevel(0, 1, axis=1).sort_index(axis=1)
# ------------------------------
# Crappier (?) alternative, but one that doesn't mess up the column dtypes
for ind in range(n_prf):
df_chunk.loc[:, (ind, 'w_ps')] = np.nan

# Then I can fill it with the appropriate values
if method == 'weighted mean':
39 changes: 39 additions & 0 deletions src/dvas/tools/tools.py
@@ -14,6 +14,7 @@
# Import from Python packages
import logging
import numpy as np
import numpy.ma as ma
import pandas as pd

# Import from this package
@@ -89,6 +90,44 @@ def fancy_nansum(vals, axis=None):
return vals.sum(axis=axis, skipna=True).mask(vals.isna().all(axis=axis))
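For reference, a quick sketch of how `fancy_nansum` behaves on a toy frame (made-up values): NaNs are skipped in the sum, but an all-NaN row stays NaN instead of collapsing to 0.

import numpy as np
import pandas as pd

df = pd.DataFrame([[1., np.nan], [np.nan, np.nan]])
fancy_nansum(df, axis=1)
# -> 0    1.0
#    1    NaN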


@log_func_call(logger)
def fancy_bitwise_or(vals, axis=None):
""" A custom bitwise_or routine that ignores NaN unless only NaNs are provided, in which case
a NaN is returned.

Args:
vals (pandas.DataFrame): the data to combine.
axis (int, optional): on which axis to run the bitwise OR. Defaults to None
(=combine everything).

Returns:
pd.NA|int|pd.array: the result as a scalar if axis=None, and a pandas array with dtype Int64
if not.
"""

# Let's make sure I have been given integers, in order to run bitwise operations
if not all([pd.api.types.is_integer_dtype(item) for item in vals.dtypes]):
raise DvasError('Ouch ! I need ints to perform a bitwise OR, but I got:', vals.dtypes)

# Easy if all is nan and axis is None
if axis is None and vals.isna().all(axis=axis):
return pd.NA # Return the 'Int64' NaN value

# Let's compute a bitwise_or along the proper axis, not forgetting to mask any NA to not blow
# things up.
out = np.bitwise_or.reduce(vals.mask(vals.isna(), 0), axis=axis)

# If the axis is None, I'll be returning a scalar, so I do not need to do anything else
if axis is None:
return out

# If the axis is not None, then I need to mask the rows or columns that are full of NA
# Note here that out is a numpy array with dtype "object". We shall convert this to a pandas
# array with dtype='Int64'
out[vals.isna().all(axis=axis)] = pd.NA
return pd.array(out).astype('Int64')
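A small usage sketch of `fancy_bitwise_or` on a nullable-integer frame (illustrative values): NAs are ignored in the OR, unless a whole row is NA.

import pandas as pd

flgs = pd.DataFrame({'a': pd.array([1, 4, pd.NA], dtype='Int64'),
                     'b': pd.array([2, pd.NA, pd.NA], dtype='Int64')})
fancy_bitwise_or(flgs, axis=1)
# -> <IntegerArray> [3, 4, <NA>]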


@log_func_call(logger)
def df_to_chunks(df, chunk_size):
""" A utility function that breaks a Pandas dataframe into chunks of a specified size.
2 changes: 1 addition & 1 deletion src/dvas_demo/config/orig_data/RS41_orig_data_config.yml
@@ -70,7 +70,7 @@ _RS41:
csv_skiprows: 'lambda x: ((x<=45) or (x==47))'
csv_skipfooter: 0
csv_encoding: 'latin_1'
csv_na_values: '/'
csv_na_values: '//////'
csv_header: 0

_time:
15 changes: 9 additions & 6 deletions src/dvas_demo/demo_script.py
@@ -137,10 +137,10 @@
prf_df.columns.to_list()))
rs_prf_df = rs_prfs[0].data
print('\nRS profile dataframe:\n index.names={}, columns={}'.format(rs_prf_df.index.names,
rs_prf_df.columns.to_list()))
rs_prf_df.columns.to_list()))
gdp_prf_df = gdp_prfs[0].data
print('\nGDP profile dataframe:\n index.names={}, columns={}'.format(gdp_prf_df.index.names,
gdp_prf_df.columns.to_list()))
gdp_prf_df.columns.to_list()))

# MultiProfiles has a var_info property to link the DataFrame columns to the actual variable
print("\n Content of prfs.var_info['val']:\n")
@@ -208,12 +208,12 @@
# Synchronizing profiles is a 2-step process. First, the shifts must be identified.
# dvas contains several routines to do that under dvas.tools.sync
# For example, the most basic one is to compare the altitude arrays
gdp_prfs_1s.sort() # <- This helps keep the order of Profiles consistent between runs.
gdp_prfs_1s.sort() # <- This helps keep the order of Profiles consistent between runs.
sync_shifts = dts.get_sync_shifts_from_alt(gdp_prfs_1s)

# A fancier option is to look at the profile values, and minimize the mean of their absolute
# difference
#sync_shifts = dts.get_sync_shifts_from_val(gdp_prfs, max_shift=50, first_guess=sync_shifts)
# sync_shifts = dts.get_sync_shifts_from_val(gdp_prfs, max_shift=50, first_guess=sync_shifts)

# Given these shifts, let's compute the new length of the synchronized Profiles.
# Do it such that no data is actually cropped out, i.e. add NaN/NaT wherever needed.
@@ -250,7 +250,7 @@
# binning values "m".
start_time = datetime.now()
incompat = dtgs.gdp_incompatibilities(gdp_prfs, alpha=0.0027, m_vals=[1, 6],
do_plot=True, n_cpus=4)
do_plot=True, n_cpus=4)
print('GDP mismatch derived in: {}s'.format((datetime.now()-start_time).total_seconds()))

# Next, we derive "validities" given a specific strategy to assess the different GDP pair
@@ -271,7 +271,10 @@
print('CWS assembled in: {}s'.format((datetime.now()-start_time).total_seconds()))

# We can now inspect the result visually
dpg.gdps_vs_cws(gdp_prfs, cws, index_name='_idx', show=False, fn_prefix='03')
# First, by looking at the GDP vs CWS profiles
dpg.gdps_vs_cws(gdp_prfs, cws, show=True, fn_prefix='03')
# And then also by diving into the uncertainty budget
dpg.uc_budget(gdp_prfs, cws, show=True, fn_prefix='03')

# Save the CWS to the database.
# One should note here that we only save the columns of the CWS DataFrame, and not the 'alt' and
6 changes: 5 additions & 1 deletion src/dvas_recipes/recipe.py
@@ -301,7 +301,11 @@ def get_all_flights_from_db():
flights = np.array([[item[1]['eid'], item[1]['rid']] for item in global_view.iterrows()])

# Drop the duplicates
flights = np.atleast_2d(np.unique(flights))
flights = np.unique(flights, axis=0)

# Make some quick tests to make sure I did not mess anything up
assert np.ndim(flights) == 2
assert np.shape(flights)[1] == 2
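For context, `np.unique` without `axis` flattens the array and would destroy the (eid, rid) pairs, whereas `axis=0` drops duplicate rows (toy values):

import numpy as np

flights = np.array([[1, 1], [1, 2], [1, 1]])
np.unique(flights)          # -> array([1, 2]): the pairs are lost
np.unique(flights, axis=0)  # -> array([[1, 1], [1, 2]]): duplicate rows dropped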

logger.info('Found %i flight(s) in the database.', len(flights))
