Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

framework for a python buddy check equivalent of TITAN #375

Merged
merged 6 commits into from
Sep 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 122 additions & 0 deletions metobs_toolkit/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
step_check,
window_variation_check,
invalid_input_check,
toolkit_buddy_check,
titan_buddy_check,
titan_sct_resistant_check
)
Expand Down Expand Up @@ -1553,6 +1554,127 @@ def apply_quality_control(self, obstype="temp",
self._qc_checked_obstypes = list(set(self._qc_checked_obstypes))
self.outliersdf = self.outliersdf.sort_index()


def apply_buddy_check(self, obstype='temp', use_constant_altitude=False,
                      haversine_approx=True, metric_epsg='31370'):
    """Apply the buddy check on the observations.

    The buddy check compares an observation against its neighbours (i.e.
    buddies). The check looks for buddies in a neighbourhood specified by
    a certain radius. The buddy check flags observations if the
    (absolute value of the) difference between the observations and the
    average of the neighbours normalized by the standard deviation in the
    circle is greater than a predefined threshold.

    This check is based on the buddy check from titanlib. Documentation on
    the titanlib buddy check can be found
    `here <https://github.com/metno/titanlib/wiki/Buddy-check>`_.

    The observation and outliers attributes will be updated accordingly.

    Parameters
    ----------
    obstype : String, optional
        Name of the observationtype you want to apply the checks on. The
        default is 'temp'.
    use_constant_altitude : bool, optional
        Use a constant altitude for all stations. The 'altitude' column of
        the metadf is only replaced for the duration of the check; the
        original column (or its absence) is restored afterwards. The
        default is False.
    haversine_approx : bool, optional
        Use the haversine approximation (earth is a sphere) to calculate
        distances between stations. The default is True.
    metric_epsg : str, optional
        EPSG code for the metric CRS to calculate distances in. Only used
        when the haversine approximation is set to False. This gives a
        better distance approximation, but is not globally applicable.
        The default is '31370' (which is suitable for Belgium).

    Returns
    -------
    None.

    """
    logger.info("Applying the toolkit buddy check")

    checkname = 'buddy_check'

    # 1. coordinates are available?
    if (self.metadf['lat'].isnull().any()
            or self.metadf['lon'].isnull().any()):
        logger.warning(f'Not all coordinates are available, the {checkname} cannot be executed!')
        return

    # 2. altitude available? (only relevant when the real altitude is used)
    altitude_present = 'altitude' in self.metadf.columns
    if not use_constant_altitude:
        if (not altitude_present) or self.metadf['altitude'].isnull().any():
            logger.warning(f'The altitude is not known for all stations. The {checkname} cannot be executed!')
            logger.info('(To resolve this error you can: \n *Use the Dataset.get_altitude() method \n *Set use_constant_altitude to True \n *Update the "altitude" column in the metadf attribute of your Dataset.)')
            return

    # Set a constant altitude if requested. When a real altitude column is
    # present, keep a copy so it can be put back once the check is done.
    altitude_backup = None
    if use_constant_altitude:
        if altitude_present:
            altitude_backup = self.metadf['altitude'].copy()
        self.metadf['altitude'] = 2.  # absolute value does not matter

    try:
        if _can_qc_be_applied(self, obstype, checkname):
            buddy_set = self.settings.qc['qc_check_settings'][checkname][obstype]
            outl_flag = self.settings.qc['qc_checks_info'][checkname]['outlier_flag']
            obsdf, outliersdf = toolkit_buddy_check(
                obsdf=self.df,
                metadf=self.metadf,
                obstype=obstype,
                buddy_radius=buddy_set['radius'],
                min_sample_size=buddy_set['num_min'],
                max_alt_diff=buddy_set['max_elev_diff'],
                min_std=buddy_set['min_std'],
                std_threshold=buddy_set['threshold'],
                metric_epsg=metric_epsg,
                lapserate=buddy_set['elev_gradient'],
                outl_flag=outl_flag,
                haversine_approx=haversine_approx,
            )

            # update the dataset and outliers
            self.df = obsdf
            if not outliersdf.empty:
                self.outliersdf = pd.concat([self.outliersdf, outliersdf])

            # add this check to the applied checks
            self._applied_qc = pd.concat(
                [
                    self._applied_qc,
                    conv_applied_qc_to_df(
                        obstypes=obstype, ordered_checknames=checkname
                    ),
                ],
                ignore_index=True,
            )
        else:
            logger.warning(f'The {checkname} can NOT be applied on {obstype} because it was already applied on this observation type!')
    finally:
        # Revert the artificial altitude, also when the check itself fails,
        # so the metadata of the Dataset is never left in a modified state.
        if use_constant_altitude:
            if altitude_present:
                # altitude was overwritten, thus revert it
                self.metadf['altitude'] = altitude_backup
            else:
                # no altitude was available a priori, remove the fake
                # constant altitude column
                self.metadf = self.metadf.drop(columns=['altitude'])


def apply_titan_buddy_check(self, obstype='temp', use_constant_altitude=False):
"""Apply the TITAN buddy check on the observations.

Expand Down
80 changes: 79 additions & 1 deletion metobs_toolkit/dataset_settings_updater.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,13 @@ def update_qc_settings(self, obstype='temp',
win_var_time_win_to_check=None,
win_var_min_num_obs=None,
step_max_increase_per_sec=None,
step_max_decrease_per_sec=None):
step_max_decrease_per_sec=None,
buddy_radius=None,
buddy_min_sample_size=None,
buddy_max_elev_diff=None,
buddy_min_std=None,
buddy_threshold=None,
buddy_elev_gradient=None):
"""Update the QC settings for the specified observation type.

If an argument value is None, the default settings will not be updated.
Expand Down Expand Up @@ -228,6 +234,23 @@ def update_qc_settings(self, obstype='temp',
Maximal increase per second for step check. The default is None.
step_max_decrease_per_sec : numeric (< 0), optional
Maximal decrease per second for step check. The default is None.
buddy_radius : numeric (> 0), optional
The radius to define neighbours in meters. The default is None.
buddy_min_sample_size : int (>= 2), optional
The minimum sample size to calculate statistics on. The default is
None.
buddy_max_elev_diff : numeric (> 0), optional
The maximum altitude difference allowed for buddies. The default is
None.
buddy_min_std : numeric (> 0), optional
The minimum standard deviation for sample statistics. This should
represent the accuracy of the observations. The default is None.
buddy_threshold : numeric (> 0), optional
The threshold (std units) for flagging observations as buddy
outliers. The default is None.
buddy_elev_gradient : numeric, optional
Describes how the obstype changes with altitude (in meters), for
example -0.0065 for temperature. The default is None.

Returns
-------
Expand Down Expand Up @@ -371,6 +394,61 @@ def _updater(dictionary, obstype, argname, value):

logger.info(f'Maximal decrease per second for step check updated: {updatestr}')

# Buddy check
# NOTE: every buddy_* argument is only applied when it is not None, so the
# arguments must not be reassigned before these tests.
if buddy_radius is not None:
    self.settings.qc['qc_check_settings']["buddy_check"], updatestr = _updater(
        self.settings.qc['qc_check_settings']["buddy_check"],
        obstype=obstype,
        argname="radius",
        value=abs(float(buddy_radius)))
    logger.info(f'Buddy radius for buddy check updated: {updatestr}')

if buddy_min_sample_size is not None:
    value = abs(int(buddy_min_sample_size))
    # statistics (mean/std) on the buddies need at least two samples
    if value >= 2:
        self.settings.qc['qc_check_settings']["buddy_check"], updatestr = _updater(
            self.settings.qc['qc_check_settings']["buddy_check"],
            obstype=obstype,
            argname="num_min",
            value=value)
        logger.info(f'Minimum number of buddies for buddy check updated: {updatestr}')
    else:
        logger.warning(f'Minimum number of buddies must be >= 2, but {value} is given. Not updated.')

if buddy_max_elev_diff is not None:
    self.settings.qc['qc_check_settings']["buddy_check"], updatestr = _updater(
        self.settings.qc['qc_check_settings']["buddy_check"],
        obstype=obstype,
        argname="max_elev_diff",
        value=abs(float(buddy_max_elev_diff)))
    logger.info(f'Max elevation differences for buddy check updated: {updatestr}')

if buddy_min_std is not None:
    self.settings.qc['qc_check_settings']["buddy_check"], updatestr = _updater(
        self.settings.qc['qc_check_settings']["buddy_check"],
        obstype=obstype,
        argname="min_std",
        value=abs(float(buddy_min_std)))
    logger.info(f'Minimum std in sample for buddy check updated: {updatestr}')

if buddy_threshold is not None:
    self.settings.qc['qc_check_settings']["buddy_check"], updatestr = _updater(
        self.settings.qc['qc_check_settings']["buddy_check"],
        obstype=obstype,
        argname="threshold",
        value=abs(float(buddy_threshold)))
    logger.info(f'Outlier threshold (in sigma) for buddy check updated: {updatestr}')

if buddy_elev_gradient is not None:
    # The gradient is signed, so no abs() here; the value comes from
    # buddy_elev_gradient itself, not from the maximum elevation difference.
    self.settings.qc['qc_check_settings']["buddy_check"], updatestr = _updater(
        self.settings.qc['qc_check_settings']["buddy_check"],
        obstype=obstype,
        argname="elev_gradient",
        value=float(buddy_elev_gradient))
    logger.info(f'Elevation gradient for buddy check updated: {updatestr}')


def update_titan_qc_settings(self, obstype='temp',
# buddy settings
buddy_radius=None,
Expand Down
Loading