From dd6276849cdf2dc14e25d7a263549acbc6868d4a Mon Sep 17 00:00:00 2001 From: Thomas Vergauwen Date: Thu, 31 Aug 2023 21:58:38 +0200 Subject: [PATCH 1/6] framework for a python buddy check equivalent of TITAN --- metobs_toolkit/dataset.py | 83 +++++++++++ metobs_toolkit/qc_checks.py | 135 ++++++++++++++++++ .../default_formats_settings.py | 1 + metobs_toolkit/settings_files/qc_settings.py | 17 +++ 4 files changed, 236 insertions(+) diff --git a/metobs_toolkit/dataset.py b/metobs_toolkit/dataset.py index d5c6db19..91a3bae7 100644 --- a/metobs_toolkit/dataset.py +++ b/metobs_toolkit/dataset.py @@ -49,6 +49,7 @@ step_check, window_variation_check, invalid_input_check, + toolkit_buddy_check, titan_buddy_check, titan_sct_resistant_check ) @@ -1553,6 +1554,88 @@ def apply_quality_control(self, obstype="temp", self._qc_checked_obstypes = list(set(self._qc_checked_obstypes)) self.outliersdf = self.outliersdf.sort_index() + + def apply_toolkit_buddy_check(self, obstype='temp', use_constant_altitude=False, + metric_epsg='31370'): + + logger.info("Applying the toolkit buddy check") + + checkname = 'toolkit_buddy_check' + + # 1. coordinates are available? + if self.metadf['lat'].isnull().any(): + logger.warning(f'Not all coordinates are available, the {checkname} cannot be executed!') + return + if self.metadf['lon'].isnull().any(): + logger.warning(f'Not all coordinates are available, the {checkname} cannot be executed!') + return + + # set constant altitude if needed: + + # if altitude is already available, save it to restore it after this check + restore_altitude = False + if (use_constant_altitude): + if ('altitulde' in self.metadf.columns): + self.metadf['altitude_backup'] = self.metadf['altitude'] + restore_altitude = True + + self.metadf['altitude'] = 2. # absolut value does not matter + + # 2. altitude available? + if ((not use_constant_altitude) & ('altitude' not in self.metadf.columns)): + logger.warning(f'The altitude is not known for all stations. The {checkname} cannot be executed!') + logger.info('(To resolve this error you can: \n *Use the Dataset.get_altitude() method \n *Set use_constant_altitude to True \n update the "altitude" column in the metadf attribute of your Dataset.') + return + if ((not use_constant_altitude) & (self.metadf['altitude'].isnull().any())): + logger.warning(f'The altitude is not known for all stations. The {checkname} cannot be executed!') + logger.info('(To resolve this error you can: \n *Use the Dataset.get_altitude() method \n *Set use_constant_altitude to True \n *Update the "altitude" column in the metadf attribute of your Dataset.)') + return + + apliable = _can_qc_be_applied(self, obstype, checkname) + if apliable: + buddy_set = self.settings.qc['qc_check_settings'][checkname][obstype] + outl_flag = self.settings.qc['qc_checks_info']['toolkit_buddy_check']['outlier_flag'] + obsdf, outliersdf = toolkit_buddy_check(obsdf=self.df, + metadf=self.metadf, + obstype=obstype, + buddy_radius=buddy_set['radius'], + min_sample_size=buddy_set['num_min'], + max_alt_diff=buddy_set['max_elev_diff'], + min_std=buddy_set['min_std'], + std_threshold=buddy_set['threshold'], + metric_epsg=metric_epsg, + lapserate=buddy_set['elev_gradient'], + outl_flag=outl_flag) + + # update the dataset and outliers + self.df = obsdf + if not outliersdf.empty: + self.outliersdf = pd.concat([self.outliersdf, outliersdf]) + + # add this check to the applied checks + self._applied_qc = pd.concat( + [ + self._applied_qc, + conv_applied_qc_to_df( + obstypes=obstype, ordered_checknames=checkname + ), + ], + ignore_index=True, + ) + + else: + logger.warning(f'The {checkname} can NOT be applied on {obstype} because it was already applied on this observation type!') + + # Revert artificial data that has been added if needed + if restore_altitude: # altitude was overwritten, thus revert it + self.metadf['altitude'] = self.metadf["altitude_backup"] + self.metadf = self.metadf.drop(columns=['altitude_backup']) + + elif (use_constant_altitude): + # when no alitude was available apriori, remove the fake constant altitude column + self.metadf = self.metadf.drop(columns=['altitude']) + + def apply_titan_buddy_check(self, obstype='temp', use_constant_altitude=False): """Apply the TITAN buddy check on the observations. diff --git a/metobs_toolkit/qc_checks.py b/metobs_toolkit/qc_checks.py index 52577021..86e4c3fb 100644 --- a/metobs_toolkit/qc_checks.py +++ b/metobs_toolkit/qc_checks.py @@ -733,6 +733,141 @@ def create_titanlib_points_dict(obsdf, metadf, obstype): return points_dict +# ============================================================================= +# Toolkit buddy check +# ============================================================================= + + + + +def _calculate_distance_matrix(metadf, metric_epsg='31370'): + metric_metadf = metadf.to_crs(epsg=metric_epsg) + return metric_metadf.geometry.apply(lambda g: metric_metadf.geometry.distance(g)) + + + +def _find_spatial_buddies(distance_df, buddy_radius): + """ Get neighbouring stations using buddy radius.""" + buddies = {} + for refstation, distances in distance_df.iterrows(): + bud_stations =distances[distances <= buddy_radius].index.to_list() + bud_stations.remove(refstation) + buddies[refstation] = bud_stations + + return buddies + + + + +# filter altitude buddies +def _filter_to_altitude_buddies(spatial_buddies, metadf, max_altitude_diff): + """Filter neighbours by maximum altitude difference.""" + alt_buddies_dict = {} + for refstation, buddylist in spatial_buddies.items(): + alt_diff = abs((metadf.loc[buddylist, 'altitude']) - metadf.loc[refstation, 'altitude']) + alt_buddies = alt_diff[alt_diff <= max_altitude_diff].index.to_list() + alt_buddies_dict[refstation] = alt_buddies + return alt_buddies_dict + + + + + +def _filter_to_samplesize(buddydict, min_sample_size): + """Filter stations that are to isolated using minimum sample size.""" + to_check_stations = {} + for refstation, buddies in buddydict.items(): + if len(buddies) < min_sample_size: + # not enough buddies + to_check_stations[refstation] = [] # remove buddies + else: + to_check_stations[refstation] = buddies + return to_check_stations + + + + + + +def toolkit_buddy_check(obsdf, metadf, obstype, buddy_radius, min_sample_size, max_alt_diff, + min_std, std_threshold, outl_flag, metric_epsg='31370', lapserate=-0.0065): + + + outliers_idx = init_multiindex() + + # Get spatial buddies for each station + distance_df = _calculate_distance_matrix(metadf=metadf, + metric_epsg=metric_epsg) + buddies = _find_spatial_buddies(distance_df=distance_df, + buddy_radius=buddy_radius) + + # Filter by altitude difference + buddies = _filter_to_altitude_buddies(spatial_buddies=buddies, + metadf=metadf, + max_altitude_diff=max_alt_diff) + + # Filter by samplesize + buddydict =_filter_to_samplesize(buddydict=buddies, + min_sample_size=min_sample_size) + + # Apply buddy check station per station + for refstation, buddies in buddydict.items(): + if len(buddies) == 0: + logger.debug(f'{refstation} has not enough suitable buddies.') + continue + + # Get observations + buddies_obs = obsdf[obsdf.index.get_level_values('name').isin(buddies)][obstype] + # Unstack + buddies_obs = buddies_obs.unstack(level='name') + # Make lapsrate correction: + buddy_correction = (metadf.loc[buddies, 'altitude'] * lapserate).to_dict() + for bud in buddies_obs.columns: + buddies_obs[bud] = buddies_obs[bud] - buddy_correction[bud] + + # calucalate std and mean row wise + buddies_obs['mean'] = buddies_obs.mean(axis=1) + buddies_obs['std'] = buddies_obs.std(axis=1) + buddies_obs['samplesize'] = buddies_obs.count(axis=1) + # replace where needed with min std + buddies_obs['std'] = buddies_obs['std'].where(cond=buddies_obs['std'] >= min_std, + other=min_std) + + # correct refstation for altitude + ref_obs = obsdf[obsdf.index.get_level_values('name') == refstation][obstype].unstack(level='name') + ref_obs = ref_obs - (metadf.loc[refstation, 'altitude'] * lapserate) + + # left merge on buddies + buddies_obs = buddies_obs.merge(ref_obs, + how='left', + left_index=True, + right_index=True) + # Calculate sigma + buddies_obs['chi'] = (abs(buddies_obs['mean'] - buddies_obs[refstation])) / buddies_obs['std'] + + outliers = buddies_obs[(buddies_obs['chi'] > std_threshold) & (buddies_obs['samplesize'] >= min_sample_size)] + + # to multiindex + outliers['name'] = refstation + outliers = outliers.reset_index().set_index(['name', 'datetime']).index + outliers_idx = outliers_idx.append(outliers) + + # Update the outliers and replace the obsdf + + obsdf, outlier_df = make_outlier_df_for_check( + station_dt_list=outliers_idx, + obsdf=obsdf, + obstype=obstype, + flag=outl_flag, + ) + + return obsdf, outlier_df + + + +# ============================================================================= +# +# ============================================================================= def titan_buddy_check(obsdf, metadf, obstype, checks_info, checks_settings, titan_specific_labeler): """ diff --git a/metobs_toolkit/settings_files/default_formats_settings.py b/metobs_toolkit/settings_files/default_formats_settings.py index 4e369d12..1d0d1726 100644 --- a/metobs_toolkit/settings_files/default_formats_settings.py +++ b/metobs_toolkit/settings_files/default_formats_settings.py @@ -97,6 +97,7 @@ "repetitions": "#056ff0", "step": "#05d4f0", "window_variation": "#05f0c9", + 'toolkit_buddy_check': '#8300c4', "titan_buddy_check": '#8300c4', "titan_sct_resistant_check": '#c17fe1', # missing and gap diff --git a/metobs_toolkit/settings_files/qc_settings.py b/metobs_toolkit/settings_files/qc_settings.py index 99622a09..c8ba44e3 100644 --- a/metobs_toolkit/settings_files/qc_settings.py +++ b/metobs_toolkit/settings_files/qc_settings.py @@ -38,6 +38,19 @@ "max_increase_per_second": 8.0 / 3600.0, "max_decrease_per_second": -10.0 / 3600.0, } + }, + "toolkit_buddy_check": { + "temp":{ + 'radius': 15000, #Search radius in meter + 'num_min': 2, # int The minimum number of buddies a station can have + 'threshold': 1.5, # σ the variance threshold for flagging a station + 'max_elev_diff': 200, # m the maximum difference in elevation for a buddy (if negative will not check for heigh difference) + 'elev_gradient': -0.0065, # linear elevation gradient with height + 'min_std': 1.0, # If the standard deviation of values in a neighborhood are less than min_std, min_std will be used instead + + } + + } } @@ -146,6 +159,10 @@ "numeric_flag": 8, "apply_on": "obstype", }, + "toolkit_buddy_check": { + "outlier_flag": "tlk buddy check outlier", + "numeric_flag": 11, + "apply_on": "obstype"}, "titan_buddy_check": { "outlier_flag": "buddy check outlier", "numeric_flag": 9, From a691dc4ce9c2ecf6a0c3009dc8e35317efdb4704 Mon Sep 17 00:00:00 2001 From: Thomas Vergauwen Date: Fri, 1 Sep 2023 12:02:11 +0200 Subject: [PATCH 2/6] tested in comparison with titanlib --- metobs_toolkit/qc_checks.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/metobs_toolkit/qc_checks.py b/metobs_toolkit/qc_checks.py index 86e4c3fb..315cd3d4 100644 --- a/metobs_toolkit/qc_checks.py +++ b/metobs_toolkit/qc_checks.py @@ -820,26 +820,32 @@ def toolkit_buddy_check(obsdf, metadf, obstype, buddy_radius, min_sample_size, m buddies_obs = obsdf[obsdf.index.get_level_values('name').isin(buddies)][obstype] # Unstack buddies_obs = buddies_obs.unstack(level='name') + # Make lapsrate correction: - buddy_correction = (metadf.loc[buddies, 'altitude'] * lapserate).to_dict() + ref_alt = metadf.loc[refstation, 'altitude'] + buddy_correction = ((metadf.loc[buddies, 'altitude'] - ref_alt) * (-1. * lapserate)).to_dict() for bud in buddies_obs.columns: buddies_obs[bud] = buddies_obs[bud] - buddy_correction[bud] # calucalate std and mean row wise - buddies_obs['mean'] = buddies_obs.mean(axis=1) - buddies_obs['std'] = buddies_obs.std(axis=1) - buddies_obs['samplesize'] = buddies_obs.count(axis=1) + buddies_obs['mean'] = buddies_obs[buddies].mean(axis=1) + buddies_obs['std'] = buddies_obs[buddies].std(axis=1) + buddies_obs['samplesize'] = buddies_obs[buddies].count(axis=1) + + # from titan they use std adjust which is float std_adjusted = sqrt(variance + variance / n_buddies); + # This is not used + # buddies_obs['var'] = buddies_obs[buddies].var(axis=1) + # buddies_obs['std_adj'] =np.sqrt(buddies_obs['var'] + buddies_obs['var']/buddies_obs['samplesize']) + + # replace where needed with min std buddies_obs['std'] = buddies_obs['std'].where(cond=buddies_obs['std'] >= min_std, other=min_std) - # correct refstation for altitude + # Get refstation observations and merge ref_obs = obsdf[obsdf.index.get_level_values('name') == refstation][obstype].unstack(level='name') - ref_obs = ref_obs - (metadf.loc[refstation, 'altitude'] * lapserate) - - # left merge on buddies buddies_obs = buddies_obs.merge(ref_obs, - how='left', + how='left', #both not needed because if right, than there is no buddy sample per definition. left_index=True, right_index=True) # Calculate sigma @@ -847,6 +853,10 @@ def toolkit_buddy_check(obsdf, metadf, obstype, buddy_radius, min_sample_size, m outliers = buddies_obs[(buddies_obs['chi'] > std_threshold) & (buddies_obs['samplesize'] >= min_sample_size)] + logger.debug(f' Buddy outlier details for {refstation}: \n {buddies}') + # NOTE: the outliers (above) can be interesting to pass back to the dataset?? + + # to multiindex outliers['name'] = refstation outliers = outliers.reset_index().set_index(['name', 'datetime']).index From 332d41b8e5f3bead3fa3f1369847719ef00697f8 Mon Sep 17 00:00:00 2001 From: Thomas Vergauwen Date: Fri, 1 Sep 2023 13:21:49 +0200 Subject: [PATCH 3/6] update documentation --- metobs_toolkit/dataset.py | 36 +++++++++++++++++-- metobs_toolkit/qc_checks.py | 69 +++++++++++++++++++++++++++++++------ 2 files changed, 92 insertions(+), 13 deletions(-) diff --git a/metobs_toolkit/dataset.py b/metobs_toolkit/dataset.py index 91a3bae7..a13bd10a 100644 --- a/metobs_toolkit/dataset.py +++ b/metobs_toolkit/dataset.py @@ -1555,8 +1555,40 @@ def apply_quality_control(self, obstype="temp", self.outliersdf = self.outliersdf.sort_index() - def apply_toolkit_buddy_check(self, obstype='temp', use_constant_altitude=False, - metric_epsg='31370'): + def apply_buddy_check(self, obstype='temp', use_constant_altitude=False, + metric_epsg='31370'): + """Apply the buddy check on the observations. + + The buddy check compares an observation against its neighbours (i.e. + buddies). The check looks for buddies in a neighbourhood specified by + a certain radius. The buddy check flags observations if the + (absolute value of the) difference between the observations and the + average of the neighbours normalized by the standard deviation in the + circle is greater than a predefined threshold. + + This check is based on the buddy check from titanlib. Documentation on + the titanlib buddy check can be found + `here `_. + + + The observation and outliers attributes will be updated accordingly. + + Parameters + ---------- + obstype : String, optional + Name of the observationtype you want to apply the checks on. The + default is 'temp'. + use_constant_altitude : bool, optional + Use a constant altitude for all stations. The default is False. + metric_epsg : str, optional + EPSG code for a metric CRS to calculate distances in. The default + is '31370' which is percise for the region of Belgium. + + Returns + ------- + None. + + """ logger.info("Applying the toolkit buddy check") diff --git a/metobs_toolkit/qc_checks.py b/metobs_toolkit/qc_checks.py index 315cd3d4..a429778f 100644 --- a/metobs_toolkit/qc_checks.py +++ b/metobs_toolkit/qc_checks.py @@ -745,20 +745,17 @@ def _calculate_distance_matrix(metadf, metric_epsg='31370'): return metric_metadf.geometry.apply(lambda g: metric_metadf.geometry.distance(g)) - def _find_spatial_buddies(distance_df, buddy_radius): - """ Get neighbouring stations using buddy radius.""" + """Get neighbouring stations using buddy radius.""" buddies = {} for refstation, distances in distance_df.iterrows(): - bud_stations =distances[distances <= buddy_radius].index.to_list() + bud_stations = distances[distances <= buddy_radius].index.to_list() bud_stations.remove(refstation) buddies[refstation] = bud_stations return buddies - - # filter altitude buddies def _filter_to_altitude_buddies(spatial_buddies, metadf, max_altitude_diff): """Filter neighbours by maximum altitude difference.""" @@ -770,9 +767,6 @@ def _filter_to_altitude_buddies(spatial_buddies, metadf, max_altitude_diff): return alt_buddies_dict - - - def _filter_to_samplesize(buddydict, min_sample_size): """Filter stations that are to isolated using minimum sample size.""" to_check_stations = {} @@ -785,14 +779,68 @@ def _filter_to_samplesize(buddydict, min_sample_size): return to_check_stations +def toolkit_buddy_check(obsdf, metadf, obstype, buddy_radius, min_sample_size, max_alt_diff, + min_std, std_threshold, outl_flag, metric_epsg='31370', lapserate=-0.0065): + """Spatial buddy check. + The buddy check compares an observation against its neighbours (i.e. buddies). The check looks for + buddies in a neighbourhood specified by a certain radius. The buddy check flags observations if the + (absolute value of the) difference between the observations and the average of the neighbours + normalized by the standard deviation in the circle is greater than a predefined threshold. + obsdf: Pandas.DataFrame + The dataframe containing the observations + metadf: Pandas.DataFrame + The dataframe containing the metadata (e.g. latitude, longitude...) -def toolkit_buddy_check(obsdf, metadf, obstype, buddy_radius, min_sample_size, max_alt_diff, - min_std, std_threshold, outl_flag, metric_epsg='31370', lapserate=-0.0065): + obstype: String, optional + The observation type that has to be checked. The default is 'temp' + checks_info: Dictionary + Dictionary with the names of the outlier flags for each check + + checks_settings: Dictionary + Dictionary with the settings for each check + + titan_specific_labeler: Dictionary + Dictionary that maps numeric flags to 'ok' or 'outlier' flags for each titan check + + + Parameters + ---------- + obsdf: Pandas.DataFrame + The dataframe containing the observations + metadf: Pandas.DataFrame + The dataframe containing the metadata (e.g. latitude, longitude...) + obstype: String, optional + The observation type that has to be checked. The default is 'temp' + buddy_radius : numeric + The radius to define neighbours in meters. + min_sample_size : int + The minimum sample size to calculate statistics on. + max_alt_diff : numeric + The maximum altitude difference allowd for buddies. + min_std : numeric + The minimum standard deviation for sample statistics. This should + represent the accuracty of the observations. + std_threshold : numeric + The threshold (std units) for flaggging observations as outliers. + outl_flag : str + Label to give to the outliers. + metric_epsg : str, optional + EPSG code for the metric CRS to calculate distances in. The default is '31370' (which is suitable for Belgium). + lapserate : numeric, optional + Describes how the obstype changes with altitude (in meters). The default is -0.0065. + Returns + ------- + obsdf: Pandas.DataFrame + The dataframe containing the unflagged-observations + outlier_df : Pandas.DataFrame + The dataframe containing the flagged observations + + """ outliers_idx = init_multiindex() # Get spatial buddies for each station @@ -863,7 +911,6 @@ def toolkit_buddy_check(obsdf, metadf, obstype, buddy_radius, min_sample_size, m outliers_idx = outliers_idx.append(outliers) # Update the outliers and replace the obsdf - obsdf, outlier_df = make_outlier_df_for_check( station_dt_list=outliers_idx, obsdf=obsdf, From 751079b172fdb59c117d79900f7aa7e917f03a72 Mon Sep 17 00:00:00 2001 From: Thomas Vergauwen Date: Fri, 1 Sep 2023 13:43:46 +0200 Subject: [PATCH 4/6] convert to budd_check name and add the update methods for the settings --- metobs_toolkit/dataset.py | 2 +- metobs_toolkit/dataset_settings_updater.py | 80 +++++++++++++++++++- metobs_toolkit/qc_checks.py | 2 +- metobs_toolkit/settings_files/qc_settings.py | 4 +- 4 files changed, 83 insertions(+), 5 deletions(-) diff --git a/metobs_toolkit/dataset.py b/metobs_toolkit/dataset.py index a13bd10a..947d6cdc 100644 --- a/metobs_toolkit/dataset.py +++ b/metobs_toolkit/dataset.py @@ -1592,7 +1592,7 @@ def apply_buddy_check(self, obstype='temp', use_constant_altitude=False, logger.info("Applying the toolkit buddy check") - checkname = 'toolkit_buddy_check' + checkname = 'buddy_check' # 1. coordinates are available? if self.metadf['lat'].isnull().any(): diff --git a/metobs_toolkit/dataset_settings_updater.py b/metobs_toolkit/dataset_settings_updater.py index 34a0bd7c..f15b352c 100644 --- a/metobs_toolkit/dataset_settings_updater.py +++ b/metobs_toolkit/dataset_settings_updater.py @@ -191,7 +191,13 @@ def update_qc_settings(self, obstype='temp', win_var_time_win_to_check=None, win_var_min_num_obs=None, step_max_increase_per_sec=None, - step_max_decrease_per_sec=None): + step_max_decrease_per_sec=None, + buddy_radius=None, + buddy_min_sample_size=None, + buddy_max_elev_diff=None, + buddy_min_std=None, + buddy_threshold=None, + buddy_elev_gradient=None): """Update the QC settings for the specified observation type. If a argument value is None, the default settings will not be updated. @@ -228,6 +234,23 @@ def update_qc_settings(self, obstype='temp', Maximal increase per second for step check. The default is None. step_max_decrease_per_sec : numeric (< 0), optional Maximal decrease per second for step check. The default is None. + buddy_radius : numeric (> 0), optional + The radius to define neighbours in meters. The default is None. + buddy_min_sample_size : int (> 2), optional + The minimum sample size to calculate statistics on. The default is + None. + buddy_max_elev_diff : numeric (> 0), optional + The maximum altitude difference allowed for buddies. The default is + None. + buddy_min_std : numeric (> 0), optional + The minimum standard deviation for sample statistics. This should + represent the accuracty of the observations. The default is None. + buddy_threshold : numeric (> 0), optional + The threshold (std units) for flaggging observations as buddy + outliers. The default is None. + buddy_elev_gradient : numeric, optional + Describes how the obstype changes with altitude (in meters). The + default is -0.0065. The default is None. Returns ------- @@ -371,6 +394,61 @@ def _updater(dictionary, obstype, argname, value): logger.info(f'Maximal decrease per second for step check updated: {updatestr}') + # Buddy check + buddy_elev_gradient=None + if buddy_radius is not None: + self.settings.qc['qc_check_settings']["buddy_check"], updatestr = _updater( + self.settings.qc['qc_check_settings']["buddy_check"], + obstype=obstype, + argname="radius", + value=abs(float(buddy_radius))) + logger.info(f'Buddy radius for buddy check updated: {updatestr}') + + if buddy_min_sample_size is not None: + value = abs(int(buddy_min_sample_size)) + if value >= 2: + self.settings.qc['qc_check_settings']["buddy_check"], updatestr = _updater( + self.settings.qc['qc_check_settings']["buddy_check"], + obstype=obstype, + argname="num_min", + value=value) + logger.info(f'Minimum number of buddies for buddy check updated: {updatestr}') + else: + logger.warning(f'Minimum number of buddies must be >= 2, but {value} is given. Not updated.') + + if buddy_max_elev_diff is not None: + self.settings.qc['qc_check_settings']["buddy_check"], updatestr = _updater( + self.settings.qc['qc_check_settings']["buddy_check"], + obstype=obstype, + argname="max_elev_diff", + value=abs(float(buddy_max_elev_diff))) + logger.info(f'Max elevation differences for buddy check updated: {updatestr}') + + if buddy_min_std is not None: + self.settings.qc['qc_check_settings']["buddy_check"], updatestr = _updater( + self.settings.qc['qc_check_settings']["buddy_check"], + obstype=obstype, + argname="min_std", + value=abs(float(buddy_min_std))) + logger.info(f'Minimum std in sample for buddy check updated: {updatestr}') + + if buddy_threshold is not None: + self.settings.qc['qc_check_settings']["buddy_check"], updatestr = _updater( + self.settings.qc['qc_check_settings']["buddy_check"], + obstype=obstype, + argname="threshold", + value=abs(float(buddy_threshold))) + logger.info(f'Outlier threshold (in sigma) for buddy check updated: {updatestr}') + + if buddy_elev_gradient is not None: + self.settings.qc['qc_check_settings']["buddy_check"], updatestr = _updater( + self.settings.qc['qc_check_settings']["buddy_check"], + obstype=obstype, + argname="elev_gradient", + value=float(buddy_max_elev_diff)) + logger.info(f'Elevation gradient for buddy check updated: {updatestr}') + + def update_titan_qc_settings(self, obstype='temp', # buddy settings buddy_radius=None, diff --git a/metobs_toolkit/qc_checks.py b/metobs_toolkit/qc_checks.py index a429778f..2d20ce82 100644 --- a/metobs_toolkit/qc_checks.py +++ b/metobs_toolkit/qc_checks.py @@ -820,7 +820,7 @@ def toolkit_buddy_check(obsdf, metadf, obstype, buddy_radius, min_sample_size, m min_sample_size : int The minimum sample size to calculate statistics on. max_alt_diff : numeric - The maximum altitude difference allowd for buddies. + The maximum altitude difference allowed for buddies. min_std : numeric The minimum standard deviation for sample statistics. This should represent the accuracty of the observations. diff --git a/metobs_toolkit/settings_files/qc_settings.py b/metobs_toolkit/settings_files/qc_settings.py index c8ba44e3..cec2f1ac 100644 --- a/metobs_toolkit/settings_files/qc_settings.py +++ b/metobs_toolkit/settings_files/qc_settings.py @@ -39,7 +39,7 @@ "max_decrease_per_second": -10.0 / 3600.0, } }, - "toolkit_buddy_check": { + "buddy_check": { "temp":{ 'radius': 15000, #Search radius in meter 'num_min': 2, # int The minimum number of buddies a station can have @@ -159,7 +159,7 @@ "numeric_flag": 8, "apply_on": "obstype", }, - "toolkit_buddy_check": { + "buddy_check": { "outlier_flag": "tlk buddy check outlier", "numeric_flag": 11, "apply_on": "obstype"}, From 592b063257748a95c0be75a9084dd337202ffb25 Mon Sep 17 00:00:00 2001 From: Thomas Vergauwen Date: Fri, 1 Sep 2023 13:58:17 +0200 Subject: [PATCH 5/6] write test for buddy check --- metobs_toolkit/dataset.py | 2 +- .../default_formats_settings.py | 2 +- metobs_toolkit/settings_files/qc_settings.py | 4 ++-- tests/push_test/qc_test.py | 21 ++++++++++++++----- 4 files changed, 20 insertions(+), 9 deletions(-) diff --git a/metobs_toolkit/dataset.py b/metobs_toolkit/dataset.py index 947d6cdc..542a2db2 100644 --- a/metobs_toolkit/dataset.py +++ b/metobs_toolkit/dataset.py @@ -1626,7 +1626,7 @@ def apply_buddy_check(self, obstype='temp', use_constant_altitude=False, apliable = _can_qc_be_applied(self, obstype, checkname) if apliable: buddy_set = self.settings.qc['qc_check_settings'][checkname][obstype] - outl_flag = self.settings.qc['qc_checks_info']['toolkit_buddy_check']['outlier_flag'] + outl_flag = self.settings.qc['qc_checks_info'][checkname]['outlier_flag'] obsdf, outliersdf = toolkit_buddy_check(obsdf=self.df, metadf=self.metadf, obstype=obstype, diff --git a/metobs_toolkit/settings_files/default_formats_settings.py b/metobs_toolkit/settings_files/default_formats_settings.py index 1d0d1726..798bf60a 100644 --- a/metobs_toolkit/settings_files/default_formats_settings.py +++ b/metobs_toolkit/settings_files/default_formats_settings.py @@ -97,7 +97,7 @@ "repetitions": "#056ff0", "step": "#05d4f0", "window_variation": "#05f0c9", - 'toolkit_buddy_check': '#8300c4', + 'buddy_check': '#8300c4', "titan_buddy_check": '#8300c4', "titan_sct_resistant_check": '#c17fe1', # missing and gap diff --git a/metobs_toolkit/settings_files/qc_settings.py b/metobs_toolkit/settings_files/qc_settings.py index cec2f1ac..d52588f8 100644 --- a/metobs_toolkit/settings_files/qc_settings.py +++ b/metobs_toolkit/settings_files/qc_settings.py @@ -160,11 +160,11 @@ "apply_on": "obstype", }, "buddy_check": { - "outlier_flag": "tlk buddy check outlier", + "outlier_flag": "buddy check outlier", "numeric_flag": 11, "apply_on": "obstype"}, "titan_buddy_check": { - "outlier_flag": "buddy check outlier", + "outlier_flag": "titan buddy check outlier", "numeric_flag": 9, "apply_on": "obstype", }, diff --git a/tests/push_test/qc_test.py b/tests/push_test/qc_test.py index 13274cf7..7b982861 100755 --- a/tests/push_test/qc_test.py +++ b/tests/push_test/qc_test.py @@ -37,6 +37,16 @@ dataset.get_qc_stats(make_plot=False) dataset.get_qc_stats(obstype="humidity", make_plot=False) +#%% Apply buddy check +dataset.update_qc_settings(buddy_radius=17000, + buddy_min_sample_size=3, + buddy_max_elev_diff=150, + buddy_min_std=1.2, + buddy_threshold=2.4, + buddy_elev_gradient=None) + +dataset.apply_buddy_check(use_constant_altitude=True) +assert dataset.outliersdf['label'].value_counts()['buddy check outlier'] == 125, 'The buddy check did not perfom good.' # %% Apply Qc on obstype not specified in settings @@ -48,10 +58,11 @@ sta = dataset.get_station("vlinder05") sta.apply_quality_control() - +sta.apply_buddy_check(use_constant_altitude=True) test = sta.get_qc_stats(make_plot=True) -# sta.get_qc_stats(make_plot=True) + + #%% Apply titan checks @@ -61,7 +72,7 @@ buddy_radius=50000, buddy_num_min=3, buddy_max_elev_diff=200, - buddy_threshold=3) + buddy_threshold=2) @@ -69,7 +80,7 @@ # count test -assert dataset.outliersdf['label'].value_counts()['buddy check outlier'] == 57, 'The buddy check did not perfom good.' +assert dataset.outliersdf['label'].value_counts()['titan buddy check outlier'] == 277, 'The TITAN buddy check did not perfom good.' # test if a check does not overwrite itself dataset.update_titan_qc_settings(obstype='temp', @@ -79,7 +90,7 @@ buddy_threshold=0.5) dataset.apply_titan_buddy_check(use_constant_altitude=True) -assert dataset.outliersdf['label'].value_counts()['buddy check outlier'] == 57, 'The buddy check did overwrite itself!' +assert dataset.outliersdf['label'].value_counts()['titan buddy check outlier'] == 277, 'The buddy check did overwrite itself!' #%% From 017e7b3f2b232abfb5d9718fdb59b405876cfccb Mon Sep 17 00:00:00 2001 From: Thomas Vergauwen Date: Fri, 1 Sep 2023 14:41:09 +0200 Subject: [PATCH 6/6] added haversine approximation for global usage --- metobs_toolkit/dataset.py | 15 +++++++++---- metobs_toolkit/qc_checks.py | 44 +++++++++++++++++++++++++++++++++---- 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/metobs_toolkit/dataset.py b/metobs_toolkit/dataset.py index 542a2db2..bff3c56d 100644 --- a/metobs_toolkit/dataset.py +++ b/metobs_toolkit/dataset.py @@ -1556,7 +1556,7 @@ def apply_quality_control(self, obstype="temp", def apply_buddy_check(self, obstype='temp', use_constant_altitude=False, - metric_epsg='31370'): + haversine_approx=True, metric_epsg='31370'): """Apply the buddy check on the observations. The buddy check compares an observation against its neighbours (i.e. @@ -1580,9 +1580,14 @@ def apply_buddy_check(self, obstype='temp', use_constant_altitude=False, default is 'temp'. use_constant_altitude : bool, optional Use a constant altitude for all stations. The default is False. + haversine_approx : bool, optional + Use the haversine approximation (earth is a sphere) to calculate + distances between stations. The default is True. metric_epsg : str, optional - EPSG code for a metric CRS to calculate distances in. The default - is '31370' which is percise for the region of Belgium. + EPSG code for the metric CRS to calculate distances in. Only used when + haversine approximation is set to False. Thus becoming a better + distance approximation but not global applicable The default is '31370' + (which is suitable for Belgium). Returns ------- @@ -1637,7 +1642,9 @@ def apply_buddy_check(self, obstype='temp', use_constant_altitude=False, std_threshold=buddy_set['threshold'], metric_epsg=metric_epsg, lapserate=buddy_set['elev_gradient'], - outl_flag=outl_flag) + outl_flag=outl_flag, + haversine_approx=haversine_approx, + ) # update the dataset and outliers self.df = obsdf diff --git a/metobs_toolkit/qc_checks.py b/metobs_toolkit/qc_checks.py index 2d20ce82..66f42047 100644 --- a/metobs_toolkit/qc_checks.py +++ b/metobs_toolkit/qc_checks.py @@ -737,7 +737,34 @@ def create_titanlib_points_dict(obsdf, metadf, obstype): # Toolkit buddy check # ============================================================================= - +def _calculate_distance_matrix_with_haverine(metadf): + from math import radians, cos, sin, asin, sqrt + + def haversine(lon1, lat1, lon2, lat2): + """ + Calculate the great circle distance between two points + on the earth (specified in decimal degrees) + """ + # convert decimal degrees to radians + lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2]) + + # haversine formula + dlon = lon2 - lon1 + dlat = lat2 - lat1 + a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2 + c = 2 * asin(sqrt(a)) + r = 6367000 # Radius of earth in meter. + return c * r + + distance_matrix = {} + for sta1, row1 in metadf.iterrows(): + distance_matrix[sta1] = {} + for sta2, row2 in metadf.iterrows(): + distance_matrix[sta1][sta2] = haversine(row1.geometry.x, + row1.geometry.y, + row2.geometry.x, + row2.geometry.y) + return pd.DataFrame(distance_matrix) def _calculate_distance_matrix(metadf, metric_epsg='31370'): @@ -780,7 +807,7 @@ def _filter_to_samplesize(buddydict, min_sample_size): def toolkit_buddy_check(obsdf, metadf, obstype, buddy_radius, min_sample_size, max_alt_diff, - min_std, std_threshold, outl_flag, metric_epsg='31370', lapserate=-0.0065): + min_std, std_threshold, outl_flag, haversine_approx=True, metric_epsg='31370', lapserate=-0.0065): """Spatial buddy check. The buddy check compares an observation against its neighbours (i.e. buddies). The check looks for @@ -828,8 +855,14 @@ def toolkit_buddy_check(obsdf, metadf, obstype, buddy_radius, min_sample_size, m The threshold (std units) for flaggging observations as outliers. outl_flag : str Label to give to the outliers. + haversine_approx : bool, optional + Use the haversine approximation (earth is a sphere) to calculate + distances between stations. The default is True. metric_epsg : str, optional - EPSG code for the metric CRS to calculate distances in. The default is '31370' (which is suitable for Belgium). + EPSG code for the metric CRS to calculate distances in. Only used when + haversine approximation is set to False. Thus becoming a better + distance approximation but not global applicable The default is '31370' + (which is suitable for Belgium). lapserate : numeric, optional Describes how the obstype changes with altitude (in meters). The default is -0.0065. @@ -844,7 +877,10 @@ def toolkit_buddy_check(obsdf, metadf, obstype, buddy_radius, min_sample_size, m outliers_idx = init_multiindex() # Get spatial buddies for each station - distance_df = _calculate_distance_matrix(metadf=metadf, + if haversine_approx: + distance_df = _calculate_distance_matrix_with_haverine(metadf=metadf) + else: + distance_df = _calculate_distance_matrix(metadf=metadf, metric_epsg=metric_epsg) buddies = _find_spatial_buddies(distance_df=distance_df, buddy_radius=buddy_radius)