diff --git a/docs/whats-new.md b/docs/whats-new.md index a6d38c9..094c4f4 100644 --- a/docs/whats-new.md +++ b/docs/whats-new.md @@ -18,6 +18,7 @@ - added dedicated plotting functions in [#64](https://github.com/Deltares-research/kenmerkendewaarden/pull/64), [#66](https://github.com/Deltares-research/kenmerkendewaarden/pull/66) and [#68](https://github.com/Deltares-research/kenmerkendewaarden/pull/68) - added computation of hat/lat from measurements with `kw.calc_hat_lat_frommeasurements()` in [#74](https://github.com/Deltares-research/kenmerkendewaarden/pull/74) - added modular check for timeseries coverage in [#76](https://github.com/Deltares-research/kenmerkendewaarden/pull/76) +- simplified methods for overschrijdingsfrequenties and reducing public functions to `kw.calc_overschrijding()` in [#81](https://github.com/Deltares-research/kenmerkendewaarden/pull/81) ### Fix - implemented workaround for pandas 2.2.0 with different rounding behaviour in [#69](https://github.com/Deltares-research/kenmerkendewaarden/pull/69) diff --git a/examples/KWK_process.py b/examples/KWK_process.py index 111c6d7..c7a2c74 100644 --- a/examples/KWK_process.py +++ b/examples/KWK_process.py @@ -235,16 +235,10 @@ 1/500, 1/1000, 1/2000, 1/4000, 1/5000, 1/10000] #TODO: which frequencies are realistic with n years of data? probably remove this entire row >> met 40 jaar data kun je in principe tot 1/40 gaan, maar met weibull kun je extrapoleren en in theorie >> dit is voor tabel die je eruit wil hebben if compute_overschrijding and data_pd_HWLW_all is not None: - print(f'overschrijdingsfrequenties for {current_station}') - # exclude data before physical break # TODO: make optional with argument for overschrijdingsfreqs function - data_pd_measext = kw.data_retrieve.clip_timeseries_physical_break(data_pd_HWLW_all_12) # only include data up to year_slotgem - data_pd_measext = data_pd_measext.loc[:tstop_dt] - - data_pd_HW = data_pd_measext.loc[data_pd_measext['HWLWcode']==1] - data_pd_LW = data_pd_measext.loc[data_pd_measext['HWLWcode']!=1] + data_pd_measext = data_pd_HWLW_all_12.loc[:tstop_dt] #get Hydra-NL and KWK-RMM validation data (only for HOEKVHLD) dist_vali_exc = {} @@ -266,30 +260,27 @@ if os.path.exists(file_vali_dec): dist_vali_dec['validation'] = pd.read_csv(file_vali_dec,sep=';') dist_vali_dec['validation']['values'] /= 100 - - #set station rules - station_rule_type = 'break' - #TODO: we already excluded the data before the physical_break, so can just supply the starttime here - station_break_value = data_pd_measext.index.min() - + # 1. Exceedance print('Exceedance') - dist_exc = kw.compute_overschrijding(data_pd_HW, rule_type=station_rule_type, rule_value=station_break_value) - dist_exc.update(dist_vali_exc) - df_interp = kw.interpolate_interested_Tfreqs(dist_exc['Gecombineerd'], Tfreqs=Tfreqs_interested) + dist_exc = kw.calc_overschrijding(df_ext=data_pd_measext, rule_type=None, rule_value=None, + clip_physical_break=True, dist=dist_vali_exc, + interp_freqs=Tfreqs_interested) + df_interp = dist_exc['Geinterpoleerd'] df_interp.to_csv(os.path.join(dir_overschrijding, f'Exceedance_{current_station}.csv'), index=False, sep=';') - fig, ax = kw.plot_distributions(dist_exc, name=current_station, color_map='default') + fig, ax = kw.plot_overschrijding(dist_exc) ax.set_ylim(0,5.5) fig.savefig(os.path.join(dir_overschrijding, f'Exceedance_lines_{current_station}.png')) # 2. Deceedance print('Deceedance') - dist_dec = kw.compute_overschrijding(data_pd_LW, rule_type=station_rule_type, rule_value=station_break_value, inverse=True) - dist_dec.update(dist_vali_dec) - df_interp = kw.interpolate_interested_Tfreqs(dist_dec['Gecombineerd'], Tfreqs=Tfreqs_interested) + dist_dec = kw.calc_overschrijding(df_ext=data_pd_measext, rule_type=None, rule_value=None, + clip_physical_break=True, dist=dist_vali_dec, inverse=True, + interp_freqs=Tfreqs_interested) + df_interp = dist_dec['Geinterpoleerd'] df_interp.to_csv(os.path.join(dir_overschrijding, f'Deceedance_{current_station}.csv'), index=False, sep=';') - fig, ax = kw.plot_distributions(dist_dec, name=current_station, color_map='default') + fig, ax = kw.plot_overschrijding(dist_dec) fig.savefig(os.path.join(dir_overschrijding, f'Deceedance_lines_{current_station}.png')) diff --git a/kenmerkendewaarden/overschrijding.py b/kenmerkendewaarden/overschrijding.py index d3bbb38..6520869 100644 --- a/kenmerkendewaarden/overschrijding.py +++ b/kenmerkendewaarden/overschrijding.py @@ -10,13 +10,16 @@ from scipy import optimize, signal from typing import Union, List import datetime as dt -import os +import logging +from kenmerkendewaarden.data_retrieve import clip_timeseries_physical_break +from kenmerkendewaarden.utils import raise_extremes_with_aggers -__all__ = ["compute_overschrijding", - "interpolate_interested_Tfreqs", - "plot_distributions", +__all__ = ["calc_overschrijding", + "plot_overschrijding", ] +logger = logging.getLogger(__name__) + def get_threshold_rowidx(df): # TODO: base on frequency or value? @@ -25,30 +28,73 @@ def get_threshold_rowidx(df): return rowidx_tresholdfreq -def compute_overschrijding(df_extrema, rule_type, rule_value, inverse=False): +def calc_overschrijding(df_ext:pd.DataFrame, dist:dict = None, + inverse:bool = False, clip_physical_break:bool = False, + rule_type:str = None, rule_value=None, + interp_freqs:list = None): + """ + Compute exceedance/deceedance frequencies based on measured extreme waterlevels. + + Parameters + ---------- + df_ext : pd.DataFrame, optional + The timeseries of extremes (high and low waters). The default is None. + dist : dict, optional + A pre-filled dictionary with a Hydra-NL and/or validation distribution. The default is None. + inverse : bool, optional + Whether to compute deceedance instead of exceedance frequencies. The default is False. + clip_physical_break : bool, optional + Whether to exclude the part of the timeseries before physical breaks like estuary closures. The default is False. + rule_type : str, optional + break/linear/None, passed on to apply_trendanalysis(). The default is None. + rule_value : TYPE, optional + Value corresponding to rule_type. The default is None. + interp_freqs : list, optional + The frequencies to interpolate to, providing this will result in a + "Geinterpoleerd" key in the returned dictionary. The default is None. + + Returns + ------- + dist : dict + A dictionary with several distributions. + + """ + raise_extremes_with_aggers(df_ext) + # take only high or low extremes + # TODO: this might not be useful in case of river discharge influenced stations where a filter is needed + if inverse: + df_extrema = df_ext.loc[df_ext['HWLWcode']!=1] + else: + df_extrema = df_ext.loc[df_ext['HWLWcode']==1] + + if clip_physical_break: + df_extrema = clip_timeseries_physical_break(df_extrema) df_extrema_clean = df_extrema.copy()[['values']] #drop all info but the values (times-idx, HWLWcode etc) - dist = {} #TODO: replace with pandas.DataFrame? - print('Calculate unfiltered distribution') + if dist is None: + dist = {} + + logger.info('Calculate unfiltered distribution') dist['Ongefilterd'] = distribution(df_extrema_clean, inverse=inverse) + # TODO: re-enable filter for river discharge peaks """# filtering is only applicable for stations with high river discharge influence, so disabled #TODO: ext is geschikt voor getij, maar bij hoge afvoergolf wil je alleen het echte extreem. Er is dan een treshold per station nodig, is nodig om de rivierafvoerpiek te kunnen duiden. - print('Calculate filtered distribution') - df_peaks, threshold, _ = hatyan.detect_peaks(df_extrema_clean) + logger.info('Calculate filtered distribution') + df_peaks, threshold, _ = detect_peaks(df_extrema_clean) if metadata_station['apply_treshold']: temp[metadata_station['id']] = threshold - df_extrema_filt = hatyan.filter_with_threshold(df_extrema_clean, df_peaks, threshold) + df_extrema_filt = filter_with_threshold(df_extrema_clean, df_peaks, threshold) else: df_extrema_filt = df_extrema_clean.copy() - dist['Gefilterd'] = hatyan.distribution(df_extrema_filt.copy()) + dist['Gefilterd'] = distribution(df_extrema_filt.copy()) """ - print('Calculate filtered distribution with trendanalysis') + logger.info('Calculate filtered distribution with trendanalysis') df_trend = apply_trendanalysis(df_extrema_clean, rule_type=rule_type, rule_value=rule_value) dist['Trendanalyse'] = distribution(df_trend.copy(), inverse=inverse) - print('Fit Weibull to filtered distribution with trendanalysis') + logger.info('Fit Weibull to filtered distribution with trendanalysis') idx_maxfreq_trend = get_threshold_rowidx(dist['Trendanalyse']) treshold_value = dist['Trendanalyse'].iloc[idx_maxfreq_trend]['values'] treshold_Tfreq = dist['Trendanalyse'].iloc[idx_maxfreq_trend]['values_Tfreq'] @@ -57,12 +103,21 @@ def compute_overschrijding(df_extrema, rule_type, rule_value, inverse=False): Tfreqs=np.logspace(-5, np.log10(treshold_Tfreq), 5000), inverse=inverse) - print('Blend trend and weibull together') # and Hydra-NL - dist['Gecombineerd'] = blend_distributions(dist['Trendanalyse'].copy(), - dist['Weibull'].copy(), - #dist['Hydra-NL'].copy(), - ) - + if "Hydra-NL" in dist.keys(): + logger.info('Blend trend, weibull and Hydra-NL') + # TODO: now based on hardcoded Hydra-NL dict key which is already part of the input dist dict, this is tricky + df_hydra = dist['Hydra-NL'].copy() + else: + logger.info('Blend trend and weibull') + df_hydra = None + dist['Gecombineerd'] = blend_distributions(df_trend=dist['Trendanalyse'].copy(), + df_weibull=dist['Weibull'].copy(), + df_hydra=df_hydra) + + if interp_freqs is not None: + dist['Geinterpoleerd'] = interpolate_interested_Tfreqs(dist['Gecombineerd'], Tfreqs=interp_freqs) + + """ if row['apply_treshold']: keys = list(dist.keys()) @@ -130,11 +185,8 @@ def detect_peaks_hkv(df: pd.DataFrame, window: int, inverse: bool = False, thres peaks = np.ones(len(values)) * np.nan t_peaks, dt_left_peaks, dt_right_peaks = peaks.copy(), peaks.copy(), peaks.copy() - - if inverse: - print('Determining peaks (inverse)') - else: - print('Determining peaks') + + logger.info(f'Determining peaks (inverse={inverse})') peak_count = 0 while len(values_sorted) != 0: _t, _p = times_sorted[0], values_sorted[0] @@ -275,24 +327,16 @@ def apply_trendanalysis(df: pd.DataFrame, rule_type: str, rule_value: Union[floa # that there is no linear trend at the latest time (so it works its way back # in the past). rule_value should be entered as going forward in time if rule_type == 'break': - #if not isinstance(rule_value,dt.datetime): #TODO: commented this since it did not work with strings, but indexing df did not work with dt.datetime() anymore. Fix and make more generic. - # raise Exception('rule_value should be of instance dt.datetime') - #return df[dt.datetime.strptime(rule_value, '%d-%M-%Y'):].copy() return df[rule_value:].copy() elif rule_type == 'linear': df, rule_value = df.copy(), float(rule_value) dx = np.array([rule_value*x.total_seconds()/(365*24*3600) for x in (df.index[-1] - df.index)]) df['values'] = df['values'] + dx return df - elif (rule_type == '') or (rule_type is None): + elif rule_type is None: return df.copy() - elif isinstance(rule_type, np.float): - if np.isnan(rule_type): - return df.copy() - else: - raise ValueError('Incorrect rule_type passed to function. Only break or linear are supported') else: - raise ValueError('Incorrect rule_type passed to function. Only break or linear are supported') + raise ValueError(f'Incorrect rule_type="{rule_type}" passed to function. Only "break", "linear" or None are supported') def blend_distributions(df_trend: pd.DataFrame, df_weibull: pd.DataFrame, df_hydra: pd.DataFrame = None) -> pd.DataFrame: @@ -347,31 +391,47 @@ def blend_distributions(df_trend: pd.DataFrame, df_weibull: pd.DataFrame, df_hyd return df_blended -def plot_distributions(dist: dict, name: str, - keys: List[str] = None, - color_map: dict = None, - xlabel: str = 'Exceedance frequency [1/yrs]', - ylabel: str = 'Waterlevel [m]', - legend_loc: str = 'lower right'): +def plot_overschrijding(dist: dict): + """ + plot overschrijding/onderschrijding + + Parameters + ---------- + dist : dict + Dictionary as returned from `kw.calc_overschrijding()`. + + Returns + ------- + fig : matplotlib.figure.Figure + Figure handle. + ax : matplotlib.axes._axes.Axes + Figure axis handle. + """ + + station = dist["Ongefilterd"].attrs["station"] - if color_map=='default': - color_map = {'Ongefilterd': 'b', 'Gefilterd': 'orange', 'Trendanalyse': 'g', - 'Weibull': 'r', 'Hydra-NL': 'm', 'Hydra-NL met modelonzekerheid': 'cyan', - 'Gecombineerd': 'k'} + color_map = {'Ongefilterd': 'b', 'Gefilterd': 'orange', 'Trendanalyse': 'g', + 'Weibull': 'r', 'Hydra-NL': 'm', 'Hydra-NL met modelonzekerheid': 'cyan', + 'Gecombineerd': 'k', 'Geinterpoleerd': 'lime'} fig, ax = plt.subplots(figsize=(8, 6)) - if keys is None: - keys = list(dist.keys()) - for k in keys: + + for k in dist.keys(): c = color_map[k] if (color_map is not None) and (k in color_map.keys()) else None if k=='Gecombineerd': ax.plot(dist[k]['values_Tfreq'], dist[k]['values'], '--', label=k, c=c) + elif k=='Geinterpoleerd': + ax.plot(dist[k]['values_Tfreq'], dist[k]['values'], 'o', label=k, c=c, markersize=5) else: ax.plot(dist[k]['values_Tfreq'], dist[k]['values'], label=k, c=c) - ax.set_title(name) - ax.set_xlabel(xlabel), ax.set_xscale('log'), ax.set_xlim([1e-5, 1e3]), ax.invert_xaxis() - ax.set_ylabel(ylabel) - ax.legend(fontsize='medium', loc=legend_loc) + + ax.set_title(f"Distribution for {station}") + ax.set_xlabel('Frequency [1/yrs]') + ax.set_xscale('log') + ax.set_xlim([1e-5, 1e3]) + ax.invert_xaxis() + ax.set_ylabel("Waterlevel [m]") + ax.legend(fontsize='medium', loc='lower right') ax.xaxis.set_minor_locator(ticker.LogLocator(base=10.0, subs=tuple(i / 10 for i in range(1, 10)), numticks=12)) ax.xaxis.set_minor_formatter(ticker.NullFormatter()), ax.yaxis.set_minor_locator(ticker.MultipleLocator(0.1)) #this was 10, but now meters instead of cm @@ -383,17 +443,6 @@ def plot_distributions(dist: dict, name: str, return fig,ax -def interpolate_interested_Tfreqs_to_csv(df: pd.DataFrame, Tfreqs: List[float], - id: str, csv_dir: os.PathLike, prefix: str,) -> pd.DataFrame: - df_interp = pd.DataFrame(data={'values': np.interp(Tfreqs, - np.flip(df['values_Tfreq'].values), - np.flip(df['values'].values)), - 'values_Tfreq': Tfreqs}).sort_values(by='values_Tfreq', ascending=False) - #prefix = os.path.basename(csv_dir) - df_interp.to_csv(os.path.join(csv_dir, f'{prefix}_{id}.csv'), index=False, sep=';') - return df_interp - - def interpolate_interested_Tfreqs(df: pd.DataFrame, Tfreqs: List[float]) -> pd.DataFrame: df_interp = pd.DataFrame(data={'values': np.interp(Tfreqs, np.flip(df['values_Tfreq'].values), diff --git a/tests/test_overschrijding.py b/tests/test_overschrijding.py new file mode 100644 index 0000000..e7026c9 --- /dev/null +++ b/tests/test_overschrijding.py @@ -0,0 +1,78 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Jun 20 11:29:41 2024 + +@author: veenstra +""" + +import pytest +import kenmerkendewaarden as kw +import numpy as np +import pandas as pd + + +@pytest.mark.unittest +def test_calc_overschrijding(df_ext_12_2010_2014): + Tfreqs_interested = [5, 2, 1, 1/2, 1/5, 1/10, 1/20, 1/50, 1/100, 1/200] + dist = kw.calc_overschrijding(df_ext=df_ext_12_2010_2014, interp_freqs=Tfreqs_interested) + + expected_keys = ['Ongefilterd', 'Trendanalyse', 'Weibull', 'Gecombineerd', 'Geinterpoleerd'] + assert set(dist.keys()) == set(expected_keys) + assert np.allclose(dist['Geinterpoleerd']['values_Tfreq'].values, Tfreqs_interested) + expected_values = np.array([1.93 , 2.09327434, 2.26311592, 2.44480348, 2.70434509, + 2.91627091, 3.14247786, 3.46480369, 3.72735283, 4.00701551]) + assert np.allclose(dist['Geinterpoleerd']['values'].values, expected_values) + + +@pytest.mark.unittest +def test_calc_overschrijding_rule_type_break(df_ext_12_2010_2014): + Tfreqs_interested = [5, 2, 1, 1/2, 1/5, 1/10, 1/20, 1/50, 1/100, 1/200] + dist = kw.calc_overschrijding(df_ext=df_ext_12_2010_2014, interp_freqs=Tfreqs_interested, + rule_type='break', rule_value="2012") + + expected_keys = ['Ongefilterd', 'Trendanalyse', 'Weibull', 'Gecombineerd', 'Geinterpoleerd'] + assert set(dist.keys()) == set(expected_keys) + assert np.allclose(dist['Geinterpoleerd']['values_Tfreq'].values, Tfreqs_interested) + expected_values = np.array([1.93 , 2.11353631, 2.37418889, 2.61405772, 2.90685818, + 3.11376817, 3.31039598, 3.55705172, 3.73504553, 3.90661043]) + assert np.allclose(dist['Geinterpoleerd']['values'].values, expected_values) + + +@pytest.mark.unittest +def test_calc_overschrijding_clip_physical_break(df_ext_12_2010_2014): + # construct fake timeseries for VLIELHVN around physical break + tstart_2010 = df_ext_12_2010_2014.index[0] + tstart_1931 = pd.Timestamp(1931, tstart_2010.month, tstart_2010.day, + tstart_2010.hour, tstart_2010.minute, tstart_2010.second, + tz=tstart_2010.tz) + tdiff = tstart_2010 - tstart_1931 + + df_ext_12_1931_1935 = df_ext_12_2010_2014.copy() + df_ext_12_1931_1935.index = df_ext_12_1931_1935.index - tdiff + df_ext_12_1931_1935.attrs["station"] = "VLIELHVN" + + Tfreqs_interested = [5, 2, 1, 1/2, 1/5, 1/10, 1/20, 1/50, 1/100, 1/200] + dist_normal = kw.calc_overschrijding(df_ext=df_ext_12_1931_1935, interp_freqs=Tfreqs_interested, + clip_physical_break=False) + dist_clip = kw.calc_overschrijding(df_ext=df_ext_12_1931_1935, interp_freqs=Tfreqs_interested, + clip_physical_break=True) + + expected_values_normal = np.array([1.93 , 2.09327434, 2.26311592, 2.44480348, 2.70434509, + 2.91627091, 3.14247786, 3.46480369, 3.72735283, 4.00701551]) + assert np.allclose(dist_normal['Geinterpoleerd']['values'].values, expected_values_normal) + expected_values_clip = np.array([1.93 , 2.11390828, 2.37452851, 2.61437251, 2.90714768, + 3.11404269, 3.3106574 , 3.5572988 , 3.73528341, 3.90683996]) + assert np.allclose(dist_clip['Geinterpoleerd']['values'].values, expected_values_clip) + + +@pytest.mark.unittest +def test_calc_overschrijding_aggers(df_ext_2010_2014): + with pytest.raises(ValueError) as e: + kw.calc_overschrijding(df_ext=df_ext_2010_2014) + assert "contains aggers" in str(e.value) + + +@pytest.mark.unittest +def test_plot_overschrijding(df_ext_12_2010_2014): + dist = kw.calc_overschrijding(df_ext=df_ext_12_2010_2014) + kw.plot_overschrijding(dist)