26 refactor and reproduce overschrijdingsfrequenties method (#81)
* removed hatyan from comments

* renamed public functions

* switched from prints to logging

* simplified rule_type and rule_value arguments

* removed interpolate_interested_Tfreqs_to_csv()

* fixed blending of Hydra-NL by adding `dist` optional argument

* raise an error for extremes with aggers

* added tests

* updated whatsnew

* simplified plot

* added docstrings for public functions
veenstrajelmer authored Jun 20, 2024
1 parent 723d4b3 commit 9e408f1
Showing 4 changed files with 202 additions and 83 deletions.
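In short, this commit collapses the public overschrijding API into one compute function and one plot function. A minimal before/after sketch of the call pattern; the data variables are placeholders taken from the example script changed below:

```python
import kenmerkendewaarden as kw

# before this commit (public functions removed in this PR)
dist_exc = kw.compute_overschrijding(data_pd_HW, rule_type='break', rule_value=break_value)
dist_exc.update(dist_vali_exc)
df_interp = kw.interpolate_interested_Tfreqs(dist_exc['Gecombineerd'], Tfreqs=Tfreqs_interested)
fig, ax = kw.plot_distributions(dist_exc, name=current_station, color_map='default')

# after this commit: clipping, validation distributions and interpolation in one call
dist_exc = kw.calc_overschrijding(df_ext=data_pd_measext, clip_physical_break=True,
                                  dist=dist_vali_exc, interp_freqs=Tfreqs_interested)
df_interp = dist_exc['Geinterpoleerd']
fig, ax = kw.plot_overschrijding(dist_exc)
```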
1 change: 1 addition & 0 deletions docs/whats-new.md
@@ -18,6 +18,7 @@
- added dedicated plotting functions in [#64](https://github.com/Deltares-research/kenmerkendewaarden/pull/64), [#66](https://github.com/Deltares-research/kenmerkendewaarden/pull/66) and [#68](https://github.com/Deltares-research/kenmerkendewaarden/pull/68)
- added computation of hat/lat from measurements with `kw.calc_hat_lat_frommeasurements()` in [#74](https://github.com/Deltares-research/kenmerkendewaarden/pull/74)
- added modular check for timeseries coverage in [#76](https://github.com/Deltares-research/kenmerkendewaarden/pull/76)
- simplified methods for overschrijdingsfrequenties and reduced public functions to `kw.calc_overschrijding()` in [#81](https://github.com/Deltares-research/kenmerkendewaarden/pull/81)

### Fix
- implemented workaround for pandas 2.2.0 with different rounding behaviour in [#69](https://github.com/Deltares-research/kenmerkendewaarden/pull/69)
33 changes: 12 additions & 21 deletions examples/KWK_process.py
@@ -235,16 +235,10 @@
1/500, 1/1000, 1/2000, 1/4000, 1/5000, 1/10000] #TODO: which frequencies are realistic with n years of data? probably remove this entire row >> with 40 years of data you can in principle go down to 1/40, but with weibull you can extrapolate further in theory >> this is for the output table you want to generate

if compute_overschrijding and data_pd_HWLW_all is not None:

print(f'overschrijdingsfrequenties for {current_station}')

# exclude data before physical break # TODO: make optional with argument for overschrijdingsfreqs function
data_pd_measext = kw.data_retrieve.clip_timeseries_physical_break(data_pd_HWLW_all_12)
# only include data up to year_slotgem
data_pd_measext = data_pd_measext.loc[:tstop_dt]

data_pd_HW = data_pd_measext.loc[data_pd_measext['HWLWcode']==1]
data_pd_LW = data_pd_measext.loc[data_pd_measext['HWLWcode']!=1]
data_pd_measext = data_pd_HWLW_all_12.loc[:tstop_dt]

#get Hydra-NL and KWK-RMM validation data (only for HOEKVHLD)
dist_vali_exc = {}
@@ -266,30 +260,27 @@
if os.path.exists(file_vali_dec):
dist_vali_dec['validation'] = pd.read_csv(file_vali_dec,sep=';')
dist_vali_dec['validation']['values'] /= 100

#set station rules
station_rule_type = 'break'
#TODO: we already excluded the data before the physical_break, so can just supply the starttime here
station_break_value = data_pd_measext.index.min()


# 1. Exceedance
print('Exceedance')
dist_exc = kw.compute_overschrijding(data_pd_HW, rule_type=station_rule_type, rule_value=station_break_value)
dist_exc.update(dist_vali_exc)
df_interp = kw.interpolate_interested_Tfreqs(dist_exc['Gecombineerd'], Tfreqs=Tfreqs_interested)
dist_exc = kw.calc_overschrijding(df_ext=data_pd_measext, rule_type=None, rule_value=None,
clip_physical_break=True, dist=dist_vali_exc,
interp_freqs=Tfreqs_interested)
df_interp = dist_exc['Geinterpoleerd']
df_interp.to_csv(os.path.join(dir_overschrijding, f'Exceedance_{current_station}.csv'), index=False, sep=';')

fig, ax = kw.plot_distributions(dist_exc, name=current_station, color_map='default')
fig, ax = kw.plot_overschrijding(dist_exc)
ax.set_ylim(0,5.5)
fig.savefig(os.path.join(dir_overschrijding, f'Exceedance_lines_{current_station}.png'))

# 2. Deceedance
print('Deceedance')
dist_dec = kw.compute_overschrijding(data_pd_LW, rule_type=station_rule_type, rule_value=station_break_value, inverse=True)
dist_dec.update(dist_vali_dec)
df_interp = kw.interpolate_interested_Tfreqs(dist_dec['Gecombineerd'], Tfreqs=Tfreqs_interested)
dist_dec = kw.calc_overschrijding(df_ext=data_pd_measext, rule_type=None, rule_value=None,
clip_physical_break=True, dist=dist_vali_dec, inverse=True,
interp_freqs=Tfreqs_interested)
df_interp = dist_dec['Geinterpoleerd']
df_interp.to_csv(os.path.join(dir_overschrijding, f'Deceedance_{current_station}.csv'), index=False, sep=';')

fig, ax = kw.plot_distributions(dist_dec, name=current_station, color_map='default')
fig, ax = kw.plot_overschrijding(dist_dec)
fig.savefig(os.path.join(dir_overschrijding, f'Deceedance_lines_{current_station}.png'))
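The new optional `dist` argument of `kw.calc_overschrijding()` is a plain dict of pre-filled distributions: each value is a DataFrame with 'values' and 'values_Tfreq' columns, and an entry under the key 'Hydra-NL' additionally takes part in the blending step (see `overschrijding.py` below). A hedged sketch of pre-filling it, with a hypothetical file path and the same cm-to-m conversion as the validation data above:

```python
import pandas as pd
import kenmerkendewaarden as kw

dist_vali_exc = {}
# hypothetical Hydra-NL export with 'values' and 'values_Tfreq' columns
df_hydra = pd.read_csv('Hydra-NL_HOEKVHLD.csv', sep=';')
df_hydra['values'] /= 100  # cm to m, mirroring the validation csv handling above
dist_vali_exc['Hydra-NL'] = df_hydra

# data_pd_measext: measured extremes (values + HWLWcode) as in the script above
dist_exc = kw.calc_overschrijding(df_ext=data_pd_measext, clip_physical_break=True,
                                  dist=dist_vali_exc, interp_freqs=[1/10, 1/100, 1/1000])
# returned keys: 'Hydra-NL', 'Ongefilterd', 'Trendanalyse', 'Weibull',
# 'Gecombineerd' and (since interp_freqs was given) 'Geinterpoleerd'
```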

173 changes: 111 additions & 62 deletions kenmerkendewaarden/overschrijding.py
@@ -10,13 +10,16 @@
from scipy import optimize, signal
from typing import Union, List
import datetime as dt
import os
import logging
from kenmerkendewaarden.data_retrieve import clip_timeseries_physical_break
from kenmerkendewaarden.utils import raise_extremes_with_aggers

__all__ = ["compute_overschrijding",
"interpolate_interested_Tfreqs",
"plot_distributions",
__all__ = ["calc_overschrijding",
"plot_overschrijding",
]

logger = logging.getLogger(__name__)


def get_threshold_rowidx(df):
# TODO: base on frequency or value?
@@ -25,30 +28,73 @@ def get_threshold_rowidx(df):
return rowidx_tresholdfreq


def compute_overschrijding(df_extrema, rule_type, rule_value, inverse=False):
def calc_overschrijding(df_ext:pd.DataFrame, dist:dict = None,
inverse:bool = False, clip_physical_break:bool = False,
rule_type:str = None, rule_value=None,
interp_freqs:list = None):
"""
Compute exceedance/deceedance frequencies based on measured extreme waterlevels.
Parameters
----------
df_ext : pd.DataFrame, optional
The timeseries of extremes (high and low waters). The default is None.
dist : dict, optional
A pre-filled dictionary with a Hydra-NL and/or validation distribution. The default is None.
inverse : bool, optional
Whether to compute deceedance instead of exceedance frequencies. The default is False.
clip_physical_break : bool, optional
Whether to exclude the part of the timeseries before physical breaks like estuary closures. The default is False.
rule_type : str, optional
break/linear/None, passed on to apply_trendanalysis(). The default is None.
rule_value : TYPE, optional
Value corresponding to rule_type. The default is None.
interp_freqs : list, optional
The frequencies to interpolate to, providing this will result in a
"Geinterpoleerd" key in the returned dictionary. The default is None.
Returns
-------
dist : dict
A dictionary with several distributions.
"""

raise_extremes_with_aggers(df_ext)
# take only high or low extremes
# TODO: this might not be useful in case of river discharge influenced stations where a filter is needed
if inverse:
df_extrema = df_ext.loc[df_ext['HWLWcode']!=1]
else:
df_extrema = df_ext.loc[df_ext['HWLWcode']==1]

if clip_physical_break:
df_extrema = clip_timeseries_physical_break(df_extrema)
df_extrema_clean = df_extrema.copy()[['values']] #drop all info but the values (times-idx, HWLWcode etc)
dist = {} #TODO: replace with pandas.DataFrame?

print('Calculate unfiltered distribution')
if dist is None:
dist = {}

logger.info('Calculate unfiltered distribution')
dist['Ongefilterd'] = distribution(df_extrema_clean, inverse=inverse)

# TODO: re-enable filter for river discharge peaks
"""# filtering is only applicable for stations with high river discharge influence, so disabled #TODO: ext is geschikt voor getij, maar bij hoge afvoergolf wil je alleen het echte extreem. Er is dan een treshold per station nodig, is nodig om de rivierafvoerpiek te kunnen duiden.
print('Calculate filtered distribution')
df_peaks, threshold, _ = hatyan.detect_peaks(df_extrema_clean)
logger.info('Calculate filtered distribution')
df_peaks, threshold, _ = detect_peaks(df_extrema_clean)
if metadata_station['apply_treshold']:
temp[metadata_station['id']] = threshold
df_extrema_filt = hatyan.filter_with_threshold(df_extrema_clean, df_peaks, threshold)
df_extrema_filt = filter_with_threshold(df_extrema_clean, df_peaks, threshold)
else:
df_extrema_filt = df_extrema_clean.copy()
dist['Gefilterd'] = hatyan.distribution(df_extrema_filt.copy())
dist['Gefilterd'] = distribution(df_extrema_filt.copy())
"""

print('Calculate filtered distribution with trendanalysis')
logger.info('Calculate filtered distribution with trendanalysis')
df_trend = apply_trendanalysis(df_extrema_clean, rule_type=rule_type, rule_value=rule_value)
dist['Trendanalyse'] = distribution(df_trend.copy(), inverse=inverse)

print('Fit Weibull to filtered distribution with trendanalysis')
logger.info('Fit Weibull to filtered distribution with trendanalysis')
idx_maxfreq_trend = get_threshold_rowidx(dist['Trendanalyse'])
treshold_value = dist['Trendanalyse'].iloc[idx_maxfreq_trend]['values']
treshold_Tfreq = dist['Trendanalyse'].iloc[idx_maxfreq_trend]['values_Tfreq']
@@ -57,12 +103,21 @@ def compute_overschrijding(df_extrema, rule_type, rule_value, inverse=False):
Tfreqs=np.logspace(-5, np.log10(treshold_Tfreq), 5000),
inverse=inverse)

print('Blend trend and weibull together') # and Hydra-NL
dist['Gecombineerd'] = blend_distributions(dist['Trendanalyse'].copy(),
dist['Weibull'].copy(),
#dist['Hydra-NL'].copy(),
)

if "Hydra-NL" in dist.keys():
logger.info('Blend trend, weibull and Hydra-NL')
# TODO: now based on hardcoded Hydra-NL dict key which is already part of the input dist dict, this is tricky
df_hydra = dist['Hydra-NL'].copy()
else:
logger.info('Blend trend and weibull')
df_hydra = None
dist['Gecombineerd'] = blend_distributions(df_trend=dist['Trendanalyse'].copy(),
df_weibull=dist['Weibull'].copy(),
df_hydra=df_hydra)

if interp_freqs is not None:
dist['Geinterpoleerd'] = interpolate_interested_Tfreqs(dist['Gecombineerd'], Tfreqs=interp_freqs)


"""
if row['apply_treshold']:
keys = list(dist.keys())
@@ -130,11 +185,8 @@ def detect_peaks_hkv(df: pd.DataFrame, window: int, inverse: bool = False, thres

peaks = np.ones(len(values)) * np.nan
t_peaks, dt_left_peaks, dt_right_peaks = peaks.copy(), peaks.copy(), peaks.copy()

if inverse:
print('Determining peaks (inverse)')
else:
print('Determining peaks')

logger.info(f'Determining peaks (inverse={inverse})')
peak_count = 0
while len(values_sorted) != 0:
_t, _p = times_sorted[0], values_sorted[0]
@@ -275,24 +327,16 @@ def apply_trendanalysis(df: pd.DataFrame, rule_type: str, rule_value: Union[floa
# that there is no linear trend at the latest time (so it works its way back
# in the past). rule_value should be entered as going forward in time
if rule_type == 'break':
#if not isinstance(rule_value,dt.datetime): #TODO: commented this since it did not work with strings, but indexing df did not work with dt.datetime() anymore. Fix and make more generic.
# raise Exception('rule_value should be of instance dt.datetime')
#return df[dt.datetime.strptime(rule_value, '%d-%M-%Y'):].copy()
return df[rule_value:].copy()
elif rule_type == 'linear':
df, rule_value = df.copy(), float(rule_value)
dx = np.array([rule_value*x.total_seconds()/(365*24*3600) for x in (df.index[-1] - df.index)])
df['values'] = df['values'] + dx
return df
elif (rule_type == '') or (rule_type is None):
elif rule_type is None:
return df.copy()
elif isinstance(rule_type, np.float):
if np.isnan(rule_type):
return df.copy()
else:
raise ValueError('Incorrect rule_type passed to function. Only break or linear are supported')
else:
raise ValueError('Incorrect rule_type passed to function. Only break or linear are supported')
raise ValueError(f'Incorrect rule_type="{rule_type}" passed to function. Only "break", "linear" or None are supported')
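The 'linear' rule assumes no trend correction at the most recent extreme and works its way back in time, as the comment above notes. A standalone numeric sketch of that correction, with synthetic data and the arithmetic copied from the function body:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({'values': [2.00, 2.10, 2.20]},
                  index=pd.to_datetime(['2000-01-01', '2010-01-01', '2020-01-01']))
rule_value = 0.003  # assumed linear trend in m/year

# years between each extreme and the most recent one, times the trend
dx = np.array([rule_value * x.total_seconds() / (365 * 24 * 3600)
               for x in (df.index[-1] - df.index)])
df['values'] = df['values'] + dx
# the newest value is unchanged (dx = 0 there); older values are raised to the
# present-day trend level: approximately [2.06, 2.13, 2.20]
```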


def blend_distributions(df_trend: pd.DataFrame, df_weibull: pd.DataFrame, df_hydra: pd.DataFrame = None) -> pd.DataFrame:
@@ -347,31 +391,47 @@
return df_blended


def plot_distributions(dist: dict, name: str,
keys: List[str] = None,
color_map: dict = None,
xlabel: str = 'Exceedance frequency [1/yrs]',
ylabel: str = 'Waterlevel [m]',
legend_loc: str = 'lower right'):
def plot_overschrijding(dist: dict):
"""
plot overschrijding/onderschrijding
Parameters
----------
dist : dict
Dictionary as returned from `kw.calc_overschrijding()`.
Returns
-------
fig : matplotlib.figure.Figure
Figure handle.
ax : matplotlib.axes._axes.Axes
Figure axis handle.
"""

station = dist["Ongefilterd"].attrs["station"]

if color_map=='default':
color_map = {'Ongefilterd': 'b', 'Gefilterd': 'orange', 'Trendanalyse': 'g',
'Weibull': 'r', 'Hydra-NL': 'm', 'Hydra-NL met modelonzekerheid': 'cyan',
'Gecombineerd': 'k'}
color_map = {'Ongefilterd': 'b', 'Gefilterd': 'orange', 'Trendanalyse': 'g',
'Weibull': 'r', 'Hydra-NL': 'm', 'Hydra-NL met modelonzekerheid': 'cyan',
'Gecombineerd': 'k', 'Geinterpoleerd': 'lime'}

fig, ax = plt.subplots(figsize=(8, 6))
if keys is None:
keys = list(dist.keys())
for k in keys:

for k in dist.keys():
c = color_map[k] if (color_map is not None) and (k in color_map.keys()) else None
if k=='Gecombineerd':
ax.plot(dist[k]['values_Tfreq'], dist[k]['values'], '--', label=k, c=c)
elif k=='Geinterpoleerd':
ax.plot(dist[k]['values_Tfreq'], dist[k]['values'], 'o', label=k, c=c, markersize=5)
else:
ax.plot(dist[k]['values_Tfreq'], dist[k]['values'], label=k, c=c)
ax.set_title(name)
ax.set_xlabel(xlabel), ax.set_xscale('log'), ax.set_xlim([1e-5, 1e3]), ax.invert_xaxis()
ax.set_ylabel(ylabel)
ax.legend(fontsize='medium', loc=legend_loc)

ax.set_title(f"Distribution for {station}")
ax.set_xlabel('Frequency [1/yrs]')
ax.set_xscale('log')
ax.set_xlim([1e-5, 1e3])
ax.invert_xaxis()
ax.set_ylabel("Waterlevel [m]")
ax.legend(fontsize='medium', loc='lower right')
ax.xaxis.set_minor_locator(ticker.LogLocator(base=10.0, subs=tuple(i / 10 for i in range(1, 10)), numticks=12))
ax.xaxis.set_minor_formatter(ticker.NullFormatter()),
ax.yaxis.set_minor_locator(ticker.MultipleLocator(0.1)) #this was 10, but now meters instead of cm
@@ -383,17 +443,6 @@ def plot_distributions(dist: dict, name: str,
return fig,ax


def interpolate_interested_Tfreqs_to_csv(df: pd.DataFrame, Tfreqs: List[float],
id: str, csv_dir: os.PathLike, prefix: str,) -> pd.DataFrame:
df_interp = pd.DataFrame(data={'values': np.interp(Tfreqs,
np.flip(df['values_Tfreq'].values),
np.flip(df['values'].values)),
'values_Tfreq': Tfreqs}).sort_values(by='values_Tfreq', ascending=False)
#prefix = os.path.basename(csv_dir)
df_interp.to_csv(os.path.join(csv_dir, f'{prefix}_{id}.csv'), index=False, sep=';')
return df_interp


def interpolate_interested_Tfreqs(df: pd.DataFrame, Tfreqs: List[float]) -> pd.DataFrame:
df_interp = pd.DataFrame(data={'values': np.interp(Tfreqs,
np.flip(df['values_Tfreq'].values),
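The retained interpolation helper (its tail is truncated in this view) is a thin wrapper around np.interp; the np.flip calls are needed because np.interp expects ascending x-coordinates, while the distributions are stored with descending values_Tfreq. A small equivalent sketch with synthetic numbers:

```python
import numpy as np
import pandas as pd

# a distribution stored with descending frequencies, as in this module
df = pd.DataFrame({'values_Tfreq': [1.0, 0.1, 0.01],
                   'values': [1.5, 2.5, 3.5]})
Tfreqs = [1/2, 1/20]  # frequencies of interest

values_interp = np.interp(Tfreqs,
                          np.flip(df['values_Tfreq'].values),  # ascending x
                          np.flip(df['values'].values))
# values_interp -> approximately [2.06, 3.06] m at 1/2 and 1/20 per year
```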
