97 improve docstrings for public functions (#98)

* docstrings for data_retrieve.py * add docstrings to data_analysis.py * added docstrings for tidalindicators.py * added docstrings for havengetallen.py
Deltares-research · Jun 24, 2024 · 65adefe · 65adefe
1 parent 36ed77a
commit 65adefe
Show file tree

Hide file tree

Showing 6 changed files with 277 additions and 73 deletions.
diff --git a/kenmerkendewaarden/data_analysis.py b/kenmerkendewaarden/data_analysis.py
@@ -22,7 +22,26 @@
 logger = logging.getLogger(__name__)
 
 
-def plot_measurements_amount(df, relative=False):
+def plot_measurements_amount(df:pd.DataFrame, relative:bool = False):
+    """
+    Generate a pcolormesh figure of the amount of measurements for all years and stations.
+    The colors indicate the absolute or relative number of measurements per year.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Dataframe with the amount of measurements for several years per station.
+    relative : bool, optional
+        Whether to scale the amount of measurements with the median of all measurement amounts for the same year. The default is False.
+
+    Returns
+    -------
+    fig : matplotlib.figure.Figure
+        Figure handle.
+    ax : matplotlib.axes._axes.Axes
+        Figure axis handle.
+
+    """
     df = df.copy()
     df[df==0] = np.nan
 
@@ -45,7 +64,25 @@ def plot_measurements_amount(df, relative=False):
     return fig, ax
 
 
-def plot_measurements(df, df_ext=None):
+def plot_measurements(df:pd.DataFrame, df_ext:pd.DataFrame = None):
+    """
+    Generate a timeseries figure for the measurement timeseries (and extremes) of this station.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Dataframe with the measurement timeseries for a particular station.
+    df_ext : pd.DataFrame, optional
+        Dataframe with the measurement extremes for a particular station.
+
+    Returns
+    -------
+    fig : matplotlib.figure.Figure
+        Figure handle.
+    axs : tuple of matplotlib.axes._axes.Axes
+        Figure axis handles (ax1, ax2).
+
+    """
     station_df = df.attrs["station"]
     if df_ext is not None:
         station_df_ext = df_ext.attrs["station"]
@@ -83,7 +120,27 @@ def plot_measurements(df, df_ext=None):
     return fig, (ax1,ax2)
 
 
-def plot_stations(station_list, crs=None, add_labels=False):
+def plot_stations(station_list:list, crs:int = None, add_labels:bool = False):
+    """
+    Plot the stations by subsetting a ddlpy catalog with the provided list of stations.
+
+    Parameters
+    ----------
+    station_list : list
+        List of stations to plot the locations from.
+    crs : int, optional
+        Coordinate reference system, for instance 28992. The coordinates retrieved from the DDL will be converted to this EPSG. The default is None.
+    add_labels : bool, optional
+        Whether to add station code labels in the figure, useful for debugging. The default is False.
+
+    Returns
+    -------
+    fig : matplotlib.figure.Figure
+        Figure handle.
+    ax : matplotlib.axes._axes.Axes
+        Figure axis handle.
+
+    """
     locs_meas_ts_all, locs_meas_ext_all, _ = retrieve_catalog(crs=crs)
     locs_ts = locs_meas_ts_all.loc[locs_meas_ts_all.index.isin(station_list)]
     locs_ext = locs_meas_ext_all.loc[locs_meas_ext_all.index.isin(station_list)]
@@ -189,7 +246,25 @@ def get_stats_from_dataframe(df):
     return ds_stats
 
 
-def derive_statistics(dir_output, station_list, extremes):
+def derive_statistics(dir_output:str, station_list:list, extremes:bool):
+    """
+    Derive several statistics for the measurements of each station in the list.
+
+    Parameters
+    ----------
+    dir_output : str
+        Path where the measurement netcdf file will be stored.
+    station_list : list
+        list of station names to derive statistics for, for instance ["HOEKVHLD"].
+    extremes : bool
+        Whether to derive statistics from waterlevel timeseries or extremes.
+
+    Returns
+    -------
+    data_summary : pd.DataFrame
+        A dataframe with several statistics for each station from the provided list.
+
+    """
     row_list = []
     for current_station in station_list:
         logger.info(f'deriving statistics for {current_station} (extremes={extremes})')

diff --git a/kenmerkendewaarden/data_retrieve.py b/kenmerkendewaarden/data_retrieve.py
@@ -86,7 +86,28 @@ def check_locations_amount(locations):
         raise ValueError(f"multiple stations present after station subsetting:\n{locations}")
 
 
-def retrieve_measurements_amount(dir_output, station_list, extremes:bool, start_date, end_date):
+def retrieve_measurements_amount(dir_output:str, station_list:list, extremes:bool, start_date:pd.Timestamp, end_date:pd.Timestamp):
+    """
+    Retrieve the amount of measurements or extremes for each station in the list from the DDL with ddlpy.
+
+    Parameters
+    ----------
+    dir_output : str
+        Path where the measurements amount csv file will be stored.
+    station_list : list
+        list of station names to retrieve the measurements amount for, for instance ["HOEKVHLD"].
+    extremes : bool
+        Whether to read measurements for waterlevel timeseries or extremes.
+    start_date : pd.Timestamp (or anything understood by pd.Timestamp)
+        start date of the measurements to be retrieved.
+    end_date : pd.Timestamp (or anything understood by pd.Timestamp)
+        end date of the measurements to be retrieved.
+
+    Returns
+    -------
+    None
+
+    """
     locs_meas_ts, locs_meas_ext, locs_meas_exttype = retrieve_catalog()
 
     if extremes:
@@ -130,7 +151,23 @@ def retrieve_measurements_amount(dir_output, station_list, extremes:bool, start_
     df_amount.to_csv(file_csv_amount)
 
 
-def read_measurements_amount(dir_output, extremes:bool):
+def read_measurements_amount(dir_output:str, extremes:bool):
+    """
+    Read the measurements amount csv into a dataframe.
+
+    Parameters
+    ----------
+    dir_output : str
+        Path where the measurements are stored.
+    extremes : bool
+        Whether to read measurements amount for waterlevel timeseries or extremes.
+
+    Returns
+    -------
+    df_amount : pd.DataFrame
+        DataFrame with the amount of measurements per year.
+
+    """
     if extremes:
         fname = DICT_FNAMES['amount_ext']
     else:
@@ -146,7 +183,30 @@ def read_measurements_amount(dir_output, extremes:bool):
     return df_amount
 
 
-def retrieve_measurements(dir_output:str, station:str, extremes:bool, start_date, end_date, drop_if_constant=None):
+def retrieve_measurements(dir_output:str, station:str, extremes:bool, start_date:pd.Timestamp, end_date:pd.Timestamp, drop_if_constant:list = None):
+    """
+    Retrieve timeseries with measurements or extremes for a single station from the DDL with ddlpy.
+
+    Parameters
+    ----------
+    dir_output : str
+        Path where the measurement netcdf file will be stored.
+    station : str
+        station name, for instance "HOEKVHLD".
+    extremes : bool
+        Whether to read measurements for waterlevel timeseries or extremes.
+    start_date : pd.Timestamp (or anything understood by pd.Timestamp)
+        start date of the measurements to be retrieved.
+    end_date : pd.Timestamp (or anything understood by pd.Timestamp)
+        end date of the measurements to be retrieved.
+    drop_if_constant : list, optional
+        A list of columns to drop if the row values are constant, to save disk space. The default is None.
+
+    Returns
+    -------
+    None
+
+    """
 
     locs_meas_ts, locs_meas_ext, locs_meas_exttype = retrieve_catalog()
 
@@ -225,7 +285,29 @@ def xarray_to_hatyan(ds):
     return df
 
 
-def read_measurements(dir_output:str, station:str, extremes:bool, return_xarray=False, nap_correction=False):
+def read_measurements(dir_output:str, station:str, extremes:bool, return_xarray:bool = False, nap_correction:bool = False):
+    """
+    Read the measurements netcdf as a dataframe.
+
+    Parameters
+    ----------
+    dir_output : str
+        Path where the measurements are stored.
+    station : str
+        station name, for instance "HOEKVHLD".
+    extremes : bool
+        Whether to read measurements for waterlevel timeseries or extremes.
+    return_xarray : bool, optional
+        Whether to return raw xarray.Dataset instead of a DataFrame. The default is False.
+    nap_correction : bool, optional
+        Whether to correct for NAP2005. The default is False.
+
+    Returns
+    -------
+    df_meas : pd.DataFrame
+        DataFrame with the measurements or extremes timeseries.
+
+    """
 
     if extremes:
         fname = DICT_FNAMES["meas_ext"].format(station=station)

diff --git a/kenmerkendewaarden/havengetallen.py b/kenmerkendewaarden/havengetallen.py
@@ -49,8 +49,8 @@ def calc_havengetallen(df_ext:pd.DataFrame, return_df_ext=False, min_coverage=No
     df_havengetallen : pd.DataFrame
         DataFrame with havengetallen for all hour-classes. 
         0 corresponds to spring, 6 corresponds to neap, mean is mean.
-    return_df_ext : pd.DataFrame
-        An enriched copy of the input DataFrame, mainly for plotting.
+    df_ext : pd.DataFrame
+        An enriched copy of the input DataFrame including a 'culm_hr' column.
 
     """
     raise_extremes_with_aggers(df_ext)
@@ -174,7 +174,25 @@ def calc_HWLW_culmhr_summary_tidalcoeff(df_ext):
     return HWLW_culmhr_summary
 
 
-def plot_HWLW_pertimeclass(df_ext, df_havengetallen):
+def plot_HWLW_pertimeclass(df_ext:pd.DataFrame, df_havengetallen:pd.DataFrame):
+    """
+    Plot the extremes for each hour-class, including a median line.
+
+    Parameters
+    ----------
+    df_ext : pd.DataFrame
+        DataFrame with measurement extremes, as provided by `kw.calc_havengetallen()`.
+    df_havengetallen : pd.DataFrame
+        DataFrame with havengetallen for all hour-classes, as provided by `kw.calc_havengetallen()`.
+
+    Returns
+    -------
+    fig : matplotlib.figure.Figure
+        Figure handle.
+    ax : matplotlib.axes._axes.Axes
+        Figure axis handle.
+
+    """
 
     assert 'culm_hr' in df_ext.columns
 
@@ -204,7 +222,23 @@ def plot_HWLW_pertimeclass(df_ext, df_havengetallen):
     return fig, axs
 
 
-def plot_aardappelgrafiek(df_havengetallen):
+def plot_aardappelgrafiek(df_havengetallen:pd.DataFrame):
+    """
+    Plot the median values of each hour-class in an aardappelgrafiek.
+
+    Parameters
+    ----------
+    df_havengetallen : pd.DataFrame
+        DataFrame with havengetallen for all hour-classes, as provided by `kw.calc_havengetallen()`.
+
+    Returns
+    -------
+    fig : matplotlib.figure.Figure
+        Figure handle.
+    ax : matplotlib.axes._axes.Axes
+        Figure axis handle.
+
+    """
     # remove mean column
     HWLW_culmhr_summary = df_havengetallen.loc[:11].copy()
 

diff --git a/kenmerkendewaarden/overschrijding.py b/kenmerkendewaarden/overschrijding.py
@@ -9,7 +9,6 @@
 from matplotlib import ticker
 from scipy import optimize, signal
 from typing import Union, List
-import datetime as dt
 import logging
 from kenmerkendewaarden.data_retrieve import clip_timeseries_physical_break
 from kenmerkendewaarden.utils import raise_extremes_with_aggers
@@ -30,7 +29,7 @@ def get_threshold_rowidx(df):
 
 def calc_overschrijding(df_ext:pd.DataFrame, dist:dict = None, 
                         inverse:bool = False, clip_physical_break:bool = False, 
-                        rule_type:str = None, rule_value=None,
+                        rule_type:str = None, rule_value:(pd.Timestamp, float) = None,
                         interp_freqs:list = None):
     """
     Compute exceedance/deceedance frequencies based on measured extreme waterlevels.
@@ -47,8 +46,9 @@ def calc_overschrijding(df_ext:pd.DataFrame, dist:dict = None,
         Whether to exclude the part of the timeseries before physical breaks like estuary closures. The default is False.
     rule_type : str, optional
         break/linear/None, passed on to apply_trendanalysis(). The default is None.
-    rule_value : TYPE, optional
-        Value corresponding to rule_type. The default is None.
+    rule_value : (pd.Timestamp, float), optional
+        Value corresponding to rule_type, pd.Timestamp (or anything understood by pd.Timestamp) 
+        in case of rule_type='break', float in case of rule_type='linear'. The default is None.
     interp_freqs : list, optional
         The frequencies to interpolate to, providing this will result in a 
         "Geinterpoleerd" key in the returned dictionary. The default is None.
@@ -321,15 +321,16 @@ def get_total_years(df: pd.DataFrame) -> float:
     return (df.index[-1] - df.index[0]).total_seconds() / (3600 * 24 * 365)
 
 
-def apply_trendanalysis(df: pd.DataFrame, rule_type: str, rule_value: Union[float, dt.datetime]):
+def apply_trendanalysis(df: pd.DataFrame, rule_type: str, rule_value: Union[pd.Timestamp, float]):
     # There are 2 rule types:  - break -> Values before break are removed
     #                          - linear -> Values are increased/lowered based on value in value/year. It is assumed
     #                                      that there is no linear trend at the latest time (so it works its way back
     #                                      in the past). rule_value should be entered as going forward in time
     if rule_type == 'break':
         return df[rule_value:].copy()
     elif rule_type == 'linear':
-        df, rule_value = df.copy(), float(rule_value)
+        rule_value = float(rule_value)
+        df = df.copy()
         dx = np.array([rule_value*x.total_seconds()/(365*24*3600) for x in (df.index[-1] - df.index)])
         df['values'] = df['values'] + dx
         return df