Skip to content

Commit

Permalink
97 improve docstrings for public functions (#98)
Browse files Browse the repository at this point in the history
* docstrings for data_retrieve.py

* add docstrings to data_analysis.py

* added docstrings for tidalindicators.py

* added docstrings for havengetallen.py
  • Loading branch information
veenstrajelmer authored Jun 24, 2024
1 parent 36ed77a commit 65adefe
Show file tree
Hide file tree
Showing 6 changed files with 277 additions and 73 deletions.
83 changes: 79 additions & 4 deletions kenmerkendewaarden/data_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,26 @@
logger = logging.getLogger(__name__)


def plot_measurements_amount(df, relative=False):
def plot_measurements_amount(df:pd.DataFrame, relative:bool = False):
"""
Read the measurements amount csv and generate a pcolormesh figure of all years and stations.
The colors indicate the absolute or relative number of measurements per year.
Parameters
----------
df : pd.DataFrame
Dataframe with the amount of measurements for several years per station.
relative : bool, optional
Whether to scale the amount of measurements with the median of all measurement amounts for the same year. The default is False.
Returns
-------
fig : matplotlib.figure.Figure
Figure handle.
ax : matplotlib.axes._axes.Axes
Figure axis handle.
"""
df = df.copy()
df[df==0] = np.nan

Expand All @@ -45,7 +64,25 @@ def plot_measurements_amount(df, relative=False):
return fig, ax


def plot_measurements(df, df_ext=None):
def plot_measurements(df:pd.DataFrame, df_ext:pd.DataFrame = None):
"""
Generate a timeseries figure for the measurement timeseries (and extremes) of this station.
Parameters
----------
df : pd.DataFrame
Dataframe with the measurement timeseries for a particular station.
df_ext : pd.DataFrame, optional
Dataframe with the measurement extremes for a particular station.
Returns
-------
fig : matplotlib.figure.Figure
Figure handle.
ax : tuple of matplotlib.axes._axes.Axes
Figure axis handles (ax1, ax2).
"""
station_df = df.attrs["station"]
if df_ext is not None:
station_df_ext = df_ext.attrs["station"]
Expand Down Expand Up @@ -83,7 +120,27 @@ def plot_measurements(df, df_ext=None):
return fig, (ax1,ax2)


def plot_stations(station_list, crs=None, add_labels=False):
def plot_stations(station_list:list, crs:int = None, add_labels:bool = False):
"""
Plot the stations by subsetting a ddlpy catalog with the provided list of stations.
Parameters
----------
station_list : list
List of stations to plot the locations from.
crs : int, optional
Coordinate reference system, for instance 28992. The coordinates retrieved from the DDL will be converted to this EPSG. The default is None.
add_labels : bool, optional
Whether to add station code labels in the figure, useful for debugging. The default is False.
Returns
-------
fig : matplotlib.figure.Figure
Figure handle.
ax : matplotlib.axes._axes.Axes
Figure axis handle.
"""
locs_meas_ts_all, locs_meas_ext_all, _ = retrieve_catalog(crs=crs)
locs_ts = locs_meas_ts_all.loc[locs_meas_ts_all.index.isin(station_list)]
locs_ext = locs_meas_ext_all.loc[locs_meas_ext_all.index.isin(station_list)]
Expand Down Expand Up @@ -189,7 +246,25 @@ def get_stats_from_dataframe(df):
return ds_stats


def derive_statistics(dir_output, station_list, extremes):
def derive_statistics(dir_output:str, station_list:list, extremes:bool):
"""
Derive several statistics for the measurements of each station in the list.
Parameters
----------
dir_output : str
Path where the measurement netcdf file will be stored.
station_list : list
List of station names to derive statistics for, for instance ["HOEKVHLD"].
extremes : bool
Whether to derive statistics from waterlevel timeseries or extremes.
Returns
-------
data_summary : pd.DataFrame
A dataframe with several statistics for each station from the provided list.
"""
row_list = []
for current_station in station_list:
logger.info(f'deriving statistics for {current_station} (extremes={extremes})')
Expand Down
90 changes: 86 additions & 4 deletions kenmerkendewaarden/data_retrieve.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,28 @@ def check_locations_amount(locations):
raise ValueError(f"multiple stations present after station subsetting:\n{locations}")


def retrieve_measurements_amount(dir_output, station_list, extremes:bool, start_date, end_date):
def retrieve_measurements_amount(dir_output:str, station_list:list, extremes:bool, start_date:pd.Timestamp, end_date:pd.Timestamp):
"""
Retrieve the amount of measurements or extremes for each station in the list from the DDL with ddlpy.
Parameters
----------
dir_output : str
Path where the measurements amount csv file will be stored.
station_list : list
List of station names, for instance ["HOEKVHLD"].
extremes : bool
Whether to read measurements for waterlevel timeseries or extremes.
start_date : pd.Timestamp (or anything understood by pd.Timestamp)
start date of the measurements to be retrieved.
end_date : pd.Timestamp (or anything understood by pd.Timestamp)
end date of the measurements to be retrieved.
Returns
-------
None
"""
locs_meas_ts, locs_meas_ext, locs_meas_exttype = retrieve_catalog()

if extremes:
Expand Down Expand Up @@ -130,7 +151,23 @@ def retrieve_measurements_amount(dir_output, station_list, extremes:bool, start_
df_amount.to_csv(file_csv_amount)


def read_measurements_amount(dir_output, extremes:bool):
def read_measurements_amount(dir_output:str, extremes:bool):
"""
Read the measurements amount csv into a dataframe.
Parameters
----------
dir_output : str
Path where the measurements are stored.
extremes : bool
Whether to read measurements amount for waterlevel timeseries or extremes.
Returns
-------
df_amount : pd.DataFrame
DataFrame with the amount of measurements per year.
"""
if extremes:
fname = DICT_FNAMES['amount_ext']
else:
Expand All @@ -146,7 +183,30 @@ def read_measurements_amount(dir_output, extremes:bool):
return df_amount


def retrieve_measurements(dir_output:str, station:str, extremes:bool, start_date, end_date, drop_if_constant=None):
def retrieve_measurements(dir_output:str, station:str, extremes:bool, start_date:pd.Timestamp, end_date:pd.Timestamp, drop_if_constant:list = None):
"""
Retrieve timeseries with measurements or extremes for a single station from the DDL with ddlpy.
Parameters
----------
dir_output : str
Path where the measurement netcdf file will be stored.
station : str
station name, for instance "HOEKVHLD".
extremes : bool
Whether to read measurements for waterlevel timeseries or extremes.
start_date : pd.Timestamp (or anything understood by pd.Timestamp)
start date of the measurements to be retrieved.
end_date : pd.Timestamp (or anything understood by pd.Timestamp)
end date of the measurements to be retrieved.
drop_if_constant : list, optional
A list of columns to drop if the row values are constant, to save disk space. The default is None.
Returns
-------
None
"""

locs_meas_ts, locs_meas_ext, locs_meas_exttype = retrieve_catalog()

Expand Down Expand Up @@ -225,7 +285,29 @@ def xarray_to_hatyan(ds):
return df


def read_measurements(dir_output:str, station:str, extremes:bool, return_xarray=False, nap_correction=False):
def read_measurements(dir_output:str, station:str, extremes:bool, return_xarray:bool = False, nap_correction:bool = False):
"""
Read the measurements netcdf as a dataframe.
Parameters
----------
dir_output : str
Path where the measurements are stored.
station : str
station name, for instance "HOEKVHLD".
extremes : bool
Whether to read measurements for waterlevel timeseries or extremes.
return_xarray : bool, optional
Whether to return raw xarray.Dataset instead of a DataFrame. The default is False.
nap_correction : bool, optional
Whether to correct for NAP2005. The default is False.
Returns
-------
df_meas : pd.DataFrame
DataFrame with the measurements or extremes timeseries.
"""

if extremes:
fname = DICT_FNAMES["meas_ext"].format(station=station)
Expand Down
42 changes: 38 additions & 4 deletions kenmerkendewaarden/havengetallen.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,8 +49,8 @@ def calc_havengetallen(df_ext:pd.DataFrame, return_df_ext=False, min_coverage=No
df_havengetallen : pd.DataFrame
DataFrame with havengetallen for all hour-classes.
0 corresponds to spring, 6 corresponds to neap, mean is mean.
return_df_ext : pd.DataFrame
An enriched copy of the input DataFrame, mainly for plotting.
df_ext : pd.DataFrame
An enriched copy of the input DataFrame including a 'culm_hr' column.
"""
raise_extremes_with_aggers(df_ext)
Expand Down Expand Up @@ -174,7 +174,25 @@ def calc_HWLW_culmhr_summary_tidalcoeff(df_ext):
return HWLW_culmhr_summary


def plot_HWLW_pertimeclass(df_ext, df_havengetallen):
def plot_HWLW_pertimeclass(df_ext:pd.DataFrame, df_havengetallen:pd.DataFrame):
"""
Plot the extremes for each hour-class, including a median line.
Parameters
----------
df_ext : pd.DataFrame
DataFrame with measurement extremes, as provided by `kw.calc_havengetallen()`.
df_havengetallen : pd.DataFrame
DataFrame with havengetallen for all hour-classes, as provided by `kw.calc_havengetallen()`.
Returns
-------
fig : matplotlib.figure.Figure
Figure handle.
ax : matplotlib.axes._axes.Axes
Figure axis handle.
"""

assert 'culm_hr' in df_ext.columns

Expand Down Expand Up @@ -204,7 +222,23 @@ def plot_HWLW_pertimeclass(df_ext, df_havengetallen):
return fig, axs


def plot_aardappelgrafiek(df_havengetallen):
def plot_aardappelgrafiek(df_havengetallen:pd.DataFrame):
"""
Plot the median values of each hour-class in an aardappelgrafiek.
Parameters
----------
df_havengetallen : pd.DataFrame
DataFrame with havengetallen for all hour-classes, as provided by `kw.calc_havengetallen()`.
Returns
-------
fig : matplotlib.figure.Figure
Figure handle.
ax : matplotlib.axes._axes.Axes
Figure axis handle.
"""
# remove mean column
HWLW_culmhr_summary = df_havengetallen.loc[:11].copy()

Expand Down
13 changes: 7 additions & 6 deletions kenmerkendewaarden/overschrijding.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from matplotlib import ticker
from scipy import optimize, signal
from typing import Union, List
import datetime as dt
import logging
from kenmerkendewaarden.data_retrieve import clip_timeseries_physical_break
from kenmerkendewaarden.utils import raise_extremes_with_aggers
Expand All @@ -30,7 +29,7 @@ def get_threshold_rowidx(df):

def calc_overschrijding(df_ext:pd.DataFrame, dist:dict = None,
inverse:bool = False, clip_physical_break:bool = False,
rule_type:str = None, rule_value=None,
rule_type:str = None, rule_value:(pd.Timestamp, float) = None,
interp_freqs:list = None):
"""
Compute exceedance/deceedance frequencies based on measured extreme waterlevels.
Expand All @@ -47,8 +46,9 @@ def calc_overschrijding(df_ext:pd.DataFrame, dist:dict = None,
Whether to exclude the part of the timeseries before physical breaks like estuary closures. The default is False.
rule_type : str, optional
break/linear/None, passed on to apply_trendanalysis(). The default is None.
rule_value : TYPE, optional
Value corresponding to rule_type. The default is None.
rule_value : (pd.Timestamp, float), optional
Value corresponding to rule_type, pd.Timestamp (or anything understood by pd.Timestamp)
in case of rule_type='break', float in case of rule_type='linear'. The default is None.
interp_freqs : list, optional
The frequencies to interpolate to, providing this will result in a
"Geinterpoleerd" key in the returned dictionary. The default is None.
Expand Down Expand Up @@ -321,15 +321,16 @@ def get_total_years(df: pd.DataFrame) -> float:
return (df.index[-1] - df.index[0]).total_seconds() / (3600 * 24 * 365)


def apply_trendanalysis(df: pd.DataFrame, rule_type: str, rule_value: Union[float, dt.datetime]):
def apply_trendanalysis(df: pd.DataFrame, rule_type: str, rule_value: Union[pd.Timestamp, float]):
# There are 2 rule types: - break -> Values before break are removed
# - linear -> Values are increased/lowered based on value in value/year. It is assumed
# that there is no linear trend at the latest time (so it works its way back
# in the past). rule_value should be entered as going forward in time
if rule_type == 'break':
return df[rule_value:].copy()
elif rule_type == 'linear':
df, rule_value = df.copy(), float(rule_value)
rule_value = float(rule_value)
df = df.copy()
dx = np.array([rule_value*x.total_seconds()/(365*24*3600) for x in (df.index[-1] - df.index)])
df['values'] = df['values'] + dx
return df
Expand Down
Loading

0 comments on commit 65adefe

Please sign in to comment.