Skip to content

Commit

Permalink
Pass complete measurement timeseries throughout code (#168)
Browse files Browse the repository at this point in the history
* refactored code so we can always pass around measurements todate

* added new `crop_timeseries_last_nyears` function including warning if too short

* added tests for new function
  • Loading branch information
veenstrajelmer authored Oct 25, 2024
1 parent 97c0c3a commit 32ce563
Show file tree
Hide file tree
Showing 6 changed files with 120 additions and 61 deletions.
68 changes: 29 additions & 39 deletions examples/KWK_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,9 @@
# TODO: HW/LW numbers not always increasing (at havengetallen): ['HANSWT','BROUWHVSGT08','PETTZD','DORDT']
# overview in https://github.com/Deltares-research/kenmerkendewaarden/issues/101 and the linked wm-ws-dl issue

tstart_dt = pd.Timestamp(2011,1,1, tz="UTC+01:00")
tstop_dt = pd.Timestamp(2021,1,1, tz="UTC+01:00")
if ((tstop_dt.year-tstart_dt.year)==10) & (tstop_dt.month==tstop_dt.day==tstart_dt.month==tstart_dt.day==1):
year_slotgem = tstop_dt.year
else:
year_slotgem = 'invalid'
year_slotgem = 2021
print(f'year_slotgem: {year_slotgem}')

# dir_base = r'p:\11208031-010-kenmerkende-waarden-k\work'
dir_base = r'p:\11210325-005-kenmerkende-waarden\work'
dir_meas = os.path.join(dir_base,'measurements_wl_18700101_20240101')

Expand Down Expand Up @@ -84,32 +78,31 @@
# timeseries are used for slotgemiddelden, gemgetijkrommen (needs slotgem+havget)
df_meas_all = kw.read_measurements(dir_output=dir_meas, station=current_station, extremes=False,
nap_correction=nap_correction, drop_duplicates=drop_duplicates)
if df_meas_all is not None:
#crop measurement data
df_meas_10y = hatyan.crop_timeseries(df_meas_all, times=slice(tstart_dt,tstop_dt-dt.timedelta(minutes=10)))

# extremes are used for slotgemiddelden, havengetallen, overschrijding
df_ext_all = kw.read_measurements(dir_output=dir_meas, station=current_station, extremes=True,
df_ext_12345_all = kw.read_measurements(dir_output=dir_meas, station=current_station, extremes=True,
nap_correction=nap_correction, drop_duplicates=drop_duplicates)
if df_ext_all is not None:
# TODO: make calc_HWLW12345to12() faster: https://github.com/Deltares/hatyan/issues/311
df_ext_all_12 = hatyan.calc_HWLW12345to12(df_ext_all) #convert 12345 to 12 by taking minimum of 345 as 2 (laagste laagwater)
#crop timeseries to 10y
df_ext_10y_12 = hatyan.crop_timeseries(df_ext_all_12, times=slice(tstart_dt,tstop_dt),onlyfull=False)
if df_meas_all is None or df_ext_12345_all is None:
raise ValueError(f"missing data for {current_station}")

# convert 12345 to 12 by taking minimum of 345 as 2 (laagste laagwater)
# TODO: make calc_HWLW12345to12() faster: https://github.com/Deltares/hatyan/issues/311
df_ext_all = hatyan.calc_HWLW12345to12(df_ext_12345_all)

# crop measurement data to (excluding) year_slotgem
df_meas_todate = df_meas_all.loc[:str(year_slotgem-1)]
df_ext_todate = df_ext_all.loc[:str(year_slotgem-1)]



#### TIDAL INDICATORS
if compute_indicators and df_meas_all is not None and df_ext_all is not None:
if compute_indicators:
print(f'tidal indicators for {current_station}')
# compute and plot tidal indicators
dict_wltidalindicators = kw.calc_wltidalindicators(df_meas=df_meas_all, min_coverage=min_coverage)
dict_HWLWtidalindicators = kw.calc_HWLWtidalindicators(df_ext=df_ext_all_12, min_coverage=min_coverage)
dict_wltidalindicators = kw.calc_wltidalindicators(df_meas=df_meas_todate, min_coverage=min_coverage)
dict_HWLWtidalindicators = kw.calc_HWLWtidalindicators(df_ext=df_ext_todate, min_coverage=min_coverage)

# add hat/lat
df_meas_19y = df_meas_all.loc["2001":"2019"]
hat, lat = kw.calc_hat_lat_frommeasurements(df_meas_19y)
hat, lat = kw.calc_hat_lat_frommeasurements(df_meas_todate)
dict_HWLWtidalindicators["hat"] = hat
dict_HWLWtidalindicators["lat"] = lat

Expand All @@ -131,17 +124,17 @@
#### SLOTGEMIDDELDEN
# TODO: nodal cycle is not in same phase for all stations, this is not physically correct.
# TODO: more data is needed for proper working of fitting for some stations (2011: BAALHK, BRESKVHVN, GATVBSLE, SCHAARVDND)
if compute_slotgem and df_meas_all is not None and df_ext_all is not None:
if compute_slotgem:
print(f'slotgemiddelden for {current_station}')

# compute slotgemiddelden, exclude all values after tstop_dt (is year_slotgem)
# including years with too little values and years before physical break
slotgemiddelden_all = kw.calc_slotgemiddelden(df_meas=df_meas_all.loc[:tstop_dt],
df_ext=df_ext_all_12.loc[:tstop_dt],
slotgemiddelden_all = kw.calc_slotgemiddelden(df_meas=df_meas_todate,
df_ext=df_ext_todate,
min_coverage=0, clip_physical_break=True)
# only years with enough values and after potential physical break
slotgemiddelden_valid = kw.calc_slotgemiddelden(df_meas=df_meas_all.loc[:tstop_dt],
df_ext=df_ext_all_12.loc[:tstop_dt],
slotgemiddelden_valid = kw.calc_slotgemiddelden(df_meas=df_meas_todate,
df_ext=df_ext_todate,
min_coverage=min_coverage, clip_physical_break=True)

# plot slotgemiddelden
Expand Down Expand Up @@ -180,9 +173,9 @@


### HAVENGETALLEN
if compute_havengetallen and df_ext_all is not None:
if compute_havengetallen:
print(f'havengetallen for {current_station}')
df_havengetallen, df_HWLW = kw.calc_havengetallen(df_ext=df_ext_10y_12, return_df_ext=True)
df_havengetallen, df_HWLW = kw.calc_havengetallen(df_ext=df_ext_todate, return_df_ext=True)

# plot hwlw per timeclass including median
fig, axs = kw.plot_HWLW_pertimeclass(df_ext=df_HWLW, df_havengetallen=df_havengetallen)
Expand All @@ -200,18 +193,18 @@


##### GEMIDDELDE GETIJKROMMEN
if compute_gemgetij and df_meas_all is not None and df_ext_all is not None:
if compute_gemgetij:
print(f'gemiddelde getijkrommen for {current_station}')
pred_freq = "10s" # frequency influences the accuracy of havengetallen-scaling and is writing frequency of BOI timeseries

# derive getijkrommes: raw, scaled to havengetallen, scaled to havengetallen and 12h25min period
gemgetij_raw = kw.calc_gemiddeldgetij(df_meas=df_meas_10y, df_ext=None,
gemgetij_raw = kw.calc_gemiddeldgetij(df_meas=df_meas_todate, df_ext=None,
freq=pred_freq, nb=0, nf=0,
scale_extremes=False, scale_period=False)
gemgetij_corr = kw.calc_gemiddeldgetij(df_meas=df_meas_10y, df_ext=df_ext_10y_12,
gemgetij_corr = kw.calc_gemiddeldgetij(df_meas=df_meas_todate, df_ext=df_ext_todate,
freq=pred_freq, nb=1, nf=1,
scale_extremes=True, scale_period=False)
gemgetij_corr_boi = kw.calc_gemiddeldgetij(df_meas=df_meas_10y, df_ext=df_ext_10y_12,
gemgetij_corr_boi = kw.calc_gemiddeldgetij(df_meas=df_meas_todate, df_ext=df_ext_todate,
freq=pred_freq, nb=0, nf=4,
scale_extremes=True, scale_period=True)

Expand Down Expand Up @@ -280,15 +273,12 @@ def add_validation_dist(dist_dict, dist_type, station):
freqs_interested = [5, 2, 1, 1/2, 1/5, 1/10, 1/20, 1/50, 1/100, 1/200,
1/500, 1/1000, 1/2000, 1/4000, 1/5000, 1/10000]

if compute_overschrijding and df_ext_all is not None:
if compute_overschrijding:
print(f'overschrijdingsfrequenties for {current_station}')

# only include data up to year_slotgem
df_measext = df_ext_all_12.loc[:tstop_dt]

# 1. Exceedance
dist_exc_hydra = initiate_dist_with_hydra_nl(station=current_station)
dist_exc = kw.calc_overschrijding(df_ext=df_measext, rule_type=None, rule_value=None,
dist_exc = kw.calc_overschrijding(df_ext=df_ext_todate, rule_type=None, rule_value=None,
clip_physical_break=True, dist=dist_exc_hydra,
interp_freqs=freqs_interested)
add_validation_dist(dist_exc, dist_type='exceedance', station=current_station)
Expand All @@ -299,7 +289,7 @@ def add_validation_dist(dist_dict, dist_type, station):
fig.savefig(os.path.join(dir_overschrijding, f'kw{year_slotgem}-exceedance-{current_station}.png'))

# 2. Deceedance
dist_dec = kw.calc_overschrijding(df_ext=df_measext, rule_type=None, rule_value=None,
dist_dec = kw.calc_overschrijding(df_ext=df_ext_todate, rule_type=None, rule_value=None,
clip_physical_break=True, inverse=True,
interp_freqs=freqs_interested)
add_validation_dist(dist_dec, dist_type='deceedance', station=current_station)
Expand Down
21 changes: 13 additions & 8 deletions kenmerkendewaarden/gemiddeldgetij.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@
from kenmerkendewaarden.tidalindicators import (calc_HWLWtidalrange,
calc_getijcomponenten)
from kenmerkendewaarden.havengetallen import calc_havengetallen
from kenmerkendewaarden.utils import TimeSeries_TimedeltaFormatter_improved
from kenmerkendewaarden.utils import (crop_timeseries_last_nyears,
TimeSeries_TimedeltaFormatter_improved,
)
from matplotlib.ticker import MaxNLocator, MultipleLocator


Expand Down Expand Up @@ -45,9 +47,11 @@ def calc_gemiddeldgetij(
Parameters
----------
df_meas : pd.DataFrame
Timeseries of 10 years of waterlevel measurements.
Timeseries of waterlevel measurements. The last 10 years of this
timeseries are used to compute the getijkrommes.
df_ext : pd.DataFrame, optional
Timeseries of 10 years of waterlevel extremes (1/2 only). The default is None.
Timeseries of waterlevel extremes (1/2 only). The last 10 years of this
timeseries are used to compute the getijkrommes. The default is None.
min_coverage : float, optional
The minimal required coverage of the df_ext timeseries. Passed on to `calc_havengetallen()`. The default is None.
freq : str, optional
Expand All @@ -69,10 +73,10 @@ def calc_gemiddeldgetij(
dictionary with Dataframes with gemiddeld getij for mean, spring and neap tide.
"""
data_pd_meas_10y = df_meas
df_meas_10y = crop_timeseries_last_nyears(df=df_meas, nyears=10)
tstop_dt = df_meas.index.max()

current_station = data_pd_meas_10y.attrs["station"]
current_station = df_meas_10y.attrs["station"]

# TODO: deprecate debug argument+plot (maybe use max HW instead of max tidalrange?)
# TODO: add correctie havengetallen HW/LW av/sp/np met slotgemiddelde uit PLSS/modelfit (HW/LW av)
Expand All @@ -90,7 +94,8 @@ def calc_gemiddeldgetij(
station_attrs = [df.attrs["station"] for df in [df_meas, df_ext]]
assert all(x == station_attrs[0] for x in station_attrs)

df_havengetallen = calc_havengetallen(df_ext=df_ext, min_coverage=min_coverage)
df_ext_10y = crop_timeseries_last_nyears(df_ext, nyears=10)
df_havengetallen = calc_havengetallen(df_ext=df_ext_10y, min_coverage=min_coverage)
HW_sp, LW_sp = df_havengetallen.loc[
0, ["HW_values_median", "LW_values_median"]
] # spring
Expand All @@ -106,13 +111,13 @@ def calc_gemiddeldgetij(
HW_np = LW_np = None

# derive components via TA on measured waterlevels
comp_frommeasurements_avg, comp_av = get_gemgetij_components(data_pd_meas_10y)
comp_frommeasurements_avg, comp_av = get_gemgetij_components(df_meas_10y)

times_pred_1mnth = pd.date_range(
start=pd.Timestamp(tstop_dt.year, 1, 1, 0, 0) - pd.Timedelta(hours=12),
end=pd.Timestamp(tstop_dt.year, 2, 1, 0, 0),
freq=freq,
tz=data_pd_meas_10y.index.tz,
tz=df_meas_10y.index.tz,
) # start 12 hours in advance, to assure also corrected values on desired tstart
comp_av.attrs["nodalfactors"] = (
False # nodalfactors=False to guarantee repetative signal
Expand Down
17 changes: 10 additions & 7 deletions kenmerkendewaarden/havengetallen.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
)
from kenmerkendewaarden.utils import (
raise_extremes_with_aggers,
crop_timeseries_last_nyears,
TimeSeries_TimedeltaFormatter_improved,
)
from matplotlib.ticker import MultipleLocator
Expand Down Expand Up @@ -46,7 +47,8 @@ def calc_havengetallen(
Parameters
----------
df_ext : pd.DataFrame
DataFrame with extremes (highs and lows, no aggers).
DataFrame with extremes (highs and lows, no aggers). The last 10 years of this
timeseries are used to compute the havengetallen.
return_df : bool
Whether to return the enriched input dataframe. Default is False.
min_coverage : float, optional
Expand All @@ -64,17 +66,18 @@ def calc_havengetallen(
df_havengetallen : pd.DataFrame
DataFrame with havengetallen for all hour-classes.
0 corresponds to spring, 6 corresponds to neap, mean is mean.
df_ext : pd.DataFrame
df_ext_culm : pd.DataFrame
An enriched copy of the input DataFrame including a 'culm_hr' column.
"""
raise_extremes_with_aggers(df_ext)
df_ext_10y = crop_timeseries_last_nyears(df=df_ext, nyears=10)

# check if coverage is high enough for havengetallen
if min_coverage is not None:
# TODO: compute_actual_counts only returns years for which there are no nans, so will have different length than expected counts if there is an all-nan year
# TODO: if we supply 4 years of complete data instead of 10 years, no error is raised
data_pd_hw = df_ext.loc[df_ext["HWLWcode"] == 1]["values"]
data_pd_hw = df_ext_10y.loc[df_ext_10y["HWLWcode"] == 1]["values"]
df_actual_counts_peryear = compute_actual_counts(data_pd_hw, freq="Y")
df_expected_counts_peryear = compute_expected_counts(data_pd_hw, freq="Y")
# floor expected counts to avoid rounding issues
Expand All @@ -94,13 +97,13 @@ def calc_havengetallen(
f"min_coverage={min_coverage}:\n{df_debug}"
)

current_station = df_ext.attrs["station"]
current_station = df_ext_10y.attrs["station"]
logger.info(f"computing havengetallen for {current_station}")
df_ext = calc_hwlw_moonculm_combi(df_ext=df_ext, moonculm_offset=moonculm_offset)
df_havengetallen = calc_HWLW_culmhr_summary(df_ext) # TODO: maybe add tijverschil
df_ext_culm = calc_hwlw_moonculm_combi(df_ext=df_ext_10y, moonculm_offset=moonculm_offset)
df_havengetallen = calc_HWLW_culmhr_summary(df_ext_culm) # TODO: maybe add tijverschil
logger.info("computing havengetallen done")
if return_df_ext:
return df_havengetallen, df_ext
return df_havengetallen, df_ext_culm
else:
return df_havengetallen

Expand Down
11 changes: 7 additions & 4 deletions kenmerkendewaarden/tidalindicators.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@
import matplotlib.pyplot as plt
import hatyan
import logging
from kenmerkendewaarden.utils import raise_extremes_with_aggers
from kenmerkendewaarden.utils import (raise_extremes_with_aggers,
crop_timeseries_last_nyears)


__all__ = [
Expand Down Expand Up @@ -380,7 +381,7 @@ def calc_hat_lat_fromcomponents(comp: pd.DataFrame) -> tuple:
return hat, lat


def calc_hat_lat_frommeasurements(df_meas_19y: pd.DataFrame) -> tuple:
def calc_hat_lat_frommeasurements(df_meas: pd.DataFrame) -> tuple:
"""
Derive highest and lowest astronomical tide (HAT/LAT) from a measurement timeseries of 19 years.
Tidal components are derived for each year of the measurement timeseries.
Expand All @@ -392,8 +393,9 @@ def calc_hat_lat_frommeasurements(df_meas_19y: pd.DataFrame) -> tuple:
Parameters
----------
df_meas_19y : pd.DataFrame
Measurements timeseries of 19 years.
df_meas : pd.DataFrame
Measurements timeseries. The last 19 years of this
timeseries are used to compute hat and lat.
Returns
-------
Expand All @@ -402,6 +404,7 @@ def calc_hat_lat_frommeasurements(df_meas_19y: pd.DataFrame) -> tuple:
"""

df_meas_19y = crop_timeseries_last_nyears(df=df_meas, nyears=19)
num_years = len(df_meas_19y.index.year.unique())
if num_years != 19:
raise ValueError(
Expand Down
28 changes: 28 additions & 0 deletions kenmerkendewaarden/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

import numpy as np
from matplotlib.ticker import Formatter
import logging

logger = logging.getLogger(__name__)


def raise_extremes_with_aggers(df_ext):
Expand All @@ -16,6 +19,31 @@ def raise_extremes_with_aggers(df_ext):
)


def crop_timeseries_last_nyears(df, nyears):
# remove last timestep if equal to "yyyy-01-01 00:00:00"
if '-01-01 00:00:00' in str(df.index[-1]):
df = df.iloc[:-1]

# last_year, for instance 2020
last_year = df.index[-1].year
# first_year, for instance 2011
first_year = last_year - (nyears-1)

df_10y = df.loc[str(first_year):str(last_year)]

# TODO: consider enforcing nyears instead of warning if it is not the case
# just like in `kw.calc_hat_lat_frommeasurements()`, but requires updates to tests
actual_years = df_10y.index.year.drop_duplicates().to_numpy()
is_exp_first = actual_years[0] == first_year
is_exp_last = actual_years[-1] == last_year
is_exp_amount = len(actual_years) == nyears
if not (is_exp_first & is_exp_last & is_exp_amount):
logger.warning(f"requested {nyears} years but resulted in "
f"{len(actual_years)}: {actual_years}")

return df_10y


# TODO: fixing display of negative timedeltas was requested in https://github.com/pandas-dev/pandas/issues/17232#issuecomment-2205579156
class TimeSeries_TimedeltaFormatter_improved(Formatter):
"""
Expand Down
Loading

0 comments on commit 32ce563

Please sign in to comment.