diff --git a/examples/KWK_process.py b/examples/KWK_process.py
index 2888037..f2a1ce2 100644
--- a/examples/KWK_process.py
+++ b/examples/KWK_process.py
@@ -16,15 +16,9 @@
 # TODO: HW/LW numbers not always increasing (at havengetallen): ['HANSWT','BROUWHVSGT08','PETTZD','DORDT']
 # overview in https://github.com/Deltares-research/kenmerkendewaarden/issues/101 and the linked wm-ws-dl issue
-tstart_dt = pd.Timestamp(2011,1,1, tz="UTC+01:00")
-tstop_dt = pd.Timestamp(2021,1,1, tz="UTC+01:00")
-if ((tstop_dt.year-tstart_dt.year)==10) & (tstop_dt.month==tstop_dt.day==tstart_dt.month==tstart_dt.day==1):
-    year_slotgem = tstop_dt.year
-else:
-    year_slotgem = 'invalid'
+year_slotgem = 2021
 print(f'year_slotgem: {year_slotgem}')
 
-# dir_base = r'p:\11208031-010-kenmerkende-waarden-k\work'
 dir_base = r'p:\11210325-005-kenmerkende-waarden\work'
 dir_meas = os.path.join(dir_base,'measurements_wl_18700101_20240101')
@@ -84,32 +78,31 @@
     # timeseries are used for slotgemiddelden, gemgetijkrommen (needs slotgem+havget)
     df_meas_all = kw.read_measurements(dir_output=dir_meas, station=current_station, extremes=False,
                                        nap_correction=nap_correction, drop_duplicates=drop_duplicates)
-    if df_meas_all is not None:
-        #crop measurement data
-        df_meas_10y = hatyan.crop_timeseries(df_meas_all, times=slice(tstart_dt,tstop_dt-dt.timedelta(minutes=10)))
-
     # extremes are used for slotgemiddelden, havengetallen, overschrijding
-    df_ext_all = kw.read_measurements(dir_output=dir_meas, station=current_station, extremes=True,
+    df_ext_12345_all = kw.read_measurements(dir_output=dir_meas, station=current_station, extremes=True,
                                       nap_correction=nap_correction, drop_duplicates=drop_duplicates)
-    if df_ext_all is not None:
-        # TODO: make calc_HWLW12345to12() faster: https://github.com/Deltares/hatyan/issues/311
-        df_ext_all_12 = hatyan.calc_HWLW12345to12(df_ext_all) #convert 12345 to 12 by taking minimum of 345 as 2 (laagste laagwater)
-        #crop timeseries to 10y
-        df_ext_10y_12 = hatyan.crop_timeseries(df_ext_all_12, times=slice(tstart_dt,tstop_dt),onlyfull=False)
+    if df_meas_all is None or df_ext_12345_all is None:
+        raise ValueError(f"missing data for {current_station}")
+
+    # convert 12345 to 12 by taking minimum of 345 as 2 (laagste laagwater)
+    # TODO: make calc_HWLW12345to12() faster: https://github.com/Deltares/hatyan/issues/311
+    df_ext_all = hatyan.calc_HWLW12345to12(df_ext_12345_all)
+    # crop measurement data to (excluding) year_slotgem
+    df_meas_todate = df_meas_all.loc[:str(year_slotgem-1)]
+    df_ext_todate = df_ext_all.loc[:str(year_slotgem-1)]
 
     #### TIDAL INDICATORS
-    if compute_indicators and df_meas_all is not None and df_ext_all is not None:
+    if compute_indicators:
         print(f'tidal indicators for {current_station}')
         # compute and plot tidal indicators
-        dict_wltidalindicators = kw.calc_wltidalindicators(df_meas=df_meas_all, min_coverage=min_coverage)
-        dict_HWLWtidalindicators = kw.calc_HWLWtidalindicators(df_ext=df_ext_all_12, min_coverage=min_coverage)
+        dict_wltidalindicators = kw.calc_wltidalindicators(df_meas=df_meas_todate, min_coverage=min_coverage)
+        dict_HWLWtidalindicators = kw.calc_HWLWtidalindicators(df_ext=df_ext_todate, min_coverage=min_coverage)
 
         # add hat/lat
-        df_meas_19y = df_meas_all.loc["2001":"2019"]
-        hat, lat = kw.calc_hat_lat_frommeasurements(df_meas_19y)
+        hat, lat = kw.calc_hat_lat_frommeasurements(df_meas_todate)
         dict_HWLWtidalindicators["hat"] = hat
         dict_HWLWtidalindicators["lat"] = lat
@@ -131,17 +124,17 @@
     #### SLOTGEMIDDELDEN
     # TODO: nodal cycle is not in same phase for all stations, this is not physically correct.
     # TODO: more data is needed for proper working of fitting for some stations (2011: BAALHK, BRESKVHVN, GATVBSLE, SCHAARVDND)
-    if compute_slotgem and df_meas_all is not None and df_ext_all is not None:
+    if compute_slotgem:
         print(f'slotgemiddelden for {current_station}')
 
         # compute slotgemiddelden, exclude all values after tstop_dt (is year_slotgem)
         # including years with too little values and years before physical break
-        slotgemiddelden_all = kw.calc_slotgemiddelden(df_meas=df_meas_all.loc[:tstop_dt],
-                                                      df_ext=df_ext_all_12.loc[:tstop_dt],
+        slotgemiddelden_all = kw.calc_slotgemiddelden(df_meas=df_meas_todate,
+                                                      df_ext=df_ext_todate,
                                                       min_coverage=0, clip_physical_break=True)
         # only years with enough values and after potential physical break
-        slotgemiddelden_valid = kw.calc_slotgemiddelden(df_meas=df_meas_all.loc[:tstop_dt],
-                                                        df_ext=df_ext_all_12.loc[:tstop_dt],
+        slotgemiddelden_valid = kw.calc_slotgemiddelden(df_meas=df_meas_todate,
+                                                        df_ext=df_ext_todate,
                                                         min_coverage=min_coverage, clip_physical_break=True)
 
         # plot slotgemiddelden
@@ -180,9 +173,9 @@
 
     ### HAVENGETALLEN
-    if compute_havengetallen and df_ext_all is not None:
+    if compute_havengetallen:
         print(f'havengetallen for {current_station}')
-        df_havengetallen, df_HWLW = kw.calc_havengetallen(df_ext=df_ext_10y_12, return_df_ext=True)
+        df_havengetallen, df_HWLW = kw.calc_havengetallen(df_ext=df_ext_todate, return_df_ext=True)
 
         # plot hwlw per timeclass including median
         fig, axs = kw.plot_HWLW_pertimeclass(df_ext=df_HWLW, df_havengetallen=df_havengetallen)
@@ -200,18 +193,18 @@
 
     ##### GEMIDDELDE GETIJKROMMEN
-    if compute_gemgetij and df_meas_all is not None and df_ext_all is not None:
+    if compute_gemgetij:
         print(f'gemiddelde getijkrommen for {current_station}')
         pred_freq = "10s" # frequency influences the accuracy of havengetallen-scaling and is writing frequency of BOI timeseries
 
         # derive getijkrommes: raw, scaled to havengetallen, scaled to havengetallen and 12h25min period
-        gemgetij_raw = kw.calc_gemiddeldgetij(df_meas=df_meas_10y, df_ext=None,
+        gemgetij_raw = kw.calc_gemiddeldgetij(df_meas=df_meas_todate, df_ext=None,
                                               freq=pred_freq, nb=0, nf=0,
                                               scale_extremes=False, scale_period=False)
-        gemgetij_corr = kw.calc_gemiddeldgetij(df_meas=df_meas_10y, df_ext=df_ext_10y_12,
+        gemgetij_corr = kw.calc_gemiddeldgetij(df_meas=df_meas_todate, df_ext=df_ext_todate,
                                                freq=pred_freq, nb=1, nf=1,
                                                scale_extremes=True, scale_period=False)
-        gemgetij_corr_boi = kw.calc_gemiddeldgetij(df_meas=df_meas_10y, df_ext=df_ext_10y_12,
+        gemgetij_corr_boi = kw.calc_gemiddeldgetij(df_meas=df_meas_todate, df_ext=df_ext_todate,
                                                    freq=pred_freq, nb=0, nf=4,
                                                    scale_extremes=True, scale_period=True)
@@ -280,15 +273,12 @@ def add_validation_dist(dist_dict, dist_type, station):
 
     freqs_interested = [5, 2, 1, 1/2, 1/5, 1/10, 1/20, 1/50, 1/100, 1/200,
                         1/500, 1/1000, 1/2000, 1/4000, 1/5000, 1/10000]
-    if compute_overschrijding and df_ext_all is not None:
+    if compute_overschrijding:
         print(f'overschrijdingsfrequenties for {current_station}')
 
-        # only include data up to year_slotgem
-        df_measext = df_ext_all_12.loc[:tstop_dt]
-
         # 1. Exceedance
         dist_exc_hydra = initiate_dist_with_hydra_nl(station=current_station)
-        dist_exc = kw.calc_overschrijding(df_ext=df_measext, rule_type=None, rule_value=None,
+        dist_exc = kw.calc_overschrijding(df_ext=df_ext_todate, rule_type=None, rule_value=None,
                                           clip_physical_break=True, dist=dist_exc_hydra,
                                           interp_freqs=freqs_interested)
         add_validation_dist(dist_exc, dist_type='exceedance', station=current_station)
@@ -299,7 +289,7 @@ def add_validation_dist(dist_dict, dist_type, station):
         fig.savefig(os.path.join(dir_overschrijding, f'kw{year_slotgem}-exceedance-{current_station}.png'))
 
         # 2. Deceedance
-        dist_dec = kw.calc_overschrijding(df_ext=df_measext, rule_type=None, rule_value=None,
+        dist_dec = kw.calc_overschrijding(df_ext=df_ext_todate, rule_type=None, rule_value=None,
                                           clip_physical_break=True, inverse=True,
                                           interp_freqs=freqs_interested)
         add_validation_dist(dist_dec, dist_type='deceedance', station=current_station)
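A side note on the `.loc[:str(year_slotgem-1)]` cropping introduced above: partial-string indexing on a pandas `DatetimeIndex` is end-inclusive, so slicing up to the year before `year_slotgem` keeps all of 2020 and drops the slotgemiddelde year itself. A minimal sketch with synthetic data (series and variable names are illustrative, not taken from the script):

```python
import numpy as np
import pandas as pd

# hypothetical 10-minute waterlevel series ending at 2021-01-01 00:00
index = pd.date_range("2019-01-01", "2021-01-01", freq="10min", tz="UTC+01:00")
df = pd.DataFrame({"values": np.zeros(len(index))}, index=index)

year_slotgem = 2021
# partial-string slicing is end-inclusive: str(year_slotgem-1) keeps everything
# up to and including 2020-12-31 23:50 and drops all 2021 samples
df_todate = df.loc[:str(year_slotgem - 1)]
assert df_todate.index[-1] == pd.Timestamp("2020-12-31 23:50", tz="UTC+01:00")
```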
diff --git a/kenmerkendewaarden/gemiddeldgetij.py b/kenmerkendewaarden/gemiddeldgetij.py
index ff19364..df95da4 100644
--- a/kenmerkendewaarden/gemiddeldgetij.py
+++ b/kenmerkendewaarden/gemiddeldgetij.py
@@ -11,7 +11,9 @@
 from kenmerkendewaarden.tidalindicators import (calc_HWLWtidalrange,
                                                 calc_getijcomponenten)
 from kenmerkendewaarden.havengetallen import calc_havengetallen
-from kenmerkendewaarden.utils import TimeSeries_TimedeltaFormatter_improved
+from kenmerkendewaarden.utils import (crop_timeseries_last_nyears,
+                                      TimeSeries_TimedeltaFormatter_improved,
+                                      )
 from matplotlib.ticker import MaxNLocator, MultipleLocator
@@ -45,9 +47,11 @@ def calc_gemiddeldgetij(
     Parameters
     ----------
     df_meas : pd.DataFrame
-        Timeseries of 10 years of waterlevel measurements.
+        Timeseries of waterlevel measurements. The last 10 years of this
+        timeseries are used to compute the getijkrommes.
     df_ext : pd.DataFrame, optional
-        Timeseries of 10 years of waterlevel extremes (1/2 only). The default is None.
+        Timeseries of waterlevel extremes (1/2 only). The last 10 years of this
+        timeseries are used to compute the getijkrommes. The default is None.
     min_coverage : float, optional
         The minimal required coverage of the df_ext timeseries. Passed on to `calc_havengetallen()`. The default is None.
     freq : str, optional
@@ -69,10 +73,10 @@
     dictionary with Dataframes with gemiddeld getij for mean, spring and neap tide.
 
     """
-    data_pd_meas_10y = df_meas
+    df_meas_10y = crop_timeseries_last_nyears(df=df_meas, nyears=10)
     tstop_dt = df_meas.index.max()
 
-    current_station = data_pd_meas_10y.attrs["station"]
+    current_station = df_meas_10y.attrs["station"]
 
     # TODO: deprecate debug argument+plot (maybe use max HW instead of max tidalrange?)
     # TODO: add correctie havengetallen HW/LW av/sp/np met slotgemiddelde uit PLSS/modelfit (HW/LW av)
@@ -90,7 +94,8 @@
         station_attrs = [df.attrs["station"] for df in [df_meas, df_ext]]
         assert all(x == station_attrs[0] for x in station_attrs)
 
-        df_havengetallen = calc_havengetallen(df_ext=df_ext, min_coverage=min_coverage)
+        df_ext_10y = crop_timeseries_last_nyears(df_ext, nyears=10)
+        df_havengetallen = calc_havengetallen(df_ext=df_ext_10y, min_coverage=min_coverage)
         HW_sp, LW_sp = df_havengetallen.loc[
             0, ["HW_values_median", "LW_values_median"]
         ]  # spring
@@ -106,13 +111,13 @@
         HW_np = LW_np = None
 
     # derive components via TA on measured waterlevels
-    comp_frommeasurements_avg, comp_av = get_gemgetij_components(data_pd_meas_10y)
+    comp_frommeasurements_avg, comp_av = get_gemgetij_components(df_meas_10y)
 
     times_pred_1mnth = pd.date_range(
         start=pd.Timestamp(tstop_dt.year, 1, 1, 0, 0) - pd.Timedelta(hours=12),
         end=pd.Timestamp(tstop_dt.year, 2, 1, 0, 0),
         freq=freq,
-        tz=data_pd_meas_10y.index.tz,
+        tz=df_meas_10y.index.tz,
     )  # start 12 hours in advance, to assure also corrected values on desired tstart
     comp_av.attrs["nodalfactors"] = (
         False  # nodalfactors=False to guarantee repetative signal
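Because `calc_gemiddeldgetij()` now crops to the last 10 years internally, callers no longer have to pre-crop their input. A usage sketch, under the assumption that `df_meas` and `df_ext` hold the full measurement and extremes series as loaded in the example script above:

```python
import kenmerkendewaarden as kw

# the last 10 years of both inputs are selected internally via
# crop_timeseries_last_nyears(); df_ext may be None when scale_extremes=False
gemgetij_corr = kw.calc_gemiddeldgetij(df_meas=df_meas, df_ext=df_ext,
                                       freq="10s", nb=1, nf=1,
                                       scale_extremes=True, scale_period=False)
```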
diff --git a/kenmerkendewaarden/havengetallen.py b/kenmerkendewaarden/havengetallen.py
index c5927fa..f9b198d 100644
--- a/kenmerkendewaarden/havengetallen.py
+++ b/kenmerkendewaarden/havengetallen.py
@@ -17,6 +17,7 @@
 )
 from kenmerkendewaarden.utils import (
     raise_extremes_with_aggers,
+    crop_timeseries_last_nyears,
     TimeSeries_TimedeltaFormatter_improved,
 )
 from matplotlib.ticker import MultipleLocator
@@ -46,7 +47,8 @@ def calc_havengetallen(
     Parameters
     ----------
     df_ext : pd.DataFrame
-        DataFrame with extremes (highs and lows, no aggers).
+        DataFrame with extremes (highs and lows, no aggers). The last 10 years of this
+        timeseries are used to compute the havengetallen.
     return_df : bool
         Whether to return the enriched input dataframe. Default is False.
     min_coverage : float, optional
@@ -64,17 +66,18 @@
     df_havengetallen : pd.DataFrame
         DataFrame with havengetallen for all hour-classes. 0 corresponds to spring, 6 corresponds to neap, mean is mean.
-    df_ext : pd.DataFrame
+    df_ext_culm : pd.DataFrame
         An enriched copy of the input DataFrame including a 'culm_hr' column.
 
     """
     raise_extremes_with_aggers(df_ext)
+    df_ext_10y = crop_timeseries_last_nyears(df=df_ext, nyears=10)
 
     # check if coverage is high enough for havengetallen
     if min_coverage is not None:
         # TODO: compute_actual_counts only returns years for which there are no nans, so will have different length than expected counts if there is an all-nan year
         # TODO: if we supply 4 years of complete data instead of 10 years, no error is raised
-        data_pd_hw = df_ext.loc[df_ext["HWLWcode"] == 1]["values"]
+        data_pd_hw = df_ext_10y.loc[df_ext_10y["HWLWcode"] == 1]["values"]
         df_actual_counts_peryear = compute_actual_counts(data_pd_hw, freq="Y")
         df_expected_counts_peryear = compute_expected_counts(data_pd_hw, freq="Y")
         # floor expected counts to avoid rounding issues
@@ -94,13 +97,13 @@
             f"min_coverage={min_coverage}:\n{df_debug}"
         )
 
-    current_station = df_ext.attrs["station"]
+    current_station = df_ext_10y.attrs["station"]
     logger.info(f"computing havengetallen for {current_station}")
-    df_ext = calc_hwlw_moonculm_combi(df_ext=df_ext, moonculm_offset=moonculm_offset)
-    df_havengetallen = calc_HWLW_culmhr_summary(df_ext)  # TODO: maybe add tijverschil
+    df_ext_culm = calc_hwlw_moonculm_combi(df_ext=df_ext_10y, moonculm_offset=moonculm_offset)
+    df_havengetallen = calc_HWLW_culmhr_summary(df_ext_culm)  # TODO: maybe add tijverschil
     logger.info("computing havengetallen done")
     if return_df_ext:
-        return df_havengetallen, df_ext
+        return df_havengetallen, df_ext_culm
     else:
        return df_havengetallen
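Similarly, `calc_havengetallen()` now selects the last 10 years of the supplied extremes itself. A usage sketch, assuming `df_ext` is an extremes series (HWLWcode 1/2 only, no aggers) as produced in the example script with `hatyan.calc_HWLW12345to12()`:

```python
import kenmerkendewaarden as kw

# the last 10 years of df_ext are used internally; with return_df_ext=True the
# enriched copy (including the new 'culm_hr' column) is returned as well
df_havengetallen, df_ext_culm = kw.calc_havengetallen(df_ext=df_ext, return_df_ext=True)
```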
diff --git a/kenmerkendewaarden/tidalindicators.py b/kenmerkendewaarden/tidalindicators.py
index 04e77e4..e6cd204 100644
--- a/kenmerkendewaarden/tidalindicators.py
+++ b/kenmerkendewaarden/tidalindicators.py
@@ -9,7 +9,8 @@
 import matplotlib.pyplot as plt
 import hatyan
 import logging
-from kenmerkendewaarden.utils import raise_extremes_with_aggers
+from kenmerkendewaarden.utils import (raise_extremes_with_aggers,
+                                      crop_timeseries_last_nyears)
 
 
 __all__ = [
@@ -380,7 +381,7 @@
     return hat, lat
 
 
-def calc_hat_lat_frommeasurements(df_meas_19y: pd.DataFrame) -> tuple:
+def calc_hat_lat_frommeasurements(df_meas: pd.DataFrame) -> tuple:
     """
     Derive highest and lowest astronomical tide (HAT/LAT) from a measurement timeseries of 19 years.
     Tidal components are derived for each year of the measurement timeseries.
@@ -392,8 +393,9 @@
     Parameters
     ----------
-    df_meas_19y : pd.DataFrame
-        Measurements timeseries of 19 years.
+    df_meas : pd.DataFrame
+        Measurements timeseries. The last 19 years of this
+        timeseries are used to compute hat and lat.
 
     Returns
     -------
@@ -402,6 +404,7 @@
 
     """
+    df_meas_19y = crop_timeseries_last_nyears(df=df_meas, nyears=19)
     num_years = len(df_meas_19y.index.year.unique())
     if num_years != 19:
         raise ValueError(
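The HAT/LAT helper follows the same pattern, but is stricter: it crops to the last 19 years and raises if the result does not cover exactly 19 distinct years. A usage sketch, assuming `df_meas` is a sufficiently long measurement series as returned by `kw.read_measurements()`:

```python
import kenmerkendewaarden as kw

# crops to the last 19 years internally; raises a ValueError when the cropped
# series does not contain exactly 19 distinct years
hat, lat = kw.calc_hat_lat_frommeasurements(df_meas)
```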
""" raise_extremes_with_aggers(df_ext) + df_ext_10y = crop_timeseries_last_nyears(df=df_ext, nyears=10) # check if coverage is high enough for havengetallen if min_coverage is not None: # TODO: compute_actual_counts only returns years for which there are no nans, so will have different length than expected counts if there is an all-nan year # TODO: if we supply 4 years of complete data instead of 10 years, no error is raised - data_pd_hw = df_ext.loc[df_ext["HWLWcode"] == 1]["values"] + data_pd_hw = df_ext_10y.loc[df_ext_10y["HWLWcode"] == 1]["values"] df_actual_counts_peryear = compute_actual_counts(data_pd_hw, freq="Y") df_expected_counts_peryear = compute_expected_counts(data_pd_hw, freq="Y") # floor expected counts to avoid rounding issues @@ -94,13 +97,13 @@ def calc_havengetallen( f"min_coverage={min_coverage}:\n{df_debug}" ) - current_station = df_ext.attrs["station"] + current_station = df_ext_10y.attrs["station"] logger.info(f"computing havengetallen for {current_station}") - df_ext = calc_hwlw_moonculm_combi(df_ext=df_ext, moonculm_offset=moonculm_offset) - df_havengetallen = calc_HWLW_culmhr_summary(df_ext) # TODO: maybe add tijverschil + df_ext_culm = calc_hwlw_moonculm_combi(df_ext=df_ext_10y, moonculm_offset=moonculm_offset) + df_havengetallen = calc_HWLW_culmhr_summary(df_ext_culm) # TODO: maybe add tijverschil logger.info("computing havengetallen done") if return_df_ext: - return df_havengetallen, df_ext + return df_havengetallen, df_ext_culm else: return df_havengetallen diff --git a/kenmerkendewaarden/tidalindicators.py b/kenmerkendewaarden/tidalindicators.py index 04e77e4..e6cd204 100644 --- a/kenmerkendewaarden/tidalindicators.py +++ b/kenmerkendewaarden/tidalindicators.py @@ -9,7 +9,8 @@ import matplotlib.pyplot as plt import hatyan import logging -from kenmerkendewaarden.utils import raise_extremes_with_aggers +from kenmerkendewaarden.utils import (raise_extremes_with_aggers, + crop_timeseries_last_nyears) __all__ = [ @@ -380,7 +381,7 @@ def calc_hat_lat_fromcomponents(comp: pd.DataFrame) -> tuple: return hat, lat -def calc_hat_lat_frommeasurements(df_meas_19y: pd.DataFrame) -> tuple: +def calc_hat_lat_frommeasurements(df_meas: pd.DataFrame) -> tuple: """ Derive highest and lowest astronomical tide (HAT/LAT) from a measurement timeseries of 19 years. Tidal components are derived for each year of the measurement timeseries. @@ -392,8 +393,9 @@ def calc_hat_lat_frommeasurements(df_meas_19y: pd.DataFrame) -> tuple: Parameters ---------- - df_meas_19y : pd.DataFrame - Measurements timeseries of 19 years. + df_meas : pd.DataFrame + Measurements timeseries. The last 19 years of this + timeseries are used to compute hat and lat. 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 40b41de..be9a0f3 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -5,17 +5,21 @@
 @author: veenstra
 """
 import pytest
-from kenmerkendewaarden.utils import raise_extremes_with_aggers
+from kenmerkendewaarden.utils import (raise_extremes_with_aggers,
+                                      crop_timeseries_last_nyears)
+import pandas as pd
+import numpy as np
 
 
 @pytest.mark.unittest
 def test_raise_extremes_with_aggers_raise_12345df(df_ext):
     with pytest.raises(ValueError) as e:
         raise_extremes_with_aggers(df_ext)
-    assert (
+    expected_error = (
         "df_ext should only contain extremes (HWLWcode 1/2), "
         "but it also contains aggers (HWLWcode 3/4/5"
-    ) in str(e.value)
+    )
+    assert expected_error in str(e.value)
 
 
 @pytest.mark.unittest
@@ -23,6 +27,32 @@ def test_raise_extremes_with_aggers_pass_12df(df_ext_12_2010):
     raise_extremes_with_aggers(df_ext_12_2010)
 
 
+@pytest.mark.unittest
+def test_crop_timeseries_last_nyears(df_meas):
+    assert df_meas.index[0] == pd.Timestamp("1987-01-01 00:00:00+01:00")
+    assert df_meas.index[-1] == pd.Timestamp("2022-01-01 00:00:00+01:00")
+    assert len(df_meas) == 1840897
+
+    df_meas_10y = crop_timeseries_last_nyears(df_meas, nyears=10)
+    # assert number of years
+    num_years = len(df_meas_10y.index.year.unique())
+    assert num_years == 10
+    # assert start/end timestamps and length
+    assert df_meas_10y.index[0] == pd.Timestamp("2012-01-01 00:00:00+01:00")
+    assert df_meas_10y.index[-1] == pd.Timestamp("2021-12-31 23:50:00+01:00")
+    assert len(df_meas_10y) == 526032
+    # assert on years
+    actual_years = df_meas_10y.index.year.drop_duplicates().to_numpy()
+    expected_years = np.arange(2012, 2021+1)
+    assert (actual_years == expected_years).all()
+
+
+@pytest.mark.unittest
+def test_crop_timeseries_last_nyears_warning_tooshort(df_meas_2010_2014, caplog):
+    crop_timeseries_last_nyears(df_meas_2010_2014, nyears=10)
+    assert "requested 10 years but resulted in 5" in caplog.text
+
+
 @pytest.mark.unittest
 def test_raise_extremes_with_aggers_emptydf():
     import pandas as pd