From e70132c7048fc916547e098d5e14a71197b4bf24 Mon Sep 17 00:00:00 2001 From: veenstrajelmer <60435591+veenstrajelmer@users.noreply.github.com> Date: Thu, 29 Aug 2024 16:33:09 +0200 Subject: [PATCH] 111 align dtypes of variables returned by different kw functions (#127) * reformatted code tidalindicators, added test for hwlw tidalindicators * converted overschrijding from dataframes to series * converted gemiddeldgetij from dataframes to series * reformatting code * renamed tidalindicators and slotgemiddelden series index from None to "period" * updated whatsnew --- docs/whats-new.md | 1 + examples/KWK_process.py | 12 +- kenmerkendewaarden/gemiddeldgetij.py | 49 +++-- kenmerkendewaarden/havengetallen.py | 5 +- kenmerkendewaarden/overschrijding.py | 243 +++++++++++++------------ kenmerkendewaarden/slotgemiddelden.py | 3 +- kenmerkendewaarden/tidalindicators.py | 160 ++++++++--------- tests/test_gemiddeldgetij.py | 40 ++--- tests/test_overschrijding.py | 60 ++----- tests/test_slotgemiddelden.py | 2 +- tests/test_tidalindicators.py | 249 ++++++-------------------- 11 files changed, 322 insertions(+), 502 deletions(-) diff --git a/docs/whats-new.md b/docs/whats-new.md index 1497309..29ebd36 100644 --- a/docs/whats-new.md +++ b/docs/whats-new.md @@ -24,6 +24,7 @@ - improvements to output csv files in [#109](https://github.com/Deltares-research/kenmerkendewaarden/pull/109) - drop duplicate times in `kw.read_measurements()` in [#116](https://github.com/Deltares-research/kenmerkendewaarden/pull/116) - use NETCDF4_CLASSIC format to reduce file sizes written by `kw.retrieve_measurements()` in [#119](https://github.com/Deltares-research/kenmerkendewaarden/pull/119) +- align output of all methods in [#127](https://github.com/Deltares-research/kenmerkendewaarden/pull/127) ### Fix - implemented workaround for pandas 2.2.0 with different rounding behaviour in [#69](https://github.com/Deltares-research/kenmerkendewaarden/pull/69) diff --git a/examples/KWK_process.py b/examples/KWK_process.py index fe82325..4ab95a1 100644 --- a/examples/KWK_process.py +++ b/examples/KWK_process.py @@ -259,10 +259,11 @@ def initiate_dist_with_hydra_nl(station): if os.path.exists(file_hydra_nl): df_hydra_nl = pd.read_table(file_hydra_nl, encoding='latin-1', decimal=',', header=0) df_hydra_nl.index = 1/df_hydra_nl['Terugkeertijd [jaar]'] + df_hydra_nl.index.name = 'frequency' df_hydra_nl['values'] = df_hydra_nl['Belastingniveau [m+NAP]/Golfparameter [m]/[s]/Sterkte bekleding [-]'] df_hydra_nl = df_hydra_nl[['values']] df_hydra_nl.attrs['station'] = station - dist_dict['Hydra-NL'] = df_hydra_nl + dist_dict['Hydra-NL'] = df_hydra_nl['values'] return dist_dict def add_validation_dist(dist_dict, dist_type, station): @@ -275,10 +276,11 @@ def add_validation_dist(dist_dict, dist_type, station): df_validation = df_validation.rename({"value":"values"},axis=1) df_validation['values'] /= 100 df_validation = df_validation.set_index("value_Tfreq", drop=True) + df_validation.index.name = 'frequency' df_validation.attrs['station'] = station - dist_dict['validation'] = df_validation + dist_dict['validation'] = df_validation['values'] - Tfreqs_interested = [5, 2, 1, 1/2, 1/5, 1/10, 1/20, 1/50, 1/100, 1/200, + freqs_interested = [5, 2, 1, 1/2, 1/5, 1/10, 1/20, 1/50, 1/100, 1/200, 1/500, 1/1000, 1/2000, 1/4000, 1/5000, 1/10000] if compute_overschrijding and data_pd_HWLW_all is not None: @@ -291,7 +293,7 @@ def add_validation_dist(dist_dict, dist_type, station): dist_exc_hydra = initiate_dist_with_hydra_nl(station=current_station) dist_exc = 
kw.calc_overschrijding(df_ext=data_pd_measext, rule_type=None, rule_value=None, clip_physical_break=True, dist=dist_exc_hydra, - interp_freqs=Tfreqs_interested) + interp_freqs=freqs_interested) add_validation_dist(dist_exc, dist_type='exceedance', station=current_station) dist_exc['Geinterpoleerd'].to_csv(os.path.join(dir_overschrijding, f'Exceedance_{current_station}.csv')) # dist_exc['Gecombineerd'].to_csv(os.path.join(dir_overschrijding, f'Exceedance_{current_station}_gecombineerd.csv')) @@ -303,7 +305,7 @@ def add_validation_dist(dist_dict, dist_type, station): # 2. Deceedance dist_dec = kw.calc_overschrijding(df_ext=data_pd_measext, rule_type=None, rule_value=None, clip_physical_break=True, inverse=True, - interp_freqs=Tfreqs_interested) + interp_freqs=freqs_interested) add_validation_dist(dist_dec, dist_type='deceedance', station=current_station) dist_dec['Geinterpoleerd'].to_csv(os.path.join(dir_overschrijding, f'Deceedance_{current_station}.csv')) # dist_dec['Gecombineerd'].to_csv(os.path.join(dir_overschrijding, f'Deceedance_{current_station}_gecombineerd.csv')) diff --git a/kenmerkendewaarden/gemiddeldgetij.py b/kenmerkendewaarden/gemiddeldgetij.py index 985db11..4851dbc 100644 --- a/kenmerkendewaarden/gemiddeldgetij.py +++ b/kenmerkendewaarden/gemiddeldgetij.py @@ -267,9 +267,9 @@ def calc_gemiddeldgetij( # combine in single dictionary gemgetij_dict = {} - gemgetij_dict["mean"] = prediction_av - gemgetij_dict["spring"] = prediction_sp - gemgetij_dict["neap"] = prediction_np + gemgetij_dict["mean"] = prediction_av['values'] + gemgetij_dict["spring"] = prediction_sp['values'] + gemgetij_dict["neap"] = prediction_np['values'] return gemgetij_dict @@ -310,21 +310,21 @@ def plot_gemiddeldgetij( prediction_av_raw = gemgetij_dict_raw["mean"] prediction_sp_raw = gemgetij_dict_raw["spring"] prediction_np_raw = gemgetij_dict_raw["neap"] - prediction_av_raw["values"].plot( + prediction_av_raw.plot( ax=ax, linestyle="--", color=cmap(0), linewidth=0.7, label="gemiddeldgetij mean (raw)", ) - prediction_sp_raw["values"].plot( + prediction_sp_raw.plot( ax=ax, linestyle="--", color=cmap(1), linewidth=0.7, label="gemiddeldgetij spring (raw)", ) - prediction_np_raw["values"].plot( + prediction_np_raw.plot( ax=ax, linestyle="--", color=cmap(2), @@ -335,11 +335,11 @@ def plot_gemiddeldgetij( prediction_av_corr = gemgetij_dict["mean"] prediction_sp_corr = gemgetij_dict["spring"] prediction_np_corr = gemgetij_dict["neap"] - prediction_av_corr["values"].plot(ax=ax, color=cmap(0), label="gemiddeldgetij mean") - prediction_sp_corr["values"].plot( + prediction_av_corr.plot(ax=ax, color=cmap(0), label="gemiddeldgetij mean") + prediction_sp_corr.plot( ax=ax, color=cmap(1), label="gemiddeldgetij spring" ) - prediction_np_corr["values"].plot(ax=ax, color=cmap(2), label="gemiddeldgetij neap") + prediction_np_corr.plot(ax=ax, color=cmap(2), label="gemiddeldgetij neap") ax.set_title(f"getijkrommes for {station}") ax.legend(loc=4) @@ -366,9 +366,9 @@ def get_gemgetij_components(data_pd_meas): # ============================================================================= # Hatyan analyse voor 10 jaar (alle componenten voor gemiddelde getijcyclus) #TODO: maybe use original 4y period/componentfile instead? 
SA/SM should come from 19y analysis # ============================================================================= - const_list = hatyan.get_const_list_hatyan( - "year" - ) # components should not be reduced, since higher harmonics are necessary + + # components should not be reduced, since higher harmonics are necessary + const_list = hatyan.get_const_list_hatyan("year") hatyan_settings_ana = dict( nodalfactors=True, fu_alltimes=False, @@ -430,16 +430,18 @@ def get_gemgetij_components(data_pd_meas): .str.isalpha() ) comp_iM = comp_frommeasurements_avg.loc[bool_endswithiM] - comp_av.loc[comp_higherharmonics, "A"] = np.sqrt( - (comp_iM["A"] ** 2).sum() - ) # kwadraatsom + # kwadraatsom + comp_av.loc[comp_higherharmonics, "A"] = np.sqrt((comp_iM["A"] ** 2).sum()) comp_av.loc["A0"] = comp_frommeasurements_avg.loc["A0"] + # values are different than 1991.0 document and differs per station while the + # document states "Zoals te verwachten is de verhouding per component tussen deze + # wortel en de oorspronkelijke amplitude voor alle plaatsen gelijk" logger.debug( "verhouding tussen originele en kwadratensom componenten:\n" f"{comp_av/comp_frommeasurements_avg.loc[components_av]}" - ) # values are different than 1991.0 document and differs per station while the document states "Zoals te verwachten is de verhouding per component tussen deze wortel en de oorspronkelijke amplitude voor alle plaatsen gelijk" + ) return comp_frommeasurements_avg, comp_av @@ -464,18 +466,16 @@ def reshape_signal(ts, ts_ext, HW_goal, LW_goal, tP_goal=None): bool_HW = ts_ext["HWLWcode"] == 1 idx_HW = np.where(bool_HW)[0] timesHW = ts_ext.index[idx_HW] - timesLW = ts_ext.index[ - idx_HW[:-1] + 1 - ] # assuming alternating 1,2,1 or 1,3,1, this is always valid in this workflow + # assuming alternating 1,2,1 or 1,3,1, this is always valid in this workflow + timesLW = ts_ext.index[idx_HW[:-1] + 1] # crop from first to last HW (rest is not scaled anyway) ts_time_firstHW = ts_ext[bool_HW].index[0] ts_time_lastHW = ts_ext[bool_HW].index[-1] ts_corr = ts.copy().loc[ts_time_firstHW:ts_time_lastHW] - ts_corr["timedelta"] = ( - ts_corr.index - ) # this is necessary since datetimeindex with freq is not editable, and Series is editable + # this is necessary since datetimeindex with freq is not editable, and Series is editable + ts_corr["timedelta"] = (ts_corr.index) for i in np.arange(0, len(timesHW) - 1): HW1_val = ts_corr.loc[timesHW[i], "values"] HW2_val = ts_corr.loc[timesHW[i + 1], "values"] @@ -492,9 +492,8 @@ def reshape_signal(ts, ts_ext, HW_goal, LW_goal, tP_goal=None): temp2 = ( ts_corr.loc[timesLW[i] : timesHW[i + 1], "values"] - LW_val ) / TR2_val * TR_goal + LW_goal - temp = pd.concat( - [temp1, temp2.iloc[1:]] - ) # .iloc[1:] since timesLW[i] is in both timeseries (values are equal) + # .iloc[1:] since timesLW[i] is in both timeseries (values are equal) + temp = pd.concat([temp1, temp2.iloc[1:]]) ts_corr["values_new"] = temp tide_HWtoHW = ts_corr.loc[timesHW[i] : timesHW[i + 1]] diff --git a/kenmerkendewaarden/havengetallen.py b/kenmerkendewaarden/havengetallen.py index ec43b53..38614b0 100644 --- a/kenmerkendewaarden/havengetallen.py +++ b/kenmerkendewaarden/havengetallen.py @@ -65,13 +65,14 @@ def calc_havengetallen( """ raise_extremes_with_aggers(df_ext) + ser_ext = df_ext["values"] # check if coverage is high enough for havengetallen if min_coverage is not None: # TODO: compute_actual_counts only returns years for which there are no nans, so will have different length than expected counts if there is an all-nan 
year # TODO: if we supply 4 years of complete data instead of 10 years, no error is raised - df_actual_counts_peryear = compute_actual_counts(df_ext, freq="Y") - df_expected_counts_peryear = compute_expected_counts(df_ext, freq="Y") + df_actual_counts_peryear = compute_actual_counts(ser_ext, freq="Y") + df_expected_counts_peryear = compute_expected_counts(ser_ext, freq="Y") df_min_counts_peryear = df_expected_counts_peryear * min_coverage bool_coverage_toolow = df_actual_counts_peryear < df_min_counts_peryear df_debug = pd.DataFrame( diff --git a/kenmerkendewaarden/overschrijding.py b/kenmerkendewaarden/overschrijding.py index 51dd26f..d5e8832 100644 --- a/kenmerkendewaarden/overschrijding.py +++ b/kenmerkendewaarden/overschrijding.py @@ -21,13 +21,20 @@ logger = logging.getLogger(__name__) -def get_threshold_rowidx(df): +def get_threshold_rowidx(ser): # TODO: base on frequency or value? thresholfreq = 3 # take a frequency that is at least higher than the max HYDRA frequency (which is 1) - rowidx_tresholdfreq = np.abs(df.index - thresholfreq).argmin() + rowidx_tresholdfreq = np.abs(ser.index - thresholfreq).argmin() return rowidx_tresholdfreq +def series_copy_properties(ser, ser_reference): + # copy attrs index name and values name from reference to series + ser.attrs = ser_reference.attrs + ser.index.name = ser_reference.index.name + ser.name = ser_reference.name + + def calc_overschrijding( df_ext: pd.DataFrame, dist: dict = None, @@ -76,38 +83,38 @@ def calc_overschrijding( if clip_physical_break: df_extrema = clip_timeseries_physical_break(df_extrema) - df_extrema_clean = df_extrema.copy()[ - ["values"] - ] # drop all info but the values (times-idx, HWLWcode etc) + # drop all info but the values (times-idx, HWLWcode etc) + ser_extrema = df_extrema.copy()["values"] if dist is None: dist = {} logger.info(f"Calculate unfiltered distribution (inverse={inverse})") - dist["Ongefilterd"] = distribution(df_extrema_clean, inverse=inverse) + dist["Ongefilterd"] = distribution(ser_extrema, inverse=inverse) # TODO: re-enable filter for river discharge peaks - """# filtering is only applicable for stations with high river discharge influence, so disabled #TODO: ext is geschikt voor getij, maar bij hoge afvoergolf wil je alleen het echte extreem. Er is dan een treshold per station nodig, is nodig om de rivierafvoerpiek te kunnen duiden. + #TODO: ext is geschikt voor getij, maar bij hoge afvoergolf wil je alleen het echte extreem. Er is dan een treshold per station nodig, is nodig om de rivierafvoerpiek te kunnen duiden. 
+ """# filtering is only applicable for stations with high river discharge influence, so disabled logger.info('Calculate filtered distribution') - df_peaks, threshold, _ = detect_peaks(df_extrema_clean) + ser_peaks, threshold, _ = detect_peaks(ser_extrema) if metadata_station['apply_treshold']: temp[metadata_station['id']] = threshold - df_extrema_filt = filter_with_threshold(df_extrema_clean, df_peaks, threshold) + ser_extrema_filt = filter_with_threshold(ser_extrema, ser_peaks, threshold) else: - df_extrema_filt = df_extrema_clean.copy() - dist['Gefilterd'] = distribution(df_extrema_filt.copy()) + ser_extrema_filt = ser_extrema.copy() + dist['Gefilterd'] = distribution(ser_extrema_filt.copy()) """ logger.info("Calculate filtered distribution with trendanalysis") - df_trend = apply_trendanalysis( - df_extrema_clean, rule_type=rule_type, rule_value=rule_value + ser_trend = apply_trendanalysis( + ser_extrema, rule_type=rule_type, rule_value=rule_value ) - dist["Trendanalyse"] = distribution(df_trend.copy(), inverse=inverse) + dist["Trendanalyse"] = distribution(ser_trend.copy(), inverse=inverse) logger.info("Fit Weibull to filtered distribution with trendanalysis") idx_maxfreq_trend = get_threshold_rowidx(dist["Trendanalyse"]) - treshold_value = dist["Trendanalyse"].iloc[idx_maxfreq_trend]["values"] - treshold_Tfreq = dist["Trendanalyse"].iloc[idx_maxfreq_trend].name + treshold_value = dist["Trendanalyse"].iloc[idx_maxfreq_trend] + treshold_Tfreq = dist["Trendanalyse"].index[idx_maxfreq_trend] dist["Weibull"] = get_weibull( dist["Trendanalyse"].copy(), threshold=treshold_value, @@ -118,14 +125,14 @@ def calc_overschrijding( if "Hydra-NL" in dist.keys(): logger.info("Blend trend, weibull and Hydra-NL") # TODO: now based on hardcoded Hydra-NL dict key which is already part of the input dist dict, this is tricky - df_hydra = dist["Hydra-NL"].copy() + ser_hydra = dist["Hydra-NL"].copy() else: logger.info("Blend trend and weibull") - df_hydra = None + ser_hydra = None dist["Gecombineerd"] = blend_distributions( - df_trend=dist["Trendanalyse"].copy(), - df_weibull=dist["Weibull"].copy(), - df_hydra=df_hydra, + ser_trend=dist["Trendanalyse"].copy(), + ser_weibull=dist["Weibull"].copy(), + ser_hydra=ser_hydra, ) if interp_freqs is not None: @@ -133,13 +140,6 @@ def calc_overschrijding( dist["Gecombineerd"], Tfreqs=interp_freqs ) - """ - if row['apply_treshold']: - keys = list(dist.keys()) - else: - keys = [x for x in list(dist.keys()) if x != 'Gefilterd'] - """ - return dist @@ -188,6 +188,7 @@ def check_peakside(values, _i, multiplier, window, threshold): def detect_peaks_hkv( df: pd.DataFrame, window: int, inverse: bool = False, threshold: float = None ) -> pd.DataFrame: + # TODO: still to be converted from dataframe to series _df = df.copy() if inverse: _df["values"] = -_df["values"] @@ -273,37 +274,38 @@ def detect_peaks_hkv( def distribution( - df: pd.DataFrame, - col: str = None, + ser: pd.Series, c: float = -0.3, d: float = 0.4, inverse: bool = False, -) -> pd.DataFrame: - col = df.columns[0] if col is None else col - years = get_total_years(df) +) -> pd.Series: + years = get_total_years(ser) if inverse: - df = df.sort_values(by=col, ascending=False) + ser = ser.sort_values(ascending=False) else: - df = df.sort_values(by=col) - rank = np.array(range(len(df[col]))) + 1 - df.index = (1 - (rank + c) / (len(rank) + d)) * (len(rank) / years) - df_sorted = df.sort_index(ascending=False) - - return df_sorted + ser = ser.sort_values() + rank = np.array(range(len(ser))) + 1 + ser.index = (1 - (rank + 
c) / (len(rank) + d)) * (len(rank) / years) + ser_sorted = ser.sort_index(ascending=False) + + series_copy_properties(ser=ser_sorted, ser_reference=ser) + ser_sorted.index.name = "frequency" + return ser_sorted def get_weibull( - df: pd.DataFrame, + ser: pd.Series, threshold: float, Tfreqs: np.ndarray, col: str = None, inverse: bool = False, -) -> pd.DataFrame: - values = df["values"].values +) -> pd.Series: + + values = ser.values if inverse: values = -values threshold = -threshold - p_val_gt_threshold = df.index[values > threshold][0] + p_val_gt_threshold = ser.index[values > threshold][0] def pfunc(x, p_val_gt_threshold, threshold, sigma, alpha): return p_val_gt_threshold * np.exp( @@ -347,52 +349,48 @@ def cost_func(params, *args): new_values = pfunc_inverse(Tfreqs, p_val_gt_threshold, threshold, sigma, alpha) if inverse: new_values = -new_values - pd_return = pd.DataFrame(data={"values": new_values}, index=Tfreqs).sort_index( - ascending=False - ) + ser_weibull = pd.Series(new_values, index=Tfreqs).sort_index(ascending=False) - # copy attributes - pd_return.attrs = df.attrs - return pd_return + series_copy_properties(ser=ser_weibull, ser_reference=ser) + return ser_weibull def filter_with_threshold( - df_raw: pd.DataFrame, - df_filtered: pd.DataFrame, + ser_raw: pd.Series, + ser_filtered: pd.Series, threshold: float, inverse: bool = False, -) -> pd.DataFrame: +) -> pd.Series: if inverse: return pd.concat( [ - df_raw[df_raw["values"] >= threshold], - df_filtered[df_filtered["values"] < threshold], + ser_raw[ser_raw >= threshold], + ser_filtered[ser_filtered < threshold], ], axis=0, ).sort_index() else: return pd.concat( [ - df_raw[df_raw["values"] <= threshold], - df_filtered[df_filtered["values"] > threshold], + ser_raw[ser_raw <= threshold], + ser_filtered[ser_filtered > threshold], ], axis=0, ).sort_index() -def detect_peaks(df: pd.DataFrame, prominence: int = 10, inverse: bool = False): - df = df.copy() +def detect_peaks(ser: pd.Series, prominence: int = 10, inverse: bool = False): + ser = ser.copy() if inverse: - df["values"] = -1 * df["values"] - peak_indices = signal.find_peaks(df["values"].values, prominence=prominence)[0] - df_peaks = pd.DataFrame( - data={"values": df["values"].iloc[peak_indices]}, - index=df.iloc[peak_indices].index, - ) + ser = -1 * ser + peak_indices = signal.find_peaks(ser.values, prominence=prominence)[0] + ser_peaks = pd.Series(ser.iloc[peak_indices], + index=ser.iloc[peak_indices].index, + ) threshold = determine_threshold( - values=df["values"].values, peak_indices=peak_indices + values=ser.values, peak_indices=peak_indices ) - return df_peaks, threshold, peak_indices + return ser_peaks, threshold, peak_indices def determine_threshold(values: np.ndarray, peak_indices: np.ndarray) -> float: @@ -408,80 +406,82 @@ def determine_threshold(values: np.ndarray, peak_indices: np.ndarray) -> float: return threshold -def get_total_years(df: pd.DataFrame) -> float: - return (df.index[-1] - df.index[0]).total_seconds() / (3600 * 24 * 365) +def get_total_years(ser: pd.Series) -> float: + return (ser.index[-1] - ser.index[0]).total_seconds() / (3600 * 24 * 365) def apply_trendanalysis( - df: pd.DataFrame, rule_type: str, rule_value: Union[pd.Timestamp, float] + ser: pd.Series, rule_type: str, rule_value: Union[pd.Timestamp, float] ): # There are 2 rule types: - break -> Values before break are removed # - linear -> Values are increased/lowered based on value in value/year. 
It is assumes # that there is no linear trend at the latest time (so it works its way back # in the past). rule_value should be entered as going forward in time if rule_type == "break": - return df[rule_value:].copy() + ser_out = ser[rule_value:].copy() elif rule_type == "linear": rule_value = float(rule_value) - df = df.copy() + ser = ser.copy() dx = np.array( [ rule_value * x.total_seconds() / (365 * 24 * 3600) - for x in (df.index[-1] - df.index) + for x in (ser.index[-1] - ser.index) ] ) - df["values"] = df["values"] + dx - return df + ser = ser + dx + ser_out = ser elif rule_type is None: - return df.copy() + ser_out = ser.copy() else: raise ValueError( f'Incorrect rule_type="{rule_type}" passed to function. Only "break", "linear" or None are supported' ) + ser_out.index.name = "frequency" + return ser_out def blend_distributions( - df_trend: pd.DataFrame, df_weibull: pd.DataFrame, df_hydra: pd.DataFrame = None + ser_trend: pd.Series, ser_weibull: pd.Series, ser_hydra: pd.Series = None ) -> pd.DataFrame: # get and compare station attributes - df_list = [df_trend, df_weibull, df_hydra] - station_attrs = [df.attrs["station"] for df in df_list if df is not None] + ser_list = [ser_trend, ser_weibull, ser_hydra] + station_attrs = [ser.attrs["station"] for ser in ser_list if ser is not None] assert all(x == station_attrs[0] for x in station_attrs) - df_trend = df_trend.sort_index(ascending=False) - df_weibull = df_weibull.sort_index(ascending=False) + ser_trend = ser_trend.sort_index(ascending=False) + ser_weibull = ser_weibull.sort_index(ascending=False) # Trend to weibull - idx_maxfreq_trend = get_threshold_rowidx(df_trend) - df_blended1 = df_trend.iloc[:idx_maxfreq_trend].copy() - df_weibull = df_weibull.loc[df_weibull.index < df_blended1.index[-1]].copy() + idx_maxfreq_trend = get_threshold_rowidx(ser_trend) + ser_blended1 = ser_trend.iloc[:idx_maxfreq_trend].copy() + ser_weibull = ser_weibull.loc[ser_weibull.index < ser_blended1.index[-1]].copy() # Weibull to Hydra - if df_hydra is not None: - df_hydra = df_hydra.sort_index(ascending=False) + if ser_hydra is not None: + ser_hydra = ser_hydra.sort_index(ascending=False) - Tfreqs_combined = np.unique(np.concatenate((df_weibull.index, df_hydra.index))) + Tfreqs_combined = np.unique(np.concatenate((ser_weibull.index, ser_hydra.index))) vals_weibull = np.interp( Tfreqs_combined, - np.flip(df_weibull.index), - np.flip(df_weibull["values"].values), + np.flip(ser_weibull.index), + np.flip(ser_weibull.values), ) vals_hydra = np.interp( - Tfreqs_combined, np.flip(df_hydra.index), np.flip(df_hydra["values"].values) + Tfreqs_combined, np.flip(ser_hydra.index), np.flip(ser_hydra.values) ) - Tfreq0, TfreqN = df_hydra.index[0], 1 / 50 + Tfreq0, TfreqN = ser_hydra.index[0], 1 / 50 Tfreqs = np.logspace(np.log10(TfreqN), np.log10(Tfreq0), int(1e5)) vals_weibull = np.interp( np.log10(Tfreqs), - np.log10(np.flip(df_weibull.index)), - np.flip(df_weibull["values"].values), + np.log10(np.flip(ser_weibull.index)), + np.flip(ser_weibull.values), ) vals_hydra = np.interp( np.log10(Tfreqs), - np.log10(np.flip(df_hydra.index)), - np.flip(df_hydra["values"].values), + np.log10(np.flip(ser_hydra.index)), + np.flip(ser_hydra.values), ) indices = np.arange(len(Tfreqs)) grads = np.flip(np.arange(len(indices))) / len(indices) * np.pi @@ -491,48 +491,45 @@ def blend_distributions( + (1 - 0.5 * (np.cos(grads) + 1)) * vals_hydra[indices] ) - df_blended2 = pd.DataFrame( - data={"values": vals_blend}, index=Tfreqs - ).sort_index(ascending=False) + ser_blended2 = 
pd.Series(vals_blend, + index=Tfreqs + ).sort_index(ascending=False) - df_blended = pd.concat( + ser_blended = pd.concat( [ - df_blended1, - df_weibull.loc[ - (df_weibull.index > df_blended2.index[0]) - & (df_weibull.index < df_blended1.index[-1]) + ser_blended1, + ser_weibull.loc[ + (ser_weibull.index > ser_blended2.index[0]) + & (ser_weibull.index < ser_blended1.index[-1]) ], - df_blended2, - df_hydra.loc[df_hydra.index < df_blended2.index[-1]], + ser_blended2, + ser_hydra.loc[ser_hydra.index < ser_blended2.index[-1]], ], axis=0, ) else: - df_blended = pd.concat( - [df_blended1, df_weibull.loc[(df_weibull.index < df_blended1.index[-1])]], + ser_blended = pd.concat( + [ser_blended1, ser_weibull.loc[(ser_weibull.index < ser_blended1.index[-1])]], axis=0, ) - duplicated_freqs = df_blended.index.duplicated(keep="first") - df_blended = df_blended.loc[~duplicated_freqs].sort_index(ascending=False) + duplicated_freqs = ser_blended.index.duplicated(keep="first") + ser_blended = ser_blended.loc[~duplicated_freqs].sort_index(ascending=False) - # copy attrs - df_blended.attrs = df_trend.attrs - return df_blended + series_copy_properties(ser=ser_blended, ser_reference=ser_trend) + + return ser_blended def interpolate_interested_Tfreqs( - df: pd.DataFrame, Tfreqs: List[float] + ser: pd.Series, Tfreqs: List[float] ) -> pd.DataFrame: - interp_vals = np.interp(Tfreqs, np.flip(df.index), np.flip(df["values"].values)) - df_interp = pd.DataFrame(data={"values": interp_vals}, index=Tfreqs).sort_index( - ascending=False - ) + interp_vals = np.interp(Tfreqs, np.flip(ser.index), np.flip(ser.values)) + ser_interp = pd.Series(interp_vals, index=Tfreqs).sort_index(ascending=False) - # copy attrs - df_interp.attrs = df.attrs - return df_interp + series_copy_properties(ser=ser_interp, ser_reference=ser) + return ser_interp def plot_overschrijding(dist: dict): @@ -576,11 +573,11 @@ def plot_overschrijding(dist: dict): else: c = None if k == "Gecombineerd": - ax.plot(dist[k]["values"], "--", label=k, c=c) + ax.plot(dist[k], "--", label=k, c=c) elif k == "Geinterpoleerd": - ax.plot(dist[k]["values"], "o", label=k, c=c, markersize=5) + ax.plot(dist[k], "o", label=k, c=c, markersize=5) else: - ax.plot(dist[k]["values"], label=k, c=c) + ax.plot(dist[k], label=k, c=c) ax.set_title(f"Distribution for {station}") ax.set_xlabel("Frequency [1/yrs]") diff --git a/kenmerkendewaarden/slotgemiddelden.py b/kenmerkendewaarden/slotgemiddelden.py index 68327b1..04fd3f0 100644 --- a/kenmerkendewaarden/slotgemiddelden.py +++ b/kenmerkendewaarden/slotgemiddelden.py @@ -231,7 +231,7 @@ def fit_models(mean_array_todate: pd.Series, with_nodal=True) -> pd.DataFrame: # We'll just use the years. This assumes that annual waterlevels are used that are stored left-padded, the mean waterlevel for 2020 is stored as 2020-1-1. This is not logical, but common practice. 
allyears_dt = pd.period_range(start=start, end=end) mean_array_allyears = pd.Series(mean_array_todate, index=allyears_dt) - + mean_array_allyears_nonans = mean_array_allyears.loc[~mean_array_allyears.isnull()] if len(mean_array_allyears_nonans) < 2: raise ValueError( @@ -250,6 +250,7 @@ def fit_models(mean_array_todate: pd.Series, with_nodal=True) -> pd.DataFrame: pred_linear = fit.predict(X) linear_fit = pd.Series(pred_linear, index=allyears_dt, name="values") + linear_fit.index.name = mean_array_todate.index.name return linear_fit diff --git a/kenmerkendewaarden/tidalindicators.py b/kenmerkendewaarden/tidalindicators.py index 4600418..945cf11 100644 --- a/kenmerkendewaarden/tidalindicators.py +++ b/kenmerkendewaarden/tidalindicators.py @@ -48,49 +48,45 @@ def calc_HWLWtidalindicators(df_ext: pd.DataFrame, min_coverage: float = None): raise_extremes_with_aggers(df_ext) # split to HW and LW separately, also groupby year - data_pd_HW = df_ext.loc[df_ext["HWLWcode"] == 1] - data_pd_LW = df_ext.loc[df_ext["HWLWcode"] == 2] + ser_ext = df_ext["values"] + data_pd_HW = df_ext.loc[df_ext["HWLWcode"] == 1]["values"] + data_pd_LW = df_ext.loc[df_ext["HWLWcode"] == 2]["values"] # yearmean HWLW from HWLW values #maybe also add *_mean_permonth - HW_mean_peryear = data_pd_HW.groupby(pd.PeriodIndex(data_pd_HW.index, freq="Y"))[ - ["values"] - ].mean() - LW_mean_peryear = data_pd_LW.groupby(pd.PeriodIndex(data_pd_LW.index, freq="Y"))[ - ["values"] - ].mean() + pi_hw_y = pd.PeriodIndex(data_pd_HW.index, freq="Y") + pi_lw_y = pd.PeriodIndex(data_pd_LW.index, freq="Y") + HW_mean_peryear = data_pd_HW.groupby(pi_hw_y).mean() + LW_mean_peryear = data_pd_LW.groupby(pi_lw_y).mean() # derive GHHW/GHWS (gemiddeld hoogwater springtij) per month - HW_monthmax_permonth = data_pd_HW.groupby( - pd.PeriodIndex(data_pd_HW.index, freq="M") - )[ - ["values"] - ].max() # proxy for HW at spring tide - LW_monthmin_permonth = data_pd_LW.groupby( - pd.PeriodIndex(data_pd_LW.index, freq="M") - )[ - ["values"] - ].min() # proxy for LW at spring tide - HW_monthmin_permonth = data_pd_HW.groupby( - pd.PeriodIndex(data_pd_HW.index, freq="M") - )[ - ["values"] - ].min() # proxy for HW at neap tide - LW_monthmax_permonth = data_pd_LW.groupby( - pd.PeriodIndex(data_pd_LW.index, freq="M") - )[ - ["values"] - ].max() # proxy for LW at neap tide + pi_hw_m = pd.PeriodIndex(data_pd_HW.index, freq="M") + pi_lw_m = pd.PeriodIndex(data_pd_LW.index, freq="M") + # proxy for HW at spring tide + HW_monthmax_permonth = data_pd_HW.groupby(pi_hw_m).max() + # proxy for LW at spring tide + LW_monthmin_permonth = data_pd_LW.groupby(pi_lw_m).min() + # proxy for HW at neap tide + HW_monthmin_permonth = data_pd_HW.groupby(pi_hw_m).min() + # proxy for LW at neap tide + LW_monthmax_permonth = data_pd_LW.groupby(pi_lw_m).max() + + ser_list = [HW_mean_peryear, LW_mean_peryear, + HW_monthmax_permonth, LW_monthmin_permonth, + HW_monthmin_permonth, LW_monthmax_permonth, + ] + for ser_one in ser_list: + ser_one.index.name = "period" # replace invalids with nan (in case of too less values per month or year) if min_coverage is not None: assert 0 <= min_coverage <= 1 # count timeseries values per year/month - ext_count_peryear = compute_actual_counts(df_ext, freq="Y") - ext_count_permonth = compute_actual_counts(df_ext, freq="M") + ext_count_peryear = compute_actual_counts(ser_ext, freq="Y") + ext_count_permonth = compute_actual_counts(ser_ext, freq="M") # compute expected counts and multiply with min_coverage to get minimal counts - min_count_peryear = 
compute_expected_counts(df_ext, freq="Y") * min_coverage - min_count_permonth = compute_expected_counts(df_ext, freq="M") * min_coverage + min_count_peryear = compute_expected_counts(ser_ext, freq="Y") * min_coverage + min_count_permonth = compute_expected_counts(ser_ext, freq="M") * min_coverage # set all statistics that were based on too little values to nan HW_mean_peryear.loc[ext_count_peryear < min_count_peryear] = np.nan @@ -108,39 +104,27 @@ def calc_HWLWtidalindicators(df_ext: pd.DataFrame, min_coverage: float = None): HW_monthmin_permonth = make_periodindex_contiguous(HW_monthmin_permonth) LW_monthmax_permonth = make_periodindex_contiguous(LW_monthmax_permonth) - # derive GHHW/GHWS (gemiddeld hoogwater springtij) - HW_monthmax_mean_peryear = HW_monthmax_permonth.groupby( - pd.PeriodIndex(HW_monthmax_permonth.index, freq="Y") - )[["values"]].mean() - LW_monthmin_mean_peryear = LW_monthmin_permonth.groupby( - pd.PeriodIndex(LW_monthmin_permonth.index, freq="Y") - )[["values"]].mean() - HW_monthmin_mean_peryear = HW_monthmin_permonth.groupby( - pd.PeriodIndex(HW_monthmin_permonth.index, freq="Y") - )[["values"]].mean() - LW_monthmax_mean_peryear = LW_monthmax_permonth.groupby( - pd.PeriodIndex(LW_monthmax_permonth.index, freq="Y") - )[["values"]].mean() - + # derive GHHW/GHWS (proxy for gemiddeld hoogwater/laagwater springtij/doodtij) + pi_hw_mmax_pm_y = pd.PeriodIndex(HW_monthmax_permonth.index, freq="Y") + pi_lw_mmax_pm_y = pd.PeriodIndex(LW_monthmax_permonth.index, freq="Y") + pi_hw_mmin_pm_y = pd.PeriodIndex(HW_monthmin_permonth.index, freq="Y") + pi_lw_mmin_pm_y = pd.PeriodIndex(LW_monthmin_permonth.index, freq="Y") + HW_monthmax_mean_peryear = HW_monthmax_permonth.groupby(pi_hw_mmax_pm_y).mean() + LW_monthmax_mean_peryear = LW_monthmax_permonth.groupby(pi_lw_mmax_pm_y).mean() + HW_monthmin_mean_peryear = HW_monthmin_permonth.groupby(pi_hw_mmin_pm_y).mean() + LW_monthmin_mean_peryear = LW_monthmin_permonth.groupby(pi_lw_mmin_pm_y).mean() + dict_tidalindicators = { - "HW_mean": data_pd_HW["values"].mean(), # GHW - "LW_mean": data_pd_LW["values"].mean(), # GLW - "HW_mean_peryear": HW_mean_peryear["values"], # GHW peryear - "LW_mean_peryear": LW_mean_peryear["values"], # GLW peryear - "HW_monthmax_permonth": HW_monthmax_permonth["values"], # GHHW/GHWS permonth - "LW_monthmin_permonth": LW_monthmin_permonth["values"], # GLLW/GLWS permonth - "HW_monthmax_mean_peryear": HW_monthmax_mean_peryear[ - "values" - ], # GHHW/GHWS peryear - "LW_monthmin_mean_peryear": LW_monthmin_mean_peryear[ - "values" - ], # GLLW/GLWS peryear - "HW_monthmin_mean_peryear": HW_monthmin_mean_peryear[ - "values" - ], # GLHW/GHWN peryear - "LW_monthmax_mean_peryear": LW_monthmax_mean_peryear[ - "values" - ], # GHLW/GLWN peryear + "HW_mean": data_pd_HW.mean(), # GHW + "LW_mean": data_pd_LW.mean(), # GLW + "HW_mean_peryear": HW_mean_peryear, # GHW peryear + "LW_mean_peryear": LW_mean_peryear, # GLW peryear + "HW_monthmax_permonth": HW_monthmax_permonth, # GHHW/GHWS permonth + "LW_monthmin_permonth": LW_monthmin_permonth, # GLLW/GLWS permonth + "HW_monthmax_mean_peryear": HW_monthmax_mean_peryear, # GHHW/GHWS peryear + "LW_monthmax_mean_peryear": LW_monthmax_mean_peryear, # GHLW/GLWN peryear + "HW_monthmin_mean_peryear": HW_monthmin_mean_peryear, # GLHW/GHWN peryear + "LW_monthmin_mean_peryear": LW_monthmin_mean_peryear, # GLLW/GLWS peryear } return dict_tidalindicators @@ -168,24 +152,27 @@ def calc_wltidalindicators(df_meas: pd.DataFrame, min_coverage: float = None): if df_meas.index.tz is not None: df_meas = 
df_meas.tz_localize(None) + # series from dataframe + ser_meas = df_meas["values"] + # yearmean wl from wl values - wl_mean_peryear = df_meas.groupby(pd.PeriodIndex(df_meas.index, freq="Y"))[ - ["values"] - ].mean() - wl_mean_permonth = df_meas.groupby(pd.PeriodIndex(df_meas.index, freq="M"))[ - ["values"] - ].mean() + pi_wl_y = pd.PeriodIndex(ser_meas.index, freq="Y") + pi_wl_m = pd.PeriodIndex(ser_meas.index, freq="M") + wl_mean_peryear = ser_meas.groupby(pi_wl_y).mean() + wl_mean_peryear.index.name = "period" + wl_mean_permonth = ser_meas.groupby(pi_wl_m).mean() + wl_mean_permonth.index.name = "period" # replace invalids with nan (in case of too less values per month or year) if min_coverage is not None: assert 0 <= min_coverage <= 1 # count timeseries values per year/month - wl_count_peryear = compute_actual_counts(df_meas, freq="Y") - wl_count_permonth = compute_actual_counts(df_meas, freq="M") + wl_count_peryear = compute_actual_counts(ser_meas, freq="Y") + wl_count_permonth = compute_actual_counts(ser_meas, freq="M") # compute expected counts and multiply with min_coverage to get minimal counts - min_count_peryear = compute_expected_counts(df_meas, freq="Y") * min_coverage - min_count_permonth = compute_expected_counts(df_meas, freq="M") * min_coverage + min_count_peryear = compute_expected_counts(ser_meas, freq="Y") * min_coverage + min_count_permonth = compute_expected_counts(ser_meas, freq="M") * min_coverage # set all statistics that were based on too little values to nan wl_mean_peryear.loc[wl_count_peryear < min_count_peryear] = np.nan @@ -196,31 +183,31 @@ def calc_wltidalindicators(df_meas: pd.DataFrame, min_coverage: float = None): wl_mean_permonth = make_periodindex_contiguous(wl_mean_permonth) dict_tidalindicators = { - "wl_mean": df_meas["values"].mean(), - "wl_mean_peryear": wl_mean_peryear["values"], # yearly mean wl - "wl_mean_permonth": wl_mean_permonth["values"], # monthly mean wl + "wl_mean": ser_meas.mean(), + "wl_mean_peryear": wl_mean_peryear, # yearly mean wl + "wl_mean_permonth": wl_mean_permonth, # monthly mean wl } return dict_tidalindicators -def compute_actual_counts(df_meas, freq, column="values"): +def compute_actual_counts(ser_meas, freq): """ Compute the number of non-nan values in a column for all years/months in a timeseries index. """ - df_meas_isnotnull = ~df_meas[column].isnull() - period_index = pd.PeriodIndex(df_meas_isnotnull.index, freq=freq) - df_actual_counts = df_meas_isnotnull.groupby(period_index).sum() - return df_actual_counts + ser_meas_isnotnull = ~ser_meas.isnull() + period_index = pd.PeriodIndex(ser_meas_isnotnull.index, freq=freq) + ser_actual_counts = ser_meas_isnotnull.groupby(period_index).sum() + return ser_actual_counts -def compute_expected_counts(df_meas, freq): +def compute_expected_counts(ser_meas, freq): """ Compute the expected number of values for all years/months in a timeseries index, by taking the number of days for each year/month and dividing it by the median frequency in that period. """ # TODO: beware of series with e.g. 
only first and last value of month/year, this will result in freq=30days and then expected count of 2, it will pass even if there is almost no data - df_meas = df_meas.copy() + df_meas = pd.DataFrame(ser_meas) df_meas["timediff"] = pd.TimedeltaIndex([pd.NaT]).append( df_meas.index[1:] - df_meas.index[:-1] ) # TODO: from pandas>=2.1.4 the following also works: df_times.diff() (which results in a timedeltaindex of the correct length) @@ -234,8 +221,8 @@ def compute_expected_counts(df_meas, freq): else: raise ValueError(f"invalid freq: '{freq}'") days_inperiod_td = pd.to_timedelta(days_inperiod, unit="D") - df_expected_counts = days_inperiod_td / median_freq - return df_expected_counts + ser_expected_counts = days_inperiod_td / median_freq + return ser_expected_counts def make_periodindex_contiguous(df): @@ -250,6 +237,7 @@ def make_periodindex_contiguous(df): # add attrs from input dataframe df_full.attrs = df.attrs + df_full.index.name = df.index.name return df_full diff --git a/tests/test_gemiddeldgetij.py b/tests/test_gemiddeldgetij.py index b3ef50a..5a4bbad 100644 --- a/tests/test_gemiddeldgetij.py +++ b/tests/test_gemiddeldgetij.py @@ -21,8 +21,8 @@ def test_calc_gemiddeldgetij_outputtype(df_meas_2010): assert isinstance(gemgetij_dict_raw, dict) for k, v in gemgetij_dict_raw.items(): - assert isinstance(v, pd.DataFrame) - assert v.columns == ["values"] + assert isinstance(v, pd.Series) + assert v.name == "values" assert isinstance(v.index, pd.TimedeltaIndex) assert v.index.name == "timedelta" @@ -46,16 +46,16 @@ def test_calc_gemiddeldgetij_raw(df_meas_2010): prediction_np_raw = gemgetij_dict_raw["neap"] assert len(prediction_av_raw) == 746 - assert np.isclose(prediction_av_raw["values"].min(), -0.567608885905236) - assert np.isclose(prediction_av_raw["values"].max(), 1.1665100442336331) + assert np.isclose(prediction_av_raw.min(), -0.567608885905236) + assert np.isclose(prediction_av_raw.max(), 1.1665100442336331) assert len(prediction_sp_raw) == 741 - assert np.isclose(prediction_sp_raw["values"].min(), -0.5305575695904736) - assert np.isclose(prediction_sp_raw["values"].max(), 1.313379801706846) + assert np.isclose(prediction_sp_raw.min(), -0.5305575695904736) + assert np.isclose(prediction_sp_raw.max(), 1.313379801706846) assert len(prediction_np_raw) == 755 - assert np.isclose(prediction_np_raw["values"].min(), -0.579937620616439) - assert np.isclose(prediction_np_raw["values"].max(), 0.8044625899304197) + assert np.isclose(prediction_np_raw.min(), -0.579937620616439) + assert np.isclose(prediction_np_raw.max(), 0.8044625899304197) @pytest.mark.unittest @@ -77,16 +77,16 @@ def test_calc_gemiddeldgetij_corr(df_meas_2010, df_ext_12_2010): prediction_np_corr = gemgetij_dict_corr["neap"] assert len(prediction_av_corr) == 3726 - assert np.isclose(prediction_av_corr["values"].min(), -0.6095833333333333) - assert np.isclose(prediction_av_corr["values"].max(), 1.1300000000000003) + assert np.isclose(prediction_av_corr.min(), -0.6095833333333333) + assert np.isclose(prediction_av_corr.max(), 1.1300000000000003) assert len(prediction_sp_corr) == 3701 - assert np.isclose(prediction_sp_corr["values"].min(), -0.5700000000000001) - assert np.isclose(prediction_sp_corr["values"].max(), 1.3450000000000002) + assert np.isclose(prediction_sp_corr.min(), -0.5700000000000001) + assert np.isclose(prediction_sp_corr.max(), 1.3450000000000002) assert len(prediction_np_corr) == 3771 - assert np.isclose(prediction_np_corr["values"].min(), -0.61) - assert np.isclose(prediction_np_corr["values"].max(), 
0.8650000000000001) + assert np.isclose(prediction_np_corr.min(), -0.61) + assert np.isclose(prediction_np_corr.max(), 0.8650000000000001) @pytest.mark.unittest @@ -108,16 +108,16 @@ def test_calc_gemiddeldgetij_corr_boi(df_meas_2010, df_ext_12_2010): prediction_np_corr_boi = gemgetij_dict_corr_boi["neap"] assert len(prediction_av_corr_boi) == 8196 - assert np.isclose(prediction_av_corr_boi["values"].min(), -0.6095833333333333) - assert np.isclose(prediction_av_corr_boi["values"].max(), 1.1300000000000003) + assert np.isclose(prediction_av_corr_boi.min(), -0.6095833333333333) + assert np.isclose(prediction_av_corr_boi.max(), 1.1300000000000003) assert len(prediction_sp_corr_boi) == 8196 - assert np.isclose(prediction_sp_corr_boi["values"].min(), -0.5700000000000001) - assert np.isclose(prediction_sp_corr_boi["values"].max(), 1.3450000000000002) + assert np.isclose(prediction_sp_corr_boi.min(), -0.5700000000000001) + assert np.isclose(prediction_sp_corr_boi.max(), 1.3450000000000002) assert len(prediction_np_corr_boi) == 8196 - assert np.isclose(prediction_np_corr_boi["values"].min(), -0.61) - assert np.isclose(prediction_np_corr_boi["values"].max(), 0.8650000000000001) + assert np.isclose(prediction_np_corr_boi.min(), -0.61) + assert np.isclose(prediction_np_corr_boi.max(), 0.8650000000000001) @pytest.mark.unittest diff --git a/tests/test_overschrijding.py b/tests/test_overschrijding.py index b2bd10e..95c7ee7 100644 --- a/tests/test_overschrijding.py +++ b/tests/test_overschrijding.py @@ -31,11 +31,11 @@ def test_calc_overschrijding_outputtype(df_ext_12_2010_2014): assert isinstance(dist, dict) for k, v in dist.items(): - assert isinstance(v, pd.DataFrame) - assert v.columns == ["values"] + assert isinstance(v, pd.Series) + assert v.name == "values" assert isinstance(v.index, pd.Index) assert str(v.index.dtype) == "float64" - assert v.index.name is None # TODO: rename to 'frequency' + assert v.index.name == "frequency" @pytest.mark.unittest @@ -79,7 +79,7 @@ def test_calc_overschrijding(df_ext_12_2010_2014): 4.00701551, ] ) - assert np.allclose(dist["Geinterpoleerd"]["values"].values, expected_values) + assert np.allclose(dist["Geinterpoleerd"].values, expected_values) @pytest.mark.unittest @@ -96,42 +96,14 @@ def test_calc_overschrijding_with_hydra(df_ext_12_2010_2014): 1 / 100, 1 / 200, ] - df_hydra = pd.DataFrame( - { - "values": np.array( - [ - 2.473, - 3.18, - 4.043, - 4.164, - 4.358, - 4.696, - 5.056, - 5.468, - 5.865, - 6.328, - 7.207, - ] - ) - }, - index=np.array( - [ - 1.00000000e00, - 1.00000000e-01, - 2.00000000e-02, - 1.00000000e-02, - 3.33333333e-03, - 1.00000000e-03, - 3.33333333e-04, - 1.00000000e-04, - 3.33333333e-05, - 1.00000000e-05, - 1.00000000e-06, - ] - ), - ) - df_hydra.attrs = df_ext_12_2010_2014.attrs - dist_hydra = {"Hydra-NL": df_hydra} + hydra_values = np.array([2.473, 3.18 , 4.043, 4.164, 4.358, 4.696, 5.056, 5.468, 5.865, + 6.328, 7.207]) + hydra_index = np.array([1.00000000e+00, 1.00000000e-01, 2.00000000e-02, 1.00000000e-02, + 3.33333333e-03, 1.00000000e-03, 3.33333333e-04, 1.00000000e-04, + 3.33333333e-05, 1.00000000e-05, 1.00000000e-06]) + ser_hydra = pd.Series(hydra_values, index=hydra_index) + ser_hydra.attrs = df_ext_12_2010_2014.attrs + dist_hydra = {"Hydra-NL": ser_hydra} dist = kw.calc_overschrijding( df_ext=df_ext_12_2010_2014, interp_freqs=Tfreqs_interested, dist=dist_hydra ) @@ -160,7 +132,7 @@ def test_calc_overschrijding_with_hydra(df_ext_12_2010_2014): 4.3095, ] ) - assert np.allclose(dist["Geinterpoleerd"]["values"].values, expected_values) + 
assert np.allclose(dist["Geinterpoleerd"].values, expected_values) @pytest.mark.unittest @@ -207,7 +179,7 @@ def test_calc_overschrijding_rule_type_break(df_ext_12_2010_2014): 3.90661043, ] ) - assert np.allclose(dist["Geinterpoleerd"]["values"].values, expected_values) + assert np.allclose(dist["Geinterpoleerd"].values, expected_values) @pytest.mark.unittest @@ -267,7 +239,7 @@ def test_calc_overschrijding_clip_physical_break(df_ext_12_2010_2014): ] ) assert np.allclose( - dist_normal["Geinterpoleerd"]["values"].values, expected_values_normal + dist_normal["Geinterpoleerd"].values, expected_values_normal ) expected_values_clip = np.array( [ @@ -284,7 +256,7 @@ def test_calc_overschrijding_clip_physical_break(df_ext_12_2010_2014): ] ) assert np.allclose( - dist_clip["Geinterpoleerd"]["values"].values, expected_values_clip + dist_clip["Geinterpoleerd"].values, expected_values_clip ) diff --git a/tests/test_slotgemiddelden.py b/tests/test_slotgemiddelden.py index 34a994f..e87054f 100644 --- a/tests/test_slotgemiddelden.py +++ b/tests/test_slotgemiddelden.py @@ -17,7 +17,7 @@ def test_calc_slotgemiddelden_outputtype(df_meas_2010_2014, df_ext_12_2010_2014) assert isinstance(v, pd.Series) assert v.name == "values" assert isinstance(v.index, pd.PeriodIndex) - assert v.index.name is None # TODO: rename to 'period' + assert v.index.name == "period" @pytest.mark.unittest diff --git a/tests/test_tidalindicators.py b/tests/test_tidalindicators.py index 789cae1..78fe723 100644 --- a/tests/test_tidalindicators.py +++ b/tests/test_tidalindicators.py @@ -13,9 +13,11 @@ @pytest.mark.unittest def test_calc_tidalindicators_outputtype(df_meas_2010_2014, df_ext_12_2010): wl_dict = kw.calc_wltidalindicators(df_meas_2010_2014) + wl_dict_min = kw.calc_wltidalindicators(df_meas_2010_2014, min_coverage=1) hwlw_dict = kw.calc_HWLWtidalindicators(df_ext_12_2010) + hwlw_dict_min = kw.calc_HWLWtidalindicators(df_ext_12_2010, min_coverage=1) - for indicators_dict in [wl_dict, hwlw_dict]: + for indicators_dict in [wl_dict, wl_dict_min, hwlw_dict, hwlw_dict_min]: assert isinstance(indicators_dict, dict) for k, v in indicators_dict.items(): assert isinstance(v, (pd.Series, float)) @@ -23,7 +25,7 @@ def test_calc_tidalindicators_outputtype(df_meas_2010_2014, df_ext_12_2010): continue assert v.name == "values" assert isinstance(v.index, pd.PeriodIndex) - assert v.index.name is None # TODO: rename to `period` + assert v.index.name == "period" @pytest.mark.unittest @@ -39,7 +41,28 @@ def test_calc_HWLWtidalrange(df_ext_12_2010): @pytest.mark.unittest -def test_calc_HWLWtidalindicators(df_meas_2010_2014): +def test_calc_HWLWtidalindicators(df_ext_12_2010_2014): + ext_stats_notimezone = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014.tz_localize(None)) + ext_stats = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014) + ext_stats_min = kw.calc_HWLWtidalindicators(df_ext_12_2010_2014, min_coverage=1) + + expected_keys = ['HW_mean', 'LW_mean', + 'HW_mean_peryear', 'LW_mean_peryear', + 'HW_monthmax_permonth', 'LW_monthmin_permonth', + 'HW_monthmax_mean_peryear', 'LW_monthmax_mean_peryear', + 'HW_monthmin_mean_peryear', 'LW_monthmin_mean_peryear'] + for key in expected_keys: + assert key in ext_stats.keys() + assert (ext_stats[key] == ext_stats_notimezone[key]).all() + + + assert ext_stats_notimezone['HW_monthmax_permonth'].isnull().sum() == 0 + assert ext_stats['HW_monthmax_permonth'].isnull().sum() == 0 + assert ext_stats_min['HW_monthmax_permonth'].isnull().sum() == 13 + + +@pytest.mark.unittest +def 
test_calc_wltidalindicators(df_meas_2010_2014): wl_stats_notimezone = kw.calc_wltidalindicators(df_meas_2010_2014.tz_localize(None)) wl_stats = kw.calc_wltidalindicators(df_meas_2010_2014) @@ -51,70 +74,19 @@ def test_calc_HWLWtidalindicators(df_meas_2010_2014): wl_mean_peryear_expected = np.array( [0.07960731, 0.08612119, 0.0853051, 0.07010864, 0.10051922] ) - wl_mean_permonth_expected = np.array( - [ - -0.00227151, - 0.089313, - 0.04443996, - -0.03440509, - -0.00206317, - 0.04431481, - 0.03877688, - 0.18267697, - 0.13494907, - 0.18367832, - 0.15928009, - 0.11707661, - 0.1087836, - 0.02535962, - -0.09558468, - -0.0255162, - -0.00076165, - 0.05667361, - 0.11056228, - 0.13890681, - 0.1495, - 0.11866711, - 0.07253009, - 0.36550851, - 0.22046819, - -0.10208094, - -0.07221102, - 0.02279167, - 0.02424507, - 0.05409954, - 0.09238127, - 0.08972894, - 0.15472222, - 0.16913082, - 0.19712963, - 0.1639897, - 0.05744176, - -0.0134375, - -0.10685036, - -0.00822222, - 0.05911066, - 0.019875, - 0.02540995, - 0.07570565, - 0.12776389, - 0.17321909, - 0.23108102, - 0.19502688, - 0.06281138, - 0.08588046, - -0.00553763, - 0.03490278, - 0.03113575, - 0.03134954, - 0.10553763, - 0.16540771, - 0.12535648, - 0.20802195, - 0.10014352, - 0.25624552, - ] - ) + wl_mean_permonth_expected = np.array([ + -0.00227151, 0.089313 , 0.04443996, -0.03440509, -0.00206317, + 0.04431481, 0.03877688, 0.18267697, 0.13494907, 0.18367832, + 0.15928009, 0.11707661, 0.1087836 , 0.02535962, -0.09558468, + -0.0255162 , -0.00076165, 0.05667361, 0.11056228, 0.13890681, + 0.1495 , 0.11866711, 0.07253009, 0.36550851, 0.22046819, + -0.10208094, -0.07221102, 0.02279167, 0.02424507, 0.05409954, + 0.09238127, 0.08972894, 0.15472222, 0.16913082, 0.19712963, + 0.1639897 , 0.05744176, -0.0134375 , -0.10685036, -0.00822222, + 0.05911066, 0.019875 , 0.02540995, 0.07570565, 0.12776389, + 0.17321909, 0.23108102, 0.19502688, 0.06281138, 0.08588046, + -0.00553763, 0.03490278, 0.03113575, 0.03134954, 0.10553763, + 0.16540771, 0.12535648, 0.20802195, 0.10014352, 0.25624552]) assert np.allclose(wl_stats["wl_mean_peryear"].values, wl_mean_peryear_expected) assert np.allclose(wl_stats["wl_mean_permonth"].values, wl_mean_permonth_expected) @@ -246,134 +218,21 @@ def test_calc_wltidalindicators(df_ext_12_2010_2014): assert np.allclose(ext_stats["HW_mean_peryear"].values, hw_mean_peryear_expected) assert np.allclose(ext_stats["LW_mean_peryear"].values, lw_mean_peryear_expected) - hw_monthmax_permonth_expected = np.array( - [ - 1.94, - 1.89, - 1.86, - 1.55, - 1.74, - 1.58, - 1.54, - 2.07, - 2.11, - 2.06, - 1.9, - 1.75, - 1.69, - 1.82, - 1.49, - 1.39, - 1.4, - 1.71, - 1.72, - 1.66, - 1.69, - 1.59, - 2.03, - 2.47, - 2.31, - 1.63, - 1.64, - 1.61, - 1.44, - 1.51, - 1.52, - 1.87, - 1.71, - 1.72, - 1.86, - 2.07, - 1.87, - 1.83, - 1.53, - 1.51, - 1.62, - 1.53, - 1.52, - 1.41, - 2.08, - 1.98, - 2.07, - 3.03, - 1.76, - 1.82, - 1.61, - 1.73, - 1.48, - 1.48, - 1.62, - 1.71, - 1.58, - 2.77, - 1.6, - 1.92, - ] - ) - lw_monthmin_permonth_expected = np.array( - [ - -1.33, - -1.05, - -1.05, - -1.06, - -1.12, - -1.11, - -1.07, - -0.92, - -0.96, - -0.99, - -1.01, - -1.08, - -1.16, - -1.17, - -1.21, - -0.98, - -1.1, - -0.98, - -0.97, - -0.94, - -1.04, - -1.22, - -0.94, - -1.21, - -1.22, - -1.32, - -1.22, - -1.04, - -1.18, - -0.95, - -1.05, - -1.0, - -0.9, - -0.81, - -1.03, - -1.21, - -1.11, - -1.65, - -1.37, - -1.11, - -1.11, - -1.05, - -0.98, - -1.07, - -0.88, - -1.05, - -1.15, - -1.07, - -1.32, - -1.31, - -1.21, - -1.08, - -1.0, - -1.03, - -1.07, - -0.83, - -0.98, - -0.97, - 
-0.99, - -1.3, - ] - ) + hw_monthmax_permonth_expected = np.array([ + 1.94, 1.89, 1.86, 1.55, 1.74, 1.58, 1.54, 2.07, 2.11, 2.06, 1.9 , + 1.75, 1.69, 1.82, 1.49, 1.39, 1.4 , 1.71, 1.72, 1.66, 1.69, 1.59, + 2.03, 2.47, 2.31, 1.63, 1.64, 1.61, 1.44, 1.51, 1.52, 1.87, 1.71, + 1.72, 1.86, 2.07, 1.87, 1.83, 1.53, 1.51, 1.62, 1.53, 1.52, 1.41, + 2.08, 1.98, 2.07, 3.03, 1.76, 1.82, 1.61, 1.73, 1.48, 1.48, 1.62, + 1.71, 1.58, 2.77, 1.6 , 1.92]) + lw_monthmin_permonth_expected = np.array([ + -1.33, -1.05, -1.05, -1.06, -1.12, -1.11, -1.07, -0.92, -0.96, + -0.99, -1.01, -1.08, -1.16, -1.17, -1.21, -0.98, -1.1 , -0.98, + -0.97, -0.94, -1.04, -1.22, -0.94, -1.21, -1.22, -1.32, -1.22, + -1.04, -1.18, -0.95, -1.05, -1. , -0.9 , -0.81, -1.03, -1.21, + -1.11, -1.65, -1.37, -1.11, -1.11, -1.05, -0.98, -1.07, -0.88, + -1.05, -1.15, -1.07, -1.32, -1.31, -1.21, -1.08, -1. , -1.03, + -1.07, -0.83, -0.98, -0.97, -0.99, -1.3 ]) assert np.allclose( ext_stats["HW_monthmax_permonth"].values, hw_monthmax_permonth_expected )
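
Usage sketch (not part of the patch): a minimal illustration of the aligned return types introduced by this change. The inputs `df_meas` (water levels with a DatetimeIndex and a "values" column) and `df_ext` (extremes with "values" and "HWLWcode" columns) are assumed fixtures here, e.g. as loaded via kw.read_measurements(); the function names, dictionary keys and the "values"/"period"/"frequency" naming follow the diff and the tests above.

    import pandas as pd
    import kenmerkendewaarden as kw

    # df_meas and df_ext are assumed input DataFrames as described above
    # after this change, every dict entry is a pandas Series named "values"
    # (scalar indicators such as "wl_mean" remain plain floats)
    wl_dict = kw.calc_wltidalindicators(df_meas, min_coverage=1)
    assert isinstance(wl_dict["wl_mean_peryear"], pd.Series)
    assert wl_dict["wl_mean_peryear"].name == "values"
    assert wl_dict["wl_mean_peryear"].index.name == "period"

    dist = kw.calc_overschrijding(df_ext=df_ext, interp_freqs=[5, 2, 1, 1/2, 1/5, 1/10])
    assert isinstance(dist["Geinterpoleerd"], pd.Series)
    assert dist["Geinterpoleerd"].name == "values"
    assert dist["Geinterpoleerd"].index.name == "frequency"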