111 align dtypes of variables returned by different kw functions (#127)
* reformatted tidalindicators code, added test for hwlw tidalindicators

* converted overschrijding from dataframes to series

* converted gemiddeldgetij from dataframes to series

* reformatted code

* renamed tidalindicators and slotgemiddelden series index from None to "period"

* updated whatsnew
veenstrajelmer authored Aug 29, 2024
1 parent 017fb2c commit e70132c
Showing 11 changed files with 322 additions and 502 deletions.
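Nearly every diff below applies the same pattern: functions that used to return a single-column DataFrame (or a dict of them) now return a pandas Series with a named index, so all kw functions hand back the same dtype. A minimal standalone sketch of that conversion (not code from this repository; the column and index names mirror the diffs below):

import pandas as pd

# a single-column DataFrame as the kw functions used to return
df = pd.DataFrame({"values": [1.05, 0.98]}, index=[0.5, 0.2])
df.index.name = "frequency"

# the aligned return type: a Series that keeps the named index
ser = df["values"]
print(type(ser).__name__)  # Series
print(ser.index.name)      # frequency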
1 change: 1 addition & 0 deletions docs/whats-new.md
@@ -24,6 +24,7 @@
- improvements to output csv files in [#109](https://github.com/Deltares-research/kenmerkendewaarden/pull/109)
- drop duplicate times in `kw.read_measurements()` in [#116](https://github.com/Deltares-research/kenmerkendewaarden/pull/116)
- use NETCDF4_CLASSIC format to reduce file sizes written by `kw.retrieve_measurements()` in [#119](https://github.com/Deltares-research/kenmerkendewaarden/pull/119)
+ - align output of all methods in [#127](https://github.com/Deltares-research/kenmerkendewaarden/pull/127)

### Fix
- implemented workaround for pandas 2.2.0 with different rounding behaviour in [#69](https://github.com/Deltares-research/kenmerkendewaarden/pull/69)
12 changes: 7 additions & 5 deletions examples/KWK_process.py
@@ -259,10 +259,11 @@ def initiate_dist_with_hydra_nl(station):
if os.path.exists(file_hydra_nl):
df_hydra_nl = pd.read_table(file_hydra_nl, encoding='latin-1', decimal=',', header=0)
df_hydra_nl.index = 1/df_hydra_nl['Terugkeertijd [jaar]']
+ df_hydra_nl.index.name = 'frequency'
df_hydra_nl['values'] = df_hydra_nl['Belastingniveau [m+NAP]/Golfparameter [m]/[s]/Sterkte bekleding [-]']
df_hydra_nl = df_hydra_nl[['values']]
df_hydra_nl.attrs['station'] = station
- dist_dict['Hydra-NL'] = df_hydra_nl
+ dist_dict['Hydra-NL'] = df_hydra_nl['values']
return dist_dict

def add_validation_dist(dist_dict, dist_type, station):
@@ -275,10 +276,11 @@ def add_validation_dist(dist_dict, dist_type, station):
df_validation = df_validation.rename({"value":"values"},axis=1)
df_validation['values'] /= 100
df_validation = df_validation.set_index("value_Tfreq", drop=True)
+ df_validation.index.name = 'frequency'
df_validation.attrs['station'] = station
- dist_dict['validation'] = df_validation
+ dist_dict['validation'] = df_validation['values']

- Tfreqs_interested = [5, 2, 1, 1/2, 1/5, 1/10, 1/20, 1/50, 1/100, 1/200,
+ freqs_interested = [5, 2, 1, 1/2, 1/5, 1/10, 1/20, 1/50, 1/100, 1/200,
1/500, 1/1000, 1/2000, 1/4000, 1/5000, 1/10000]

if compute_overschrijding and data_pd_HWLW_all is not None:
@@ -291,7 +293,7 @@ def add_validation_dist(dist_dict, dist_type, station):
dist_exc_hydra = initiate_dist_with_hydra_nl(station=current_station)
dist_exc = kw.calc_overschrijding(df_ext=data_pd_measext, rule_type=None, rule_value=None,
clip_physical_break=True, dist=dist_exc_hydra,
- interp_freqs=Tfreqs_interested)
+ interp_freqs=freqs_interested)
add_validation_dist(dist_exc, dist_type='exceedance', station=current_station)
dist_exc['Geinterpoleerd'].to_csv(os.path.join(dir_overschrijding, f'Exceedance_{current_station}.csv'))
# dist_exc['Gecombineerd'].to_csv(os.path.join(dir_overschrijding, f'Exceedance_{current_station}_gecombineerd.csv'))
@@ -303,7 +305,7 @@ def add_validation_dist(dist_dict, dist_type, station):
# 2. Deceedance
dist_dec = kw.calc_overschrijding(df_ext=data_pd_measext, rule_type=None, rule_value=None,
clip_physical_break=True, inverse=True,
- interp_freqs=Tfreqs_interested)
+ interp_freqs=freqs_interested)
add_validation_dist(dist_dec, dist_type='deceedance', station=current_station)
dist_dec['Geinterpoleerd'].to_csv(os.path.join(dir_overschrijding, f'Deceedance_{current_station}.csv'))
# dist_dec['Gecombineerd'].to_csv(os.path.join(dir_overschrijding, f'Deceedance_{current_station}_gecombineerd.csv'))
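With these changes every entry in the dist dict (model prediction, Hydra-NL and validation data alike) is a Series indexed by frequency, so downstream interpolation and CSV export treat all sources uniformly. A hypothetical sketch of such an entry (made-up numbers, hypothetical file name):

import pandas as pd

# hypothetical exceedance curve as now stored in dist_exc: water levels
# indexed by frequency (1/return period in years)
entry = pd.Series(
    [3.2, 3.6, 4.1],
    index=pd.Index([1/10, 1/100, 1/1000], name="frequency"),
    name="values",
)
entry.to_csv("Exceedance_EXAMPLE.csv")  # same export path for every source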
49 changes: 24 additions & 25 deletions kenmerkendewaarden/gemiddeldgetij.py
@@ -267,9 +267,9 @@ def calc_gemiddeldgetij(

# combine in single dictionary
gemgetij_dict = {}
gemgetij_dict["mean"] = prediction_av
gemgetij_dict["spring"] = prediction_sp
gemgetij_dict["neap"] = prediction_np
gemgetij_dict["mean"] = prediction_av['values']
gemgetij_dict["spring"] = prediction_sp['values']
gemgetij_dict["neap"] = prediction_np['values']

return gemgetij_dict

@@ -310,21 +310,21 @@ def plot_gemiddeldgetij(
prediction_av_raw = gemgetij_dict_raw["mean"]
prediction_sp_raw = gemgetij_dict_raw["spring"]
prediction_np_raw = gemgetij_dict_raw["neap"]
prediction_av_raw["values"].plot(
prediction_av_raw.plot(
ax=ax,
linestyle="--",
color=cmap(0),
linewidth=0.7,
label="gemiddeldgetij mean (raw)",
)
prediction_sp_raw["values"].plot(
prediction_sp_raw.plot(
ax=ax,
linestyle="--",
color=cmap(1),
linewidth=0.7,
label="gemiddeldgetij spring (raw)",
)
prediction_np_raw["values"].plot(
prediction_np_raw.plot(
ax=ax,
linestyle="--",
color=cmap(2),
@@ -335,11 +335,11 @@
prediction_av_corr = gemgetij_dict["mean"]
prediction_sp_corr = gemgetij_dict["spring"]
prediction_np_corr = gemgetij_dict["neap"]
prediction_av_corr["values"].plot(ax=ax, color=cmap(0), label="gemiddeldgetij mean")
prediction_sp_corr["values"].plot(
prediction_av_corr.plot(ax=ax, color=cmap(0), label="gemiddeldgetij mean")
prediction_sp_corr.plot(
ax=ax, color=cmap(1), label="gemiddeldgetij spring"
)
prediction_np_corr["values"].plot(ax=ax, color=cmap(2), label="gemiddeldgetij neap")
prediction_np_corr.plot(ax=ax, color=cmap(2), label="gemiddeldgetij neap")

ax.set_title(f"getijkrommes for {station}")
ax.legend(loc=4)
@@ -366,9 +366,9 @@ def get_gemgetij_components(data_pd_meas):
# =============================================================================
# Hatyan analyse voor 10 jaar (alle componenten voor gemiddelde getijcyclus) #TODO: maybe use original 4y period/componentfile instead? SA/SM should come from 19y analysis
# =============================================================================
- const_list = hatyan.get_const_list_hatyan(
-     "year"
- )  # components should not be reduced, since higher harmonics are necessary
+
+ # components should not be reduced, since higher harmonics are necessary
+ const_list = hatyan.get_const_list_hatyan("year")
hatyan_settings_ana = dict(
nodalfactors=True,
fu_alltimes=False,
@@ -430,16 +430,18 @@ def get_gemgetij_components(data_pd_meas):
.str.isalpha()
)
comp_iM = comp_frommeasurements_avg.loc[bool_endswithiM]
- comp_av.loc[comp_higherharmonics, "A"] = np.sqrt(
-     (comp_iM["A"] ** 2).sum()
- )  # kwadraatsom
+ # kwadraatsom
+ comp_av.loc[comp_higherharmonics, "A"] = np.sqrt((comp_iM["A"] ** 2).sum())

comp_av.loc["A0"] = comp_frommeasurements_avg.loc["A0"]

+ # values are different than 1991.0 document and differs per station while the
+ # document states "Zoals te verwachten is de verhouding per component tussen deze
+ # wortel en de oorspronkelijke amplitude voor alle plaatsen gelijk"
logger.debug(
"verhouding tussen originele en kwadratensom componenten:\n"
f"{comp_av/comp_frommeasurements_avg.loc[components_av]}"
- )  # values are different than 1991.0 document and differs per station while the document states "Zoals te verwachten is de verhouding per component tussen deze wortel en de oorspronkelijke amplitude voor alle plaatsen gelijk"
+ )

return comp_frommeasurements_avg, comp_av
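The kwadraatsom above replaces the amplitudes of the higher harmonics by a single amplitude carrying the same combined energy: the square root of the sum of squared amplitudes. A standalone numeric sketch (component names and amplitudes are made up):

import numpy as np
import pandas as pd

# hypothetical amplitudes of a group of higher harmonics
comp_iM = pd.DataFrame({"A": [0.12, 0.05, 0.03]}, index=["M4", "MS4", "MN4"])
A_combined = np.sqrt((comp_iM["A"] ** 2).sum())
print(round(A_combined, 4))  # 0.1334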

@@ -464,18 +466,16 @@ def reshape_signal(ts, ts_ext, HW_goal, LW_goal, tP_goal=None):
bool_HW = ts_ext["HWLWcode"] == 1
idx_HW = np.where(bool_HW)[0]
timesHW = ts_ext.index[idx_HW]
- timesLW = ts_ext.index[
-     idx_HW[:-1] + 1
- ]  # assuming alternating 1,2,1 or 1,3,1, this is always valid in this workflow
+ # assuming alternating 1,2,1 or 1,3,1, this is always valid in this workflow
+ timesLW = ts_ext.index[idx_HW[:-1] + 1]

# crop from first to last HW (rest is not scaled anyway)
ts_time_firstHW = ts_ext[bool_HW].index[0]
ts_time_lastHW = ts_ext[bool_HW].index[-1]
ts_corr = ts.copy().loc[ts_time_firstHW:ts_time_lastHW]

ts_corr["timedelta"] = (
ts_corr.index
) # this is necessary since datetimeindex with freq is not editable, and Series is editable
# this is necessary since datetimeindex with freq is not editable, and Series is editable
ts_corr["timedelta"] = (ts_corr.index)
for i in np.arange(0, len(timesHW) - 1):
HW1_val = ts_corr.loc[timesHW[i], "values"]
HW2_val = ts_corr.loc[timesHW[i + 1], "values"]
Expand All @@ -492,9 +492,8 @@ def reshape_signal(ts, ts_ext, HW_goal, LW_goal, tP_goal=None):
temp2 = (
ts_corr.loc[timesLW[i] : timesHW[i + 1], "values"] - LW_val
) / TR2_val * TR_goal + LW_goal
- temp = pd.concat(
-     [temp1, temp2.iloc[1:]]
- )  # .iloc[1:] since timesLW[i] is in both timeseries (values are equal)
+ # .iloc[1:] since timesLW[i] is in both timeseries (values are equal)
+ temp = pd.concat([temp1, temp2.iloc[1:]])
ts_corr["values_new"] = temp

tide_HWtoHW = ts_corr.loc[timesHW[i] : timesHW[i + 1]]
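The rescaling loop in reshape_signal maps each observed tide segment linearly onto the goal curve: subtract the observed low water, scale by the ratio of goal tidal range to observed tidal range, and add the goal low water back. A worked sketch with made-up numbers:

# observed segment: LW_val = -0.8 m, HW_val = 1.2 m  ->  TR_val = 2.0 m
# goal curve:       LW_goal = -1.0 m, HW_goal = 1.5 m ->  TR_goal = 2.5 m
LW_val, TR_val = -0.8, 2.0
LW_goal, TR_goal = -1.0, 2.5

value = 0.2  # an observed water level within the segment
value_new = (value - LW_val) / TR_val * TR_goal + LW_goal
print(value_new)  # 0.25; LW_val maps exactly to LW_goal, HW_val to HW_goal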
5 changes: 3 additions & 2 deletions kenmerkendewaarden/havengetallen.py
@@ -65,13 +65,14 @@ def calc_havengetallen(
"""
raise_extremes_with_aggers(df_ext)
ser_ext = df_ext["values"]

# check if coverage is high enough for havengetallen
if min_coverage is not None:
# TODO: compute_actual_counts only returns years for which there are no nans, so will have different length than expected counts if there is an all-nan year
# TODO: if we supply 4 years of complete data instead of 10 years, no error is raised
- df_actual_counts_peryear = compute_actual_counts(df_ext, freq="Y")
- df_expected_counts_peryear = compute_expected_counts(df_ext, freq="Y")
+ df_actual_counts_peryear = compute_actual_counts(ser_ext, freq="Y")
+ df_expected_counts_peryear = compute_expected_counts(ser_ext, freq="Y")
df_min_counts_peryear = df_expected_counts_peryear * min_coverage
bool_coverage_toolow = df_actual_counts_peryear < df_min_counts_peryear
df_debug = pd.DataFrame(
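The coverage check above flags, per year, whether the number of available extremes reaches min_coverage times the expected count. A simplified standalone sketch (counts are made up; in the real code they come from compute_actual_counts and compute_expected_counts):

import pandas as pd

actual = pd.Series({2010: 1380, 2011: 690, 2012: 1402})
expected = pd.Series({2010: 1410, 2011: 1410, 2012: 1410})

min_coverage = 0.95
bool_coverage_toolow = actual < expected * min_coverage
print(bool_coverage_toolow)  # only 2011 fails the 95% requirement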
