Skip to content

Commit

Permalink
linear fit for slotgemiddelde (#157)
Browse files Browse the repository at this point in the history
* linear fit for slotgemiddelde (was default with nodal)

* simplified slotgemiddelde code

* remove with_AR option

* updated tests

* updated docstring
  • Loading branch information
veenstrajelmer authored Oct 15, 2024
1 parent 87b8820 commit 10cf370
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 60 deletions.
1 change: 1 addition & 0 deletions docs/whats-new.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

### Feat
- expanded physical_break_dict in [#151](https://github.com/Deltares-research/kenmerkendewaarden/pull/151)
- linear fit for slotgemiddelden (no nodal) in [#157](https://github.com/Deltares-research/kenmerkendewaarden/pull/157)


## 0.3.0 (2024-10-01)
Expand Down
84 changes: 38 additions & 46 deletions kenmerkendewaarden/slotgemiddelden.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,13 @@ def calc_slotgemiddelden(
df_ext: pd.DataFrame = None,
min_coverage: float = None,
clip_physical_break: bool = False,
with_nodal: bool = True,
):
"""
Compute slotgemiddelden from measurement timeseries and optionally also from extremes timeseries.
A simple linear trend is used to avoid all pretend-accuracy. However, when fitting a
linear trend on a limited amount of data, the nodal cycle and wind effects will cause
the model fit to be inaccurate. It is wise to use at least 30 years of data for
a valid fit, this is >1.5 times the nodal cycle.
Parameters
----------
Expand All @@ -42,8 +45,6 @@ def calc_slotgemiddelden(
Set yearly means to nans for years that do not have sufficient data coverage. The default is None.
clip_physical_break : bool, optional
Whether to exclude the part of the timeseries before physical breaks like estuary closures. The default is False.
with_nodal : bool, optional
Whether to include a nodal cycle in the linear trend model. The default is True.
Returns
-------
Expand Down Expand Up @@ -71,7 +72,7 @@ def calc_slotgemiddelden(
wl_mean_peryear = clip_timeseries_physical_break(wl_mean_peryear)

# fit linear models over yearly mean values
pred_pd_wl = fit_models(wl_mean_peryear, with_nodal=with_nodal)
pred_pd_wl = predict_linear_model(wl_mean_peryear)
slotgemiddelden_dict["wl_model_fit"] = pred_pd_wl

if df_ext is not None:
Expand Down Expand Up @@ -103,9 +104,9 @@ def calc_slotgemiddelden(
tidalrange_mean_peryear = clip_timeseries_physical_break(tidalrange_mean_peryear)

# fit linear models over yearly mean values
pred_pd_HW = fit_models(HW_mean_peryear, with_nodal=with_nodal)
pred_pd_LW = fit_models(LW_mean_peryear, with_nodal=with_nodal)
pred_pd_tidalrange = fit_models(tidalrange_mean_peryear, with_nodal=with_nodal)
pred_pd_HW = predict_linear_model(HW_mean_peryear)
pred_pd_LW = predict_linear_model(LW_mean_peryear)
pred_pd_tidalrange = predict_linear_model(tidalrange_mean_peryear)
slotgemiddelden_dict["HW_model_fit"] = pred_pd_HW
slotgemiddelden_dict["LW_model_fit"] = pred_pd_LW
slotgemiddelden_dict["tidalrange_model_fit"] = pred_pd_tidalrange
Expand Down Expand Up @@ -210,13 +211,13 @@ def plot_slotgemiddelden(
return fig, ax


def fit_models(mean_array_todate: pd.Series, with_nodal=True) -> pd.DataFrame:
def predict_linear_model(ser: pd.Series, with_nodal=False) -> pd.DataFrame:
"""
Fit linear model over yearly means in mean_array_todate, including five years in the future.
Parameters
----------
mean_array_todate : pd.Series
ser : pd.Series
DESCRIPTION.
Returns
Expand All @@ -226,61 +227,52 @@ def fit_models(mean_array_todate: pd.Series, with_nodal=True) -> pd.DataFrame:
"""

start = mean_array_todate.index.min()
end = mean_array_todate.index.max() + 1
start = ser.index.min()
end = ser.index.max() + 1

logger.info(f"fit linear model (with_nodal={with_nodal}) for {start} to {end}")
logger.info(f"fit linear model for {start} to {end}")

# We'll just use the years. This assumes that annual waterlevels are used that are stored left-padded, the mean waterlevel for 2020 is stored as 2020-1-1. This is not logical, but common practice.
# generate contiguous timeseries including gaps and including slotgemiddelde year
allyears_dt = pd.period_range(start=start, end=end)
mean_array_allyears = pd.Series(mean_array_todate, index=allyears_dt)
ser_allyears = pd.Series(ser, index=allyears_dt)

mean_array_allyears_nonans = mean_array_allyears.loc[~mean_array_allyears.isnull()]
if len(mean_array_allyears_nonans) < 2:
ser_nonans = ser_allyears.loc[~ser_allyears.isnull()]
if len(ser_nonans) < 2:
raise ValueError(
f"nan-filtered timeseries has only one timestep, cannot perform model fit:\n{mean_array_allyears_nonans}"
f"nan-filtered timeseries has only one timestep, cannot perform model fit:\n{ser_nonans}"
)

# convert to dataframe of expected format
# TODO: make functions accept mean_array instead of df as argument?
df = pd.DataFrame(
{"year": mean_array_allyears.index.year, "height": mean_array_allyears.values}
)

# below methods are copied from https://github.com/openearth/sealevel/blob/master/slr/slr/models.py
# TODO: install slr package as dependency or keep separate?
fit, _, X = linear_model(df, with_wind=False, with_nodal=with_nodal)
pred_linear = fit.predict(X)
# get model fit
fit, _, X = fit_linear_model(ser_allyears, with_nodal=with_nodal)

linear_fit = pd.Series(pred_linear, index=allyears_dt, name="values")
linear_fit.index.name = mean_array_todate.index.name
return linear_fit
# predict
pred_arr = fit.predict(X)
pred_pd = pd.Series(pred_arr, index=allyears_dt, name="values")
pred_pd.index.name = ser.index.name
return pred_pd


# copied from https://github.com/openearth/sealevel/blob/master/slr/slr/models.py
def linear_model(df, with_wind=True, with_ar=True, with_nodal=True, quantity="height"):
"""Define the linear model with optional wind and autoregression.
See the latest report for a detailed description.
def fit_linear_model(df, with_nodal=False):
"""
Define the linear model with constant and trend, optionally with nodal.
simplified from https://github.com/openearth/sealevel/blob/master/slr/slr/models.py
"""

y = df[quantity]
X = np.c_[df["year"] - 1970,]
# month = np.mod(df['year'], 1) * 12.0
# just use the years. This assumes that annual waterlevels are used that are
# stored left-padded, the mean waterlevel for 2020 is stored as 2020-1-1
# This is not logical, but common practice.
years = df.index.year
y = df.values
X = np.c_[years - 1970,]
names = ["Constant", "Trend"]
if with_nodal:
X = np.c_[
X,
np.cos(2 * np.pi * (df["year"] - 1970) / 18.613),
np.sin(2 * np.pi * (df["year"] - 1970) / 18.613),
np.cos(2 * np.pi * (years - 1970) / 18.613),
np.sin(2 * np.pi * (years - 1970) / 18.613),
]
names.extend(["Nodal U", "Nodal V"])
if with_wind:
X = np.c_[X, df["u2"], df["v2"]]
names.extend(["Wind $u^2$", "Wind $v^2$"])
X = sm.add_constant(X)
if with_ar:
model = sm.GLSAR(y, X, missing="drop", rho=1)
else:
model = sm.OLS(y, X, missing="drop")
model = sm.OLS(y, X, missing="drop")
fit = model.fit(cov_type="HC0")
return fit, names, X
29 changes: 15 additions & 14 deletions tests/test_slotgemiddelden.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,29 +21,30 @@ def test_calc_slotgemiddelden_outputtype(df_meas_2010_2014, df_ext_12_2010_2014)


@pytest.mark.unittest
def test_fit_models(df_meas_2010_2014):
def test_predict_linear_model(df_meas_2010_2014):
dict_wltidalindicators_valid = kw.calc_wltidalindicators(
df_meas_2010_2014
) # 24*365=8760 (hourly interval), 24/3*365=2920 (3-hourly interval)
wl_mean_peryear_valid = dict_wltidalindicators_valid["wl_mean_peryear"]

wl_model_fit_nodal = kw.slotgemiddelden.fit_models(
wl_model_fit_nodal = kw.slotgemiddelden.predict_linear_model(
wl_mean_peryear_valid, with_nodal=True
)
nodal_expected = np.array(
[0.0141927, 0.08612119, 0.0853051, 0.07010864, 0.10051922, 0.23137634]
[0.07860955, 0.08999961, 0.07954378, 0.07398706, 0.09952146, 0.17882958]
)
assert np.allclose(wl_model_fit_nodal.values, nodal_expected)

wl_model_fit_linear = kw.slotgemiddelden.fit_models(
wl_model_fit_linear = kw.slotgemiddelden.predict_linear_model(
wl_mean_peryear_valid, with_nodal=False
)
linear_expected = np.array(
[0.07851414, 0.0813139, 0.08411366, 0.08691342, 0.08971318, 0.09251294]
[0.07917004, 0.08175116, 0.08433229, 0.08691342, 0.08949454, 0.09207567]
)
assert np.allclose(wl_model_fit_linear.values, linear_expected)



@pytest.mark.unittest
def test_calc_slotgemiddelden(df_meas_2010_2014, df_ext_12_2010_2014):
slotgemiddelden_dict_inclext = kw.calc_slotgemiddelden(
Expand Down Expand Up @@ -94,14 +95,14 @@ def test_calc_slotgemiddelden(df_meas_2010_2014, df_ext_12_2010_2014):
)

# fmt: off
wl_model_fit_expected = np.array([0.0141927, 0.08612119, 0.0853051,
0.07010864, 0.10051922, 0.23137634])
hw_model_fit_expected = np.array([1.05295416, 1.12875177, 1.13988685,
1.1415461, 1.18998584, 1.336182])
lw_model_fit_expected = np.array([-0.67420399, -0.59089362, -0.59342291,
-0.61334278, -0.58024113, -0.42969074])
range_model_fit_expected = np.array([1.72715816, 1.71964539, 1.73330976,
1.75488888, 1.77022697, 1.76587273])
wl_model_fit_expected = np.array([0.07917004, 0.08175116, 0.08433229,
0.08691342, 0.08949454, 0.09207567])
hw_model_fit_expected = np.array([1.12529394, 1.13663287, 1.14797179,
1.15931071, 1.17064963, 1.18198856])
lw_model_fit_expected = np.array([-0.60236402, -0.59953375, -0.59670349,
-0.59387323, -0.59104297, -0.58821271])
range_model_fit_expected = np.array([1.72765796, 1.73616662, 1.74467528,
1.75318394, 1.7616926, 1.77020126])
# fmt: on
assert np.allclose(
slotgemiddelden_dict_inclext["wl_model_fit"].values, wl_model_fit_expected
Expand Down

0 comments on commit 10cf370

Please sign in to comment.