Speed up metrics computation by optimizing segment validation #1338
Conversation
Script for testing:

```python
import time
import json

import numpy as np
import pandas as pd
from loguru import logger

from etna.models import NaiveModel
from etna.datasets import TSDataset, generate_ar_df
from etna.metrics import MAE
from etna.pipeline import Pipeline

HORIZON = 14


def make_df(num_segments: int, num_features: int, num_periods: int, random_state: int = 0) -> pd.DataFrame:
    rng = np.random.default_rng(random_state)
    df = generate_ar_df(
        periods=num_periods, start_time="2020-01-01", n_segments=num_segments
    )
    for i in range(num_features):
        # add int column
        df[f"new_int_{i}"] = rng.integers(low=-100, high=100, size=df.shape[0])
    return df


def check_time(num_segments: int, num_features: int, num_periods: int = 365):
    df = make_df(num_segments=num_segments, num_features=num_features, num_periods=num_periods)
    df_wide = TSDataset.to_dataset(df)
    ts = TSDataset(df=df_wide, freq="D")

    model = NaiveModel(lag=1)
    transforms = []
    pipeline = Pipeline(model=model, transforms=transforms, horizon=HORIZON)

    start_time = time.perf_counter()
    metrics, _, _ = pipeline.backtest(ts=ts, metrics=[MAE()], n_folds=3)
    elapsed_time = time.perf_counter() - start_time

    return elapsed_time


def main():
    num_segments = [10, 100, 1000, 10_000]
    num_features = [0, 3, 10]
    results = []
    for cur_num_segments in num_segments:
        for cur_num_features in num_features:
            time_result = check_time(num_segments=cur_num_segments, num_features=cur_num_features)
            record = {"num_segments": cur_num_segments, "num_features": cur_num_features, "time": time_result}
            results.append(record)
            logger.info(json.dumps(record))

    json.dump(results, open("records_2.json", "w"), indent=2)


if __name__ == "__main__":
    main()
```

Results without changes:

```json
[
{
"num_segments": 10,
"num_features": 0,
"time": 0.3590500030000001
},
{
"num_segments": 10,
"num_features": 3,
"time": 0.4593144890000005
},
{
"num_segments": 10,
"num_features": 10,
"time": 0.3713757409999978
},
{
"num_segments": 100,
"num_features": 0,
"time": 1.2579138940000014
},
{
"num_segments": 100,
"num_features": 3,
"time": 1.4334653250000002
},
{
"num_segments": 100,
"num_features": 10,
"time": 1.562795714
},
{
"num_segments": 1000,
"num_features": 0,
"time": 9.964996322999998
},
{
"num_segments": 1000,
"num_features": 3,
"time": 13.49794635
},
{
"num_segments": 1000,
"num_features": 10,
"time": 15.799086332999998
},
{
"num_segments": 10000,
"num_features": 0,
"time": 104.504586417
},
{
"num_segments": 10000,
"num_features": 3,
"time": 235.44386497699998
},
{
"num_segments": 10000,
"num_features": 10,
"time": 281.90819511
}
]
```

Results with changes in

```json
[
{
"num_segments": 10,
"num_features": 0,
"time": 0.21762080500000014
},
{
"num_segments": 10,
"num_features": 3,
"time": 0.25818068400000005
},
{
"num_segments": 10,
"num_features": 10,
"time": 0.2985194660000001
},
{
"num_segments": 100,
"num_features": 0,
"time": 0.7977682599999998
},
{
"num_segments": 100,
"num_features": 3,
"time": 0.8086340990000007
},
{
"num_segments": 100,
"num_features": 10,
"time": 0.9596903270000006
},
{
"num_segments": 1000,
"num_features": 0,
"time": 6.304263003999999
},
{
"num_segments": 1000,
"num_features": 3,
"time": 6.816117811000002
},
{
"num_segments": 1000,
"num_features": 10,
"time": 8.042914625000002
},
{
"num_segments": 10000,
"num_features": 0,
"time": 62.120309238999994
},
{
"num_segments": 10000,
"num_features": 3,
"time": 70.261712954
},
{
"num_segments": 10000,
"num_features": 10,
"time": 85.14670703500002
}
]
```

Results after removing the extra dataframe selection:

```json
[
{
"num_segments": 10,
"num_features": 0,
"time": 0.21787695200000012
},
{
"num_segments": 10,
"num_features": 3,
"time": 0.2454114409999999
},
{
"num_segments": 10,
"num_features": 10,
"time": 0.27747879200000014
},
{
"num_segments": 100,
"num_features": 0,
"time": 0.6796172610000006
},
{
"num_segments": 100,
"num_features": 3,
"time": 0.678246498
},
{
"num_segments": 100,
"num_features": 10,
"time": 0.7758322989999993
},
{
"num_segments": 1000,
"num_features": 0,
"time": 4.967111137000002
},
{
"num_segments": 1000,
"num_features": 3,
"time": 5.283647876000002
},
{
"num_segments": 1000,
"num_features": 10,
"time": 6.464345726000001
},
{
"num_segments": 10000,
"num_features": 0,
"time": 49.259913931999996
},
{
"num_segments": 10000,
"num_features": 3,
"time": 54.42493236100002
},
{
"num_segments": 10000,
"num_features": 10,
"time": 66.683967212
}
]
```
There was also profiling with py-spy, using this script:

```python
import time
import json

import numpy as np
import pandas as pd
from loguru import logger

from etna.models import NaiveModel
from etna.datasets import TSDataset, generate_ar_df
from etna.metrics import MAE
from etna.pipeline import Pipeline

HORIZON = 14


def make_df(num_segments: int, num_features: int, num_periods: int, random_state: int = 0) -> pd.DataFrame:
    rng = np.random.default_rng(random_state)
    df = generate_ar_df(
        periods=num_periods, start_time="2020-01-01", n_segments=num_segments
    )
    for i in range(num_features):
        # add int column
        df[f"new_int_{i}"] = rng.integers(low=-100, high=100, size=df.shape[0])
    return df


def check_time(num_segments: int, num_features: int, num_periods: int = 365):
    df = make_df(num_segments=num_segments, num_features=num_features, num_periods=num_periods)
    df_wide = TSDataset.to_dataset(df)
    ts = TSDataset(df=df_wide, freq="D")

    model = NaiveModel(lag=1)
    transforms = []
    pipeline = Pipeline(model=model, transforms=transforms, horizon=HORIZON)

    start_time = time.perf_counter()
    metrics, _, _ = pipeline.backtest(ts=ts, metrics=[MAE()], n_folds=3)
    elapsed_time = time.perf_counter() - start_time

    return elapsed_time


def main():
    check_time(num_segments=10_000, num_features=3)


if __name__ == "__main__":
    main()
```

Key notions:
🚀 Deployed on https://deploy-preview-1338--etna-docs.netlify.app
Codecov Report
```
@@            Coverage Diff             @@
##           master    #1338      +/-   ##
==========================================
+ Coverage   89.09%   89.12%   +0.02%
==========================================
  Files         204      204
  Lines       12642    12665      +23
==========================================
+ Hits        11264    11288      +24
+ Misses       1378     1377       -1
```
We actually need to investigate this deeper. Places to speed up:
- Per-segment iteration here -- do we really need to check each segment separately? I guess this is done only for the convenient error message.
- We should validate the timestamps and NaNs in a vectorized way here -- first check for the existence of NaNs in the passed datasets and then compare the index. We don't need to do it in a per-segment fashion. This way we don't need to call `dropna` and `_validate_timestamp_columns` in the loop.
- For the built-in metrics (MAE, SMAPE, ...) we can implement a vectorized version of `metric_fn`; this might be hard to do without changing the base classes. At least try adding the `@njit` decorator to speed up the computation. One solution is to create a separate class `VectorizedMetric(Metric)` and override `__call__` (see the sketch below).
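A minimal sketch of the vectorized idea, operating directly on the wide dataframes. The helper name, the MAE formula, and the assumption that both frames are already validated and share the same index are illustrative, not the PR's implementation; a `VectorizedMetric(Metric)` subclass could delegate its `__call__` to something like this:

```python
import numpy as np
import pandas as pd


def mae_per_segment(df_true: pd.DataFrame, df_pred: pd.DataFrame) -> dict:
    # Both frames are assumed to be wide TSDataset frames with a
    # (segment, feature) column MultiIndex, already validated and aligned.
    true_target = df_true.loc[:, pd.IndexSlice[:, "target"]].sort_index(axis=1)
    pred_target = df_pred.loc[:, pd.IndexSlice[:, "target"]].sort_index(axis=1)
    # One numpy reduction over the timestamp axis instead of a per-segment loop.
    values = np.abs(true_target.values - pred_target.values).mean(axis=0)
    segments = true_target.columns.get_level_values("segment")
    return dict(zip(segments, values))
```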
Results after reworking:

```json
[
{
"num_segments": 10,
"num_features": 0,
"time": 0.2025824759999999
},
{
"num_segments": 10,
"num_features": 3,
"time": 0.24138570799999925
},
{
"num_segments": 10,
"num_features": 10,
"time": 0.272586608000001
},
{
"num_segments": 100,
"num_features": 0,
"time": 0.6173706429999992
},
{
"num_segments": 100,
"num_features": 3,
"time": 0.622621066999999
},
{
"num_segments": 100,
"num_features": 10,
"time": 0.7178853730000014
},
{
"num_segments": 1000,
"num_features": 0,
"time": 4.244795499
},
{
"num_segments": 1000,
"num_features": 3,
"time": 4.704148591000001
},
{
"num_segments": 1000,
"num_features": 10,
"time": 5.926159300000002
},
{
"num_segments": 10000,
"num_features": 0,
"time": 40.73026384199999
},
{
"num_segments": 10000,
"num_features": 3,
"time": 46.77938691599999
},
{
"num_segments": 10000,
"num_features": 10,
"time": 60.692529713
}
]
```
Results after iteration optimization:

```json
[
{
"num_segments": 10,
"num_features": 0,
"time": 0.22758442000000034
},
{
"num_segments": 10,
"num_features": 3,
"time": 0.303228195
},
{
"num_segments": 10,
"num_features": 10,
"time": 0.2589208110000003
},
{
"num_segments": 100,
"num_features": 0,
"time": 0.5286539329999993
},
{
"num_segments": 100,
"num_features": 3,
"time": 0.575157707999999
},
{
"num_segments": 100,
"num_features": 10,
"time": 0.6489074320000014
},
{
"num_segments": 1000,
"num_features": 0,
"time": 3.4659758200000006
},
{
"num_segments": 1000,
"num_features": 3,
"time": 4.007518799999998
},
{
"num_segments": 1000,
"num_features": 10,
"time": 5.0106977860000015
},
{
"num_segments": 10000,
"num_features": 0,
"time": 33.155186359
},
{
"num_segments": 10000,
"num_features": 3,
"time": 39.375860941000006
},
{
"num_segments": 10000,
"num_features": 10,
"time": 53.37710269499999
}
]
```
etna/metrics/base.py (outdated)

```python
df_true = y_true.df.loc[:, pd.IndexSlice[:, "target"]].sort_index(axis=1)
df_pred = y_pred.df.loc[:, pd.IndexSlice[:, "target"]].sort_index(axis=1)

df_true_isna = df_true.isna()
```
Why can't we just check that both `df_true_isna` and `df_pred_isna` sum to 0? As I understand it, we also need to compare the index here; does `equals` do that?
I'm not sure that your suggested solution gives the same result as the initial one. In the initial solution we select a segment from `ts`; that uses `first_valid_index` under the hood and skips the leading NaNs.
Here we apply `first_valid_index` to the whole dataframe, and it skips only the NaNs that are present in all segments. So if segments start at different timestamps, some of them will still contain NaNs.
`DataFrame.equals` compares values taking the index into account.
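A small example of that behaviour (toy data):

```python
import pandas as pd

a = pd.Series([1.0, 2.0], index=pd.to_datetime(["2020-01-01", "2020-01-02"]))
b = pd.Series([1.0, 2.0], index=pd.to_datetime(["2020-01-02", "2020-01-03"]))

print(a.equals(b))  # False: identical values, but the indexes differ
print(a.reset_index(drop=True).equals(b.reset_index(drop=True)))  # True once the indexes match
```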
Ok, I made a mistake: in the initial solution we check timestamps after `dropna`, so we can't really check that the sum of `isna` is zero; that would not give an equivalent result.
If we want to make a non-equivalent check, we should discuss what kind of check we really want here, because I'm not sure the existing check is reasonable enough.
Here we need to check the following things (see the sketch below):
- There are no NaNs in the datasets
- The indexes of the datasets are the same
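Both checks can be expressed without a per-segment loop; a minimal sketch (the function name and error messages are illustrative, not the PR's code):

```python
import pandas as pd


def validate_targets(df_true: pd.DataFrame, df_pred: pd.DataFrame) -> None:
    # Check 1: no NaNs anywhere in either frame.
    if df_true.isna().values.any() or df_pred.isna().values.any():
        raise ValueError("y_true or y_pred contains NaNs")
    # Check 2: both frames cover exactly the same timestamps.
    if not df_true.index.equals(df_pred.index):
        raise ValueError("y_true and y_pred have different timestamps")
```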
```python
metrics_per_segment[segment] = self.metric_fn(
    y_true=y_true[:, segment, "target"].values, y_pred=y_pred[:, segment, "target"].values, **self.kwargs
)
segments = df_true.columns.get_level_values("segment").unique()
```
Shouldn't it be sorted, since the index in the dataframe is sorted?
Maybe we need a test for such behaviour (input datasets having unsorted segments).
Yes, it will be sorted because we sorted the index of `df_true`. Also, we have a guarantee that `unique` returns values in the order of their appearance.
Ok, I'll try to add a test for this.
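A possible shape for such a test, as a standalone pandas sketch of the property discussed above (hypothetical test name and data, not the test actually added in the PR):

```python
import pandas as pd


def test_segments_are_sorted_for_unsorted_input():
    # Even if the input frame lists segments in unsorted order,
    # sorting the columns first makes `unique()` return them sorted.
    columns = pd.MultiIndex.from_tuples(
        [("b", "target"), ("a", "target")], names=["segment", "feature"]
    )
    df_true = pd.DataFrame([[1.0, 2.0]], columns=columns).sort_index(axis=1)
    segments = df_true.columns.get_level_values("segment").unique()
    assert list(segments) == ["a", "b"]
```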
Before submitting (must do checklist)
Proposed Changes
- `_validate_segment_columns`
Closing issues
Closes #1336.