-
Notifications
You must be signed in to change notification settings - Fork 1.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Forecast Scenario Notebook for Local and Remote Inferencing (#3429)
* Initial commit with the codes * Working batch inference with gap * Cleanup code and delete outputs * Fix format issues * Working batch inf e2e * Fix formatting issues --------- Co-authored-by: Rahul Kumar <[email protected]>
- Loading branch information
Showing
8 changed files
with
2,036 additions
and
0 deletions.
There are no files selected for viewing
716 changes: 716 additions & 0 deletions
716
...toml-forecasting-forecast-function/auto-ml-forecasting-function-gap-batch-inference.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
502 changes: 502 additions & 0 deletions
502
...toml-forecasting-forecast-function/auto-ml-forecasting-function-gap-local-inference.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
631 changes: 631 additions & 0 deletions
631
...jobs/automl-forecasting-forecast-function/auto-ml-forecasting-function-gap-training.ipynb
Large diffs are not rendered by default.
Oops, something went wrong.
67 changes: 67 additions & 0 deletions
67
...dalone-jobs/automl-forecasting-forecast-function/forecasting_script/forecasting_script.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
""" | ||
This is the script that is executed on the compute instance. It relies | ||
on the model.pkl file which is uploaded along with this script to the | ||
compute instance. | ||
""" | ||
|
||
import os | ||
|
||
import pandas as pd | ||
|
||
from azureml.core import Dataset, Run | ||
import joblib | ||
from pandas.tseries.frequencies import to_offset | ||
|
||
|
||
def init():
    """
    Prepare the module-level state that ``run`` depends on.

    Reads the target column name from the ``TARGET_COLUMN_NAME`` environment
    variable and loads ``model.pkl`` from the directory named by the
    ``AZUREML_MODEL_DIR`` environment variable. Both results are stored in
    module globals (``target_column_name`` and ``fitted_model``).

    :raises KeyError: if either environment variable is not set.
    """
    global target_column_name
    global fitted_model

    target_column_name = os.environ["TARGET_COLUMN_NAME"]

    # AZUREML_MODEL_DIR is an environment variable created during deployment.
    # It is the path to the model folder (./azureml-models).
    # Insert your model's folder name into the path below if there is one.
    model_dir = os.environ["AZUREML_MODEL_DIR"]
    fitted_model = joblib.load(os.path.join(model_dir, "model.pkl"))
|
||
|
||
def run(mini_batch):
    """
    Score one mini batch of input files with the globally loaded model.

    Every path whose extension is ``.parquet`` or ``.csv`` is read into a
    data frame, the target column is set aside, and quantile forecasts are
    requested from the global ``fitted_model``. Paths with any other
    extension are skipped.

    :param mini_batch: iterable of file paths to score.
    :returns: one data frame combining the results for all scored files. It
        holds the input columns plus the target column, a
        ``prediction_interval`` string column and a ``predicted`` column
        (the 0.5 quantile). Rows where the target or the prediction is null
        are dropped. An empty data frame is returned when no file in the
        mini batch was scored.
    :rtype: pandas.DataFrame
    """
    print(f"run method start: {__file__}, run({mini_batch})")

    # Default quantiles: the bounds of a central 95% prediction interval
    # (2.5% and 97.5%) plus the median, which is used as the point forecast.
    quantiles = [0.025, 0.5, 0.975]
    lower_quantile, upper_quantile = min(quantiles), max(quantiles)
    predicted_column_name = "predicted"
    interval_column_name = "prediction_interval"

    results = []
    for test in mini_batch:
        extension = os.path.splitext(test)[-1]
        if extension == ".parquet":
            X_test = pd.read_parquet(test)
        elif extension == ".csv":
            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
        else:
            continue  # Skip if it's neither a Parquet nor CSV file

        y_test = X_test.pop(target_column_name).values

        fitted_model.quantiles = quantiles
        pred_quantiles = fitted_model.forecast_quantiles(
            X_test, ignore_data_errors=True
        )

        # Each row passed to the lambda is labelled by quantile value, so the
        # two bounds are picked by position; plain ``row[0]`` would depend on
        # the deprecated positional fallback of ``Series.__getitem__``.
        interval_bounds = pred_quantiles[[lower_quantile, upper_quantile]]
        pred_quantiles[interval_column_name] = interval_bounds.apply(
            lambda row: "[{}, {}]".format(row.iloc[0], row.iloc[1]), axis=1
        )

        X_test[target_column_name] = y_test
        X_test[interval_column_name] = pred_quantiles[interval_column_name].values
        X_test[predicted_column_name] = pred_quantiles[0.5].values

        # Drop rows where prediction or actuals are NaN. This happens because
        # of missing actuals or at edges of time due to lags/rolling windows.
        has_actual_and_prediction = (
            X_test[[target_column_name, predicted_column_name]].notnull().all(axis=1)
        )
        clean = X_test[has_actual_and_prediction]
        print(
            f"The predictions have {clean.shape[0]} rows and {clean.shape[1]} columns."
        )

        results.append(clean)

    if not results:
        # pd.concat raises ValueError on an empty list; an empty or fully
        # skipped mini batch simply yields no rows.
        return pd.DataFrame()

    return pd.concat(results, sort=False, ignore_index=True)
1 change: 1 addition & 0 deletions
1
...s/automl-forecasting-forecast-function/forecasting_script/parallel_run_step.settings.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"append_row": {"pandas.DataFrame.to_csv": {"sep": ","}}} |
119 changes: 119 additions & 0 deletions
119
sdk/python/jobs/automl-standalone-jobs/automl-forecasting-forecast-function/helper.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
# Generate synthetic data | ||
|
||
import pandas as pd | ||
import numpy as np | ||
|
||
|
||
def get_timeseries(
    train_len: int,
    test_len: int,
    time_column_name: str,
    target_column_name: str,
    time_series_id_column_name: str,
    time_series_number: int = 1,
    freq: str = "H",
):
    """
    Build synthetic time series and split them into train and test parts.

    Every series starts at 2000-01-01 and has ``train_len + test_len`` rows.
    The target is a ramp (0, 1, 2, ...) plus uniform noise from
    ``np.random.rand`` plus an offset of ``5 * series index``. Each frame
    also carries an ``ext_predictor`` column counting up from 42 and an id
    column holding ``"ts<index>"``.

    :param train_len: The length of training data (one series).
    :type train_len: int
    :param test_len: The length of testing data (one series).
    :type test_len: int
    :param time_column_name: The desired name of the time column.
    :type time_column_name: str
    :param target_column_name: The desired name of the target column.
    :type target_column_name: str
    :param time_series_id_column_name: The desired name of the column that
        identifies the series.
    :type time_series_id_column_name: str
    :param time_series_number: The number of time series in the data set.
    :type time_series_number: int
    :param freq: The frequency string representing pandas offset.
        see https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html
    :type freq: str
    :returns: ``(X_train, y_train, X_test, y_test)`` where the ``X`` items
        are data frames without the target column and the ``y`` items are
        arrays of the target values.
    :rtype: tuple
    """
    total_len = train_len + test_len
    train_parts = []
    test_parts = []

    for series_idx in range(time_series_number):
        timestamps = pd.date_range(start="2000-01-01", periods=total_len, freq=freq)
        ramp_with_noise = np.arange(total_len).astype(float) + np.random.rand(
            total_len
        )
        series_id = "ts{}".format(series_idx)
        frame = pd.DataFrame(
            {
                time_column_name: timestamps,
                target_column_name: ramp_with_noise + series_idx * 5,
                "ext_predictor": np.asarray(range(42, 42 + total_len)),
                time_series_id_column_name: np.repeat(series_id, total_len),
            }
        )
        # The first train_len rows are for training, the remainder for testing.
        train_parts.append(frame[:train_len])
        test_parts.append(frame[train_len:])

    X_train = pd.concat(train_parts)
    X_test = pd.concat(test_parts)
    y_train = X_train.pop(target_column_name).values
    y_test = X_test.pop(target_column_name).values
    return X_train, y_train, X_test, y_test
|
||
|
||
def make_forecasting_query(
    fulldata, time_column_name, target_column_name, forecast_origin, horizon, lookback
):
    """
    Build a forecasting query around ``forecast_origin`` from a full data set.

    Rows in the window ``(forecast_origin - lookback, forecast_origin]`` are
    kept as context together with their known target values. Rows in the
    window ``(forecast_origin, forecast_origin + horizon]`` are the ones to
    be predicted, so their target values are replaced with NaN. The shapes
    of the intermediate pieces are printed.

    fulldata: pandas.DataFrame           a time series dataset. Needs to contain X and y.
    time_column_name: string             which column (must be in fulldata) is the time axis
    target_column_name: string           which column (must be in fulldata) is to be forecast
    forecast_origin: datetime type       the last time we (pretend to) have target values
    horizon: timedelta                   how far forward, in time units (not periods)
    lookback: timedelta                  how far back does the model look

    Returns ``(X_pred, y_pred)``: the context rows followed by the future
    rows, without the target column, and the matching target array (known
    values first, then NaN).

    Example:

    ```
    forecast_origin = pd.to_datetime("2012-09-01") + pd.DateOffset(days=5)  # forecast 5 days after end of training
    print(forecast_origin)

    X_query, y_query = make_forecasting_query(data,
                        time_column_name = "date",      # name of the time column in data
                        target_column_name = "demand",  # name of the target column in data
                        forecast_origin = forecast_origin,
                        horizon = pd.DateOffset(days=7),   # 7 days into the future
                        lookback = pd.DateOffset(days=1),  # model has lag 1 period (day)
                        )
    ```
    """
    timestamps = fulldata[time_column_name]
    in_lookback = (timestamps > forecast_origin - lookback) & (
        timestamps <= forecast_origin
    )
    in_horizon = (timestamps > forecast_origin) & (
        timestamps <= forecast_origin + horizon
    )

    X_past = fulldata[in_lookback]
    X_future = fulldata[in_horizon]

    y_past = X_past.pop(target_column_name).values.astype(float)
    y_future = X_future.pop(target_column_name).values.astype(float)

    # The future targets are what we ask the model for: replace them with NaN.
    y_query = np.full_like(y_future, np.nan)

    print(f"X_past is {X_past.shape} - shaped")
    print(f"X_future is {X_future.shape} - shaped")
    print(f"y_past is {y_past.shape} - shaped")
    print(f"y_query is {y_query.shape} - shaped")

    X_pred = pd.concat([X_past, X_future])
    y_pred = np.concatenate([y_past, y_query])
    return X_pred, y_pred
Binary file added
BIN
+68.9 KB
...jobs/automl-forecasting-forecast-function/images/forecast_function_at_train.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added
BIN
+65.2 KB
...toml-forecasting-forecast-function/images/forecast_function_away_from_train.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.