diff --git a/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-forecast-function/forecasting_script/forecasting_script.py b/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-forecast-function/forecasting_script/forecasting_script.py
index 513be30b7d5..3531119154b 100644
--- a/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-forecast-function/forecasting_script/forecasting_script.py
+++ b/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-forecast-function/forecasting_script/forecasting_script.py
@@ -29,14 +29,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        if os.path.splitext(test)[-1] == ".parquet":
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
             X_test = pd.read_parquet(test)
-        elif os.path.splitext(test)[-1] == ".csv":
+        elif file_ext == ".csv":
             X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
         else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
             continue  # Skip if it's neither a Parquet nor CSV file

-        y_test = X_test.pop(target_column_name).values
+        if target_column_name in X_test.columns:
+            y_test = X_test.pop(target_column_name).values
+        else:
+            y_test = None

         # We have default quantiles values set as below(95th percentile)
         quantiles = [0.025, 0.5, 0.975]
@@ -49,15 +54,21 @@
         pred_quantiles[PI] = pred_quantiles[[min(quantiles), max(quantiles)]].apply(
             lambda x: "[{}, {}]".format(x[0], x[1]), axis=1
         )
-        X_test[target_column_name] = y_test
+        if y_test is not None:
+            X_test[target_column_name] = y_test
         X_test[PI] = pred_quantiles[PI].values
         X_test[predicted_column_name] = pred_quantiles[0.5].values

         # drop rows where prediction or actuals are nan
         # happens because of missing actuals
         # or at edges of time due to lags/rolling windows
-        clean = X_test[
-            X_test[[target_column_name, predicted_column_name]].notnull().all(axis=1)
-        ]
+        if target_column_name in X_test.columns:
+            clean = X_test[
+                X_test[[target_column_name, predicted_column_name]]
+                .notnull()
+                .all(axis=1)
+            ]
+        else:
+            clean = X_test[X_test[predicted_column_name].notnull()]
         print(
             f"The predictions have {clean.shape[0]} rows and {clean.shape[1]} columns."
         )
diff --git a/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-github-dau/helpers/forecasting_script.py b/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-github-dau/helpers/forecasting_script.py
index 5072bcfcfe1..396fda52dea 100644
--- a/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-github-dau/helpers/forecasting_script.py
+++ b/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-github-dau/helpers/forecasting_script.py
@@ -32,9 +32,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        if not test.endswith(".csv"):
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
+            X_test = pd.read_parquet(test)
+        elif file_ext == ".csv":
+            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+        else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
             continue
-        X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+
+        if target_column_name not in X_test.columns:
+            raise ValueError(
+                f"Target column `{target_column_name}` not found in the test data, required for rolling forecast."
+            )
         y_test = X_test.pop(target_column_name).values

         # Make a rolling forecast, advancing the forecast origin by 1 period on each iteration through the test set
diff --git a/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-orange-juice-sales/forecast/forecasting_script.py b/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-orange-juice-sales/forecast/forecasting_script.py
index 56135869a89..87f4e961c4f 100644
--- a/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-orange-juice-sales/forecast/forecasting_script.py
+++ b/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-orange-juice-sales/forecast/forecasting_script.py
@@ -29,11 +29,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        if os.path.splitext(test)[-1] != ".csv":
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
+            X_test = pd.read_parquet(test)
+        elif file_ext == ".csv":
+            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+        else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
             continue
-        X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
-        y_test = X_test.pop(target_column_name).values
+        if target_column_name in X_test.columns:
+            y_test = X_test.pop(target_column_name).values
+        else:
+            y_test = None

         # We have default quantiles values set as below(95th percentile)
         quantiles = [0.025, 0.5, 0.975]
@@ -46,15 +54,21 @@
         pred_quantiles[PI] = pred_quantiles[[min(quantiles), max(quantiles)]].apply(
             lambda x: "[{}, {}]".format(x[0], x[1]), axis=1
         )
-        X_test[target_column_name] = y_test
+        if y_test is not None:
+            X_test[target_column_name] = y_test
         X_test[PI] = pred_quantiles[PI].values
         X_test[predicted_column_name] = pred_quantiles[0.5].values

         # drop rows where prediction or actuals are nan
         # happens because of missing actuals
         # or at edges of time due to lags/rolling windows
-        clean = X_test[
-            X_test[[target_column_name, predicted_column_name]].notnull().all(axis=1)
-        ]
+        if target_column_name in X_test.columns:
+            clean = X_test[
+                X_test[[target_column_name, predicted_column_name]]
+                .notnull()
+                .all(axis=1)
+            ]
+        else:
+            clean = X_test[X_test[predicted_column_name].notnull()]
         print(
             f"The predictions have {clean.shape[0]} rows and {clean.shape[1]} columns."
         )
diff --git a/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-recipes-univariate/forecast/forecasting_script.py b/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-recipes-univariate/forecast/forecasting_script.py
index c38a9dbebc3..87f4e961c4f 100644
--- a/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-recipes-univariate/forecast/forecasting_script.py
+++ b/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-recipes-univariate/forecast/forecasting_script.py
@@ -29,8 +29,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
-        y_test = X_test.pop(target_column_name).values
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
+            X_test = pd.read_parquet(test)
+        elif file_ext == ".csv":
+            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+        else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
+            continue
+
+        if target_column_name in X_test.columns:
+            y_test = X_test.pop(target_column_name).values
+        else:
+            y_test = None

         # We have default quantiles values set as below(95th percentile)
         quantiles = [0.025, 0.5, 0.975]
@@ -43,15 +54,21 @@
         pred_quantiles[PI] = pred_quantiles[[min(quantiles), max(quantiles)]].apply(
             lambda x: "[{}, {}]".format(x[0], x[1]), axis=1
         )
-        X_test[target_column_name] = y_test
-        X_test[PI] = pred_quantiles[PI]
-        X_test[predicted_column_name] = pred_quantiles[0.5]
+        if y_test is not None:
+            X_test[target_column_name] = y_test
+        X_test[PI] = pred_quantiles[PI].values
+        X_test[predicted_column_name] = pred_quantiles[0.5].values

         # drop rows where prediction or actuals are nan
         # happens because of missing actuals
         # or at edges of time due to lags/rolling windows
-        clean = X_test[
-            X_test[[target_column_name, predicted_column_name]].notnull().all(axis=1)
-        ]
+        if target_column_name in X_test.columns:
+            clean = X_test[
+                X_test[[target_column_name, predicted_column_name]]
+                .notnull()
+                .all(axis=1)
+            ]
+        else:
+            clean = X_test[X_test[predicted_column_name].notnull()]
         print(
             f"The predictions have {clean.shape[0]} rows and {clean.shape[1]} columns."
         )
diff --git a/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-task-bike-share/forecast/rolling_script.py b/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-task-bike-share/forecast/rolling_script.py
index 2a25f704d8e..abbc0f2c89c 100644
--- a/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-task-bike-share/forecast/rolling_script.py
+++ b/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-task-bike-share/forecast/rolling_script.py
@@ -29,9 +29,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        if not test.endswith(".csv"):
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
+            X_test = pd.read_parquet(test)
+        elif file_ext == ".csv":
+            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+        else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
             continue
-        X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+
+        if target_column_name not in X_test.columns:
+            raise ValueError(
+                f"Target column `{target_column_name}` not found in the test data, required for rolling forecast."
+            )
         y_test = X_test.pop(target_column_name).values

         # Make a rolling forecast, advancing the forecast origin by 1 period on each iteration through the test set
diff --git a/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-task-energy-demand/forecast/forecasting_script.py b/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-task-energy-demand/forecast/forecasting_script.py
index 5c10c29763d..87f4e961c4f 100644
--- a/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-task-energy-demand/forecast/forecasting_script.py
+++ b/sdk/python/jobs/automl-standalone-jobs/automl-forecasting-task-energy-demand/forecast/forecasting_script.py
@@ -29,10 +29,19 @@ def run(mini_batch):
     print(f"run method start: {__file__}, run({mini_batch})")
     resultList = []
     for test in mini_batch:
-        if not test.endswith(".csv"):
+        file_ext = os.path.splitext(test)[-1]
+        if file_ext == ".parquet":
+            X_test = pd.read_parquet(test)
+        elif file_ext == ".csv":
+            X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
+        else:
+            print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
             continue
-        X_test = pd.read_csv(test, parse_dates=[fitted_model.time_column_name])
-        y_test = X_test.pop(target_column_name).values
+
+        if target_column_name in X_test.columns:
+            y_test = X_test.pop(target_column_name).values
+        else:
+            y_test = None

         # We have default quantiles values set as below(95th percentile)
         quantiles = [0.025, 0.5, 0.975]
@@ -45,15 +54,21 @@
         pred_quantiles[PI] = pred_quantiles[[min(quantiles), max(quantiles)]].apply(
             lambda x: "[{}, {}]".format(x[0], x[1]), axis=1
         )
-        X_test[target_column_name] = y_test
+        if y_test is not None:
+            X_test[target_column_name] = y_test
         X_test[PI] = pred_quantiles[PI].values
         X_test[predicted_column_name] = pred_quantiles[0.5].values

         # drop rows where prediction or actuals are nan
         # happens because of missing actuals
         # or at edges of time due to lags/rolling windows
-        clean = X_test[
-            X_test[[target_column_name, predicted_column_name]].notnull().all(axis=1)
-        ]
+        if target_column_name in X_test.columns:
+            clean = X_test[
+                X_test[[target_column_name, predicted_column_name]]
+                .notnull()
+                .all(axis=1)
+            ]
+        else:
+            clean = X_test[X_test[predicted_column_name].notnull()]
         print(
             f"The predictions have {clean.shape[0]} rows and {clean.shape[1]} columns."
         )
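Note (commentary, not part of the patch): the same load-and-pop logic now repeats across all six scoring scripts above. A consolidated sketch of that shared pattern follows, assuming the module-level `fitted_model` and `target_column_name` globals the scripts already set up in `init()`; the helper names `load_test_frame` and `pop_target` are hypothetical, for illustration only.

import os

import pandas as pd


def load_test_frame(path, time_column_name):
    # Read a Parquet or CSV test file; return None for unsupported extensions.
    file_ext = os.path.splitext(path)[-1]
    if file_ext == ".parquet":
        return pd.read_parquet(path)
    if file_ext == ".csv":
        return pd.read_csv(path, parse_dates=[time_column_name])
    print(f"Unsupported file type: `{file_ext}`. Skipping the file.")
    return None


def pop_target(X_test, target_column_name):
    # Return the actuals as an array if the target column is present,
    # else None (inference-only data without ground truth).
    if target_column_name in X_test.columns:
        return X_test.pop(target_column_name).values
    return None

The rolling-forecast scripts (github-dau and bike-share) would instead raise a ValueError when `pop_target` returns None, since a rolling forecast cannot proceed without actuals.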
diff --git a/sdk/python/jobs/pipelines/1k_demand_forecast_pipeline/aml-demand-forecast-mm-pipeline/aml-demand-forecast-mm-pipeline.ipynb b/sdk/python/jobs/pipelines/1k_demand_forecast_pipeline/aml-demand-forecast-mm-pipeline/aml-demand-forecast-mm-pipeline.ipynb
index 1a84a012993..979cb19351a 100644
--- a/sdk/python/jobs/pipelines/1k_demand_forecast_pipeline/aml-demand-forecast-mm-pipeline/aml-demand-forecast-mm-pipeline.ipynb
+++ b/sdk/python/jobs/pipelines/1k_demand_forecast_pipeline/aml-demand-forecast-mm-pipeline/aml-demand-forecast-mm-pipeline.ipynb
@@ -479,8 +479,8 @@
     "| **enable_early_stopping** | Flag to enable early termination if the primary metric is no longer improving. |\n",
     "| **partition_column_names** | The names of columns used to group your models. For timeseries, the groups must not split up individual time-series. That is, each group must contain one or more whole time-series. |\n",
     "| **allow_multi_partitions** | A flag that allows users to train one model per partition when each partition contians more than one unique time series. The dafault value is `False`. |\n",
-    "| **track_child_runs** | Flag to disable tracking of child runs. Only best run is tracked if the flag is set to False (this includes the model and metrics of the run). |\n",
-    "| **n_best_runs** | Number of best runs to track per partition for a Many Models Run. Defaults to 1. Please set `track_child_runs` to `True` and then modify this parameter. |\n",
+    "| **track_child_runs** | Flag to enable tracking of child runs. Only the best run is tracked if the flag is set to `False` (this includes the model and metrics of the run). Defaults to `False`. We do not encourage turning this on, since it can lead to throttling; instead, use `n_best_runs` if you really need to track more than one best run. |\n",
+    "| **n_best_runs** | Number of best runs to track per partition for a Many Models Run. Defaults to 1. |\n",
     "| **enable_early_stopping** | Flag to enable early termination if the primary metric is no longer improving. |\n",
     "| **max_trials** | Represents the maximum number of trials an Automated ML job can try to run a training algorithm with different combination of hyperparameters. Its default value is set to 1000. If `enable_early_stopping` is defined, then the number of trials used to run training algorithms can be smaller.|\n",
     "| **timeout_minutes** | Maximum amount of time in minutes that the whole AutoML job can take before the job terminates. This timeout includes setup, featurization and training runs but does not include the ensembling and model explainability runs at the end of the process since those actions need to happen once all the trials (children jobs) are done. If not specified, the default job's total timeout is 6 days (8,640 minutes). To specify a timeout less than or equal to 1 hour (60 minutes), make sure your dataset's size is not greater than 10,000,000 (rows times column) or an error results. |\n",
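Note (commentary, not part of the patch): for readers skimming the parameter table the notebook diff touches, the settings it documents would be gathered into a mapping along the following lines. This is a minimal sketch; the dict name and the example partition columns are hypothetical, values are illustrative (defaults noted only where the table states them), and how the mapping is passed to the many models train step follows the notebook's own setup cells.

# Hypothetical settings mapping mirroring the documented parameters.
many_models_settings = {
    "partition_column_names": ["store", "brand"],  # example partition keys
    "allow_multi_partitions": False,  # default per the table
    "track_child_runs": False,  # default; enabling it can lead to throttling
    "n_best_runs": 1,  # default per the table
    "enable_early_stopping": True,
    "max_trials": 1000,  # default per the table
    "timeout_minutes": 8640,  # 6 days, the default total job timeout
}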