From 8834114958648714f4df25603a7b08dbb67b7a8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Morales?= Date: Tue, 29 Oct 2024 11:26:06 -0600 Subject: [PATCH] breaking: add `hist_exog_list` argument to forecast (#505) --- nbs/src/nixtla_client.ipynb | 102 ++++++++++++++++++++++++++++++------ nixtla/nixtla_client.py | 59 ++++++++++++++------- 2 files changed, 126 insertions(+), 35 deletions(-) diff --git a/nbs/src/nixtla_client.ipynb b/nbs/src/nixtla_client.ipynb index 94460f7d..1b91b748 100644 --- a/nbs/src/nixtla_client.ipynb +++ b/nbs/src/nixtla_client.ipynb @@ -417,19 +417,37 @@ " id_col: str,\n", " time_col: str,\n", " target_col: str,\n", + " hist_exog: Optional[List[str]],\n", ") -> Tuple[DFType, Optional[DFType]]:\n", - "\n", - " exog_list = [c for c in df.columns if c not in (id_col, time_col, target_col)]\n", - "\n", + " exogs = [c for c in df.columns if c not in (id_col, time_col, target_col)]\n", + " if hist_exog is None:\n", + " hist_exog = []\n", " if X_df is None:\n", - " df = df[[id_col, time_col, target_col, *exog_list]]\n", + " # all exogs must be historic\n", + " ignored_exogs = [c for c in exogs if c not in hist_exog]\n", + " if ignored_exogs:\n", + " warnings.warn(\n", + " f\"`df` contains the following exogenous features: {ignored_exogs}, \"\n", + " \"but `X_df` was not provided and they were not declared in `hist_exog_list`. \"\n", + " \"They will be ignored.\"\n", + " )\n", + " exogs = [c for c in exogs if c in hist_exog]\n", + " df = df[[id_col, time_col, target_col, *exogs]]\n", " return df, None\n", "\n", - " futr_exog_list = [c for c in X_df.columns if c not in (id_col, time_col)]\n", - " hist_exog_list = list(set(exog_list) - set(futr_exog_list))\n", + " # exogs in df that weren't declared as historic nor future\n", + " futr_exog = [c for c in X_df.columns if c not in (id_col, time_col)]\n", + " declared_exogs = {*hist_exog, *futr_exog}\n", + " ignored_exogs = [c for c in exogs if c not in declared_exogs]\n", + " if ignored_exogs:\n", + " warnings.warn(\n", + " f\"`df` contains the following exogenous features: {ignored_exogs}, \"\n", + " \"but they were not found in `X_df` nor declared in `hist_exog_list`. \"\n", + " \"They will be ignored.\"\n", + " )\n", "\n", - " # Capture case where future exogenous are provided in X_df that are not in df\n", - " missing_futr = set(futr_exog_list) - set(exog_list)\n", + " # future exogenous are provided in X_df that are not in df\n", + " missing_futr = set(futr_exog) - set(exogs)\n", " if missing_futr:\n", " raise ValueError(\n", " \"The following exogenous features are present in `X_df` \"\n", @@ -437,8 +455,8 @@ " )\n", "\n", " # Make sure df and X_df are in right order\n", - " df = df[[id_col, time_col, target_col, *futr_exog_list, *hist_exog_list]]\n", - " X_df = X_df[[id_col, time_col, *futr_exog_list]]\n", + " df = df[[id_col, time_col, target_col, *futr_exog, *hist_exog]]\n", + " X_df = X_df[[id_col, time_col, *futr_exog]]\n", "\n", " return df, X_df\n", "\n", @@ -929,6 +947,7 @@ " finetune_depth: _Finetune_Depth = 1,\n", " finetune_loss: _Loss = 'default',\n", " clean_ex_first: bool = True,\n", + " hist_exog_list: Optional[List[str]] = None,\n", " validate_api_key: bool = False,\n", " add_history: bool = False,\n", " date_features: Union[bool, List[Union[str, Callable]]] = False,\n", @@ -985,6 +1004,8 @@ " Loss function to use for finetuning. Options are: `default`, `mae`, `mse`, `rmse`, `mape`, and `smape`.\n", " clean_ex_first : bool (default=True)\n", " Clean exogenous signal before making forecasts using TimeGPT.\n", + " hist_exog_list : list of str, optional (default=None)\n", + " Column names of the historical exogenous features.\n", " validate_api_key : bool (default=False)\n", " If True, validates api_key before sending requests.\n", " add_history : bool (default=False)\n", @@ -1055,7 +1076,12 @@ " model=model,\n", " )\n", " df, X_df = _validate_exog(\n", - " df, X_df, id_col=id_col, time_col=time_col, target_col=target_col\n", + " df=df,\n", + " X_df=X_df,\n", + " id_col=id_col,\n", + " time_col=time_col,\n", + " target_col=target_col,\n", + " hist_exog=hist_exog_list,\n", " )\n", " level, quantiles = _prepare_level_and_quantiles(level, quantiles)\n", " freq = _maybe_infer_freq(df, freq=freq, id_col=id_col, time_col=time_col)\n", @@ -1095,12 +1121,9 @@ " if processed.data.shape[1] > 1:\n", " X = processed.data[:, 1:].T\n", " if futr_cols is not None:\n", - " hist_exog_set= set(x_cols) - set(futr_cols)\n", - " if hist_exog_set:\n", - " logger.info(f'Using historical exogenous features: {list(hist_exog_set)}')\n", " logger.info(f'Using future exogenous features: {futr_cols}')\n", - " else:\n", - " logger.info(f'Using historical exogenous features: {x_cols}')\n", + " if hist_exog_list is not None:\n", + " logger.info(f'Using historical exogenous features: {hist_exog_list}')\n", " else:\n", " X = None\n", "\n", @@ -2572,6 +2595,53 @@ ")" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#| hide\n", + "# future and historic exogs\n", + "df = generate_series(n_series=2, min_length=5, max_length=20)\n", + "train, future = time_features(df, freq='D', features=['year', 'month'], h=5)\n", + "\n", + "# features in df but not in X_df\n", + "missing_exogenous = train.columns.drop(['unique_id', 'ds', 'y']).tolist()\n", + "expected_warning = (\n", + " f'`df` contains the following exogenous features: {missing_exogenous}, '\n", + " 'but `X_df` was not provided and they were not declared in `hist_exog_list`. '\n", + " 'They will be ignored.'\n", + ")\n", + "with warnings.catch_warnings(record=True) as w:\n", + " forecasts = nixtla_client.forecast(train, h=5)\n", + " assert any(expected_warning in str(warning.message) for warning in w)\n", + "\n", + "# features in df not set as historic nor in X_df\n", + "expected_warning = (\n", + " f\"`df` contains the following exogenous features: ['month'], \"\n", + " 'but they were not found in `X_df` nor declared in `hist_exog_list`. '\n", + " 'They will be ignored.'\n", + ")\n", + "with warnings.catch_warnings(record=True) as w:\n", + " forecasts = nixtla_client.forecast(train, h=5, X_df=future[['unique_id', 'ds', 'year']])\n", + " assert any(expected_warning in str(warning.message) for warning in w)\n", + "\n", + "# features in X_df not in df\n", + "test_fail(\n", + " lambda: nixtla_client.forecast(\n", + " train[['unique_id', 'ds', 'y']],\n", + " h=5,\n", + " X_df=future,\n", + " ),\n", + " contains='features are present in `X_df` but not in `df`'\n", + ")\n", + "\n", + "# test setting one as historic and other as future\n", + "nixtla_client.forecast(train, h=5, X_df=future[['unique_id', 'ds', 'year']], hist_exog_list=['month'])\n", + "test_eq(nixtla_client.weights_x['features'].tolist(), ['year', 'month'])" + ] + }, { "cell_type": "markdown", "metadata": {}, diff --git a/nixtla/nixtla_client.py b/nixtla/nixtla_client.py index 2298d21f..c3e9a31d 100644 --- a/nixtla/nixtla_client.py +++ b/nixtla/nixtla_client.py @@ -347,19 +347,37 @@ def _validate_exog( id_col: str, time_col: str, target_col: str, + hist_exog: Optional[List[str]], ) -> Tuple[DFType, Optional[DFType]]: - - exog_list = [c for c in df.columns if c not in (id_col, time_col, target_col)] - + exogs = [c for c in df.columns if c not in (id_col, time_col, target_col)] + if hist_exog is None: + hist_exog = [] if X_df is None: - df = df[[id_col, time_col, target_col, *exog_list]] + # all exogs must be historic + ignored_exogs = [c for c in exogs if c not in hist_exog] + if ignored_exogs: + warnings.warn( + f"`df` contains the following exogenous features: {ignored_exogs}, " + "but `X_df` was not provided and they were not declared in `hist_exog_list`. " + "They will be ignored." + ) + exogs = [c for c in exogs if c in hist_exog] + df = df[[id_col, time_col, target_col, *exogs]] return df, None - futr_exog_list = [c for c in X_df.columns if c not in (id_col, time_col)] - hist_exog_list = list(set(exog_list) - set(futr_exog_list)) + # exogs in df that weren't declared as historic nor future + futr_exog = [c for c in X_df.columns if c not in (id_col, time_col)] + declared_exogs = {*hist_exog, *futr_exog} + ignored_exogs = [c for c in exogs if c not in declared_exogs] + if ignored_exogs: + warnings.warn( + f"`df` contains the following exogenous features: {ignored_exogs}, " + "but they were not found in `X_df` nor declared in `hist_exog_list`. " + "They will be ignored." + ) - # Capture case where future exogenous are provided in X_df that are not in df - missing_futr = set(futr_exog_list) - set(exog_list) + # future exogenous are provided in X_df that are not in df + missing_futr = set(futr_exog) - set(exogs) if missing_futr: raise ValueError( "The following exogenous features are present in `X_df` " @@ -367,8 +385,8 @@ def _validate_exog( ) # Make sure df and X_df are in right order - df = df[[id_col, time_col, target_col, *futr_exog_list, *hist_exog_list]] - X_df = X_df[[id_col, time_col, *futr_exog_list]] + df = df[[id_col, time_col, target_col, *futr_exog, *hist_exog]] + X_df = X_df[[id_col, time_col, *futr_exog]] return df, X_df @@ -859,6 +877,7 @@ def forecast( finetune_depth: _Finetune_Depth = 1, finetune_loss: _Loss = "default", clean_ex_first: bool = True, + hist_exog_list: Optional[List[str]] = None, validate_api_key: bool = False, add_history: bool = False, date_features: Union[bool, List[Union[str, Callable]]] = False, @@ -915,6 +934,8 @@ def forecast( Loss function to use for finetuning. Options are: `default`, `mae`, `mse`, `rmse`, `mape`, and `smape`. clean_ex_first : bool (default=True) Clean exogenous signal before making forecasts using TimeGPT. + hist_exog_list : list of str, optional (default=None) + Column names of the historical exogenous features. validate_api_key : bool (default=False) If True, validates api_key before sending requests. add_history : bool (default=False) @@ -985,7 +1006,12 @@ def forecast( model=model, ) df, X_df = _validate_exog( - df, X_df, id_col=id_col, time_col=time_col, target_col=target_col + df=df, + X_df=X_df, + id_col=id_col, + time_col=time_col, + target_col=target_col, + hist_exog=hist_exog_list, ) level, quantiles = _prepare_level_and_quantiles(level, quantiles) freq = _maybe_infer_freq(df, freq=freq, id_col=id_col, time_col=time_col) @@ -1025,14 +1051,9 @@ def forecast( if processed.data.shape[1] > 1: X = processed.data[:, 1:].T if futr_cols is not None: - hist_exog_set = set(x_cols) - set(futr_cols) - if hist_exog_set: - logger.info( - f"Using historical exogenous features: {list(hist_exog_set)}" - ) logger.info(f"Using future exogenous features: {futr_cols}") - else: - logger.info(f"Using historical exogenous features: {x_cols}") + if hist_exog_list is not None: + logger.info(f"Using historical exogenous features: {hist_exog_list}") else: X = None @@ -1632,7 +1653,7 @@ def plot( ax=ax, ) -# %% ../nbs/src/nixtla_client.ipynb 50 +# %% ../nbs/src/nixtla_client.ipynb 51 def _forecast_wrapper( df: pd.DataFrame, client: NixtlaClient,