Fix bug in number of tabular forecasting prediction samples (#1149)

Lightning-Universe · Feb 4, 2022 · 8e4abf3
1 parent 20b3a7d
commit 8e4abf3
Show file tree

Hide file tree

Showing 6 changed files with 19 additions and 8 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -72,6 +72,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 - Fixed a bug where `TabularData` would not work correctly with no categorical variables ([#1144](https://github.com/PyTorchLightning/lightning-flash/pull/1144))
 
+- Fixed a bug where loading `TabularForecastingData` for prediction would only yield a single sample per series ([#1149](https://github.com/PyTorchLightning/lightning-flash/pull/1149))
+
 ### Removed
 
 - Removed the `Seq2SeqData` base class (use `TranslationData` or `SummarizationData` directly) ([#1128](https://github.com/PyTorchLightning/lightning-flash/pull/1128))

diff --git a/flash/tabular/forecasting/input.py b/flash/tabular/forecasting/input.py
@@ -74,7 +74,6 @@ def load_data(
             time_series_dataset = TimeSeriesDataSet.from_parameters(
                 parameters,
                 data,
-                predict=True,
                 stop_randomization=True,
             )
         return time_series_dataset

diff --git a/flash_examples/tabular_forecasting.py b/flash_examples/tabular_forecasting.py
@@ -28,6 +28,7 @@
 data = generate_ar_data(seasonality=10.0, timesteps=400, n_series=100, seed=42)
 data["date"] = pd.Timestamp("2020-01-01") + pd.to_timedelta(data.time_idx, "D")
 
+max_encoder_length = 60
 max_prediction_length = 20
 
 training_cutoff = data["time_idx"].max() - max_prediction_length
@@ -39,10 +40,11 @@
     group_ids=["series"],
     # only unknown variable is "value" - and N-Beats can also not take any additional variables
     time_varying_unknown_reals=["value"],
-    max_encoder_length=60,
+    max_encoder_length=max_encoder_length,
     max_prediction_length=max_prediction_length,
     train_data_frame=data[lambda x: x.time_idx <= training_cutoff],
-    val_data_frame=data,
+    # validate on the last sequence
+    val_data_frame=data[lambda x: x.time_idx > training_cutoff - max_encoder_length],
     batch_size=32,
 )
 
@@ -58,7 +60,11 @@
 trainer.fit(model, datamodule=datamodule)
 
 # 4. Generate predictions
-datamodule = TabularForecastingData.from_data_frame(predict_data_frame=data, parameters=datamodule.parameters)
+datamodule = TabularForecastingData.from_data_frame(
+    predict_data_frame=data[lambda x: x.time_idx > training_cutoff - max_encoder_length],
+    parameters=datamodule.parameters,
+    batch_size=32,
+)
 predictions = trainer.predict(model, datamodule=datamodule)
 print(predictions)
 

diff --git a/requirements/test.txt b/requirements/test.txt
@@ -1,9 +1,9 @@
 coverage
 codecov>=2.1
-pytest>=5.0
+pytest>=5.0,<7.0
 pytest-flake8
 flake8
-pytest-doctestplus
+pytest-doctestplus>=0.9.0
 pytest-rerunfailures>=10.0
 
 # install pkg

diff --git a/tests/examples/test_scripts.py b/tests/examples/test_scripts.py
@@ -95,6 +95,10 @@
             "tabular_regression.py",
             marks=pytest.mark.skipif(not _TABULAR_TESTING, reason="tabular libraries aren't installed"),
         ),
+        pytest.param(
+            "tabular_forecasting.py",
+            marks=pytest.mark.skipif(not _TABULAR_TESTING, reason="tabular libraries aren't installed"),
+        ),
         pytest.param("template.py", marks=pytest.mark.skipif(not _SKLEARN_AVAILABLE, reason="sklearn isn't installed")),
         pytest.param(
             "text_classification.py",

diff --git a/tests/tabular/forecasting/test_data.py b/tests/tabular/forecasting/test_data.py
@@ -45,7 +45,7 @@ def test_from_data_frame_time_series_data_set_single_call(patch_time_series_data
     )
 
     patch_time_series_data_set.from_parameters.assert_called_once_with(
-        {"test": None}, val_data, predict=True, stop_randomization=True
+        {"test": None}, val_data, stop_randomization=True
     )
 
 
@@ -79,7 +79,7 @@ def test_from_data_frame_time_series_data_set_multi_call(patch_time_series_data_
     )
 
     patch_time_series_data_set.from_parameters.assert_called_once_with(
-        {"test": None}, val_data, predict=True, stop_randomization=True
+        {"test": None}, val_data, stop_randomization=True
     )