Commit 39c32d6

Merge pull request #10 from sb-ai-lab/bugfix/granularity_handling_fix

bugfix/granularity handling fix

DESimakov authored Feb 21, 2024
2 parents 019a828 + 907439d commit 39c32d6
Showing 7 changed files with 1,305 additions and 76 deletions.
6 changes: 5 additions & 1 deletion .gitignore
@@ -144,5 +144,9 @@ cython_debug/
.vscode/

temp/
additional_datasets/
catboost_info/
poetry.lock
poetry.lock
*check*.ipynb
*temp*
*demand*
216 changes: 172 additions & 44 deletions Tutorial_1_Basics.ipynb
@@ -21,20 +21,20 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def get_results(\n",
" cv: int,\n",
" regime: str,\n",
" y_true: Optional[List[NDArray[np.float]]] = None,\n",
" y_pred: Optional[List[NDArray[np.float]]] = None,\n",
" y_true: Optional[List[NDArray[np.floating]]] = None,\n",
" y_pred: Optional[List[NDArray[np.floating]]] = None,\n",
" ids: Optional[List[Union[float, str]]] = None,\n",
") -> pd.DataFrame:\n",
" def _get_fold_value(\n",
" value: Optional[Union[float, NDArray[np.float]]], idx: int\n",
" ) -> List[Optional[Union[float, NDArray[np.float]]]]:\n",
" value: Optional[Union[float, NDArray[np.floating]]], idx: int\n",
" ) -> List[Optional[Union[float, NDArray[np.floating]]]]:\n",
" if value is None:\n",
" return [None]\n",
" if isinstance(value[idx], float):\n",
@@ -59,6 +59,13 @@
" return df_res"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Pipeline setup"
]
},
{
"cell_type": "markdown",
"metadata": {},
@@ -93,7 +100,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -180,7 +187,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -189,9 +196,17 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 6,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"freq: Day; period: 1\n"
]
}
],
"source": [
"dataset = TSDataset(\n",
" data=pd.read_csv(df_path),\n",
@@ -202,36 +217,44 @@
"strategy = strategies_factory[strategy_params]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Backtest validation of pipeline"
]
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0:\tlearn: 0.0001731\ttest: 0.0001793\tbest: 0.0001793 (0)\ttotal: 2.23ms\tremaining: 2.23s\n",
"500:\tlearn: 0.0000015\ttest: 0.0000015\tbest: 0.0000015 (500)\ttotal: 689ms\tremaining: 687ms\n",
"999:\tlearn: 0.0000011\ttest: 0.0000011\tbest: 0.0000011 (999)\ttotal: 1.35s\tremaining: 0us\n",
"freq: Day; period: 1\n",
"0:\tlearn: 0.0001731\ttest: 0.0001793\tbest: 0.0001793 (0)\ttotal: 63.7ms\tremaining: 1m 3s\n",
"500:\tlearn: 0.0000015\ttest: 0.0000015\tbest: 0.0000015 (500)\ttotal: 1.29s\tremaining: 1.29s\n",
"999:\tlearn: 0.0000011\ttest: 0.0000011\tbest: 0.0000011 (999)\ttotal: 2.48s\tremaining: 0us\n",
"\n",
"bestTest = 1.110360723e-06\n",
"bestIteration = 999\n",
"\n",
"Fold 0:\n",
"MultiRMSE: 1.110360722708484e-06\n",
"0:\tlearn: 0.0001756\ttest: 0.0001744\tbest: 0.0001744 (0)\ttotal: 2.53ms\tremaining: 2.52s\n",
"500:\tlearn: 0.0000015\ttest: 0.0000016\tbest: 0.0000016 (500)\ttotal: 626ms\tremaining: 623ms\n",
"999:\tlearn: 0.0000011\ttest: 0.0000012\tbest: 0.0000012 (999)\ttotal: 1.28s\tremaining: 0us\n",
"0:\tlearn: 0.0001756\ttest: 0.0001744\tbest: 0.0001744 (0)\ttotal: 6.77ms\tremaining: 6.77s\n",
"500:\tlearn: 0.0000015\ttest: 0.0000016\tbest: 0.0000016 (500)\ttotal: 1.31s\tremaining: 1.31s\n",
"999:\tlearn: 0.0000011\ttest: 0.0000012\tbest: 0.0000012 (999)\ttotal: 2.51s\tremaining: 0us\n",
"\n",
"bestTest = 1.164605569e-06\n",
"bestIteration = 999\n",
"\n",
"Fold 1:\n",
"MultiRMSE: 1.1646055692725877e-06\n",
"0:\tlearn: 0.0001767\ttest: 0.0001718\tbest: 0.0001718 (0)\ttotal: 1.57ms\tremaining: 1.57s\n",
"500:\tlearn: 0.0000015\ttest: 0.0000015\tbest: 0.0000015 (500)\ttotal: 628ms\tremaining: 625ms\n",
"999:\tlearn: 0.0000011\ttest: 0.0000012\tbest: 0.0000012 (999)\ttotal: 1.26s\tremaining: 0us\n",
"0:\tlearn: 0.0001767\ttest: 0.0001718\tbest: 0.0001718 (0)\ttotal: 4.55ms\tremaining: 4.54s\n",
"500:\tlearn: 0.0000015\ttest: 0.0000015\tbest: 0.0000015 (500)\ttotal: 1.37s\tremaining: 1.37s\n",
"999:\tlearn: 0.0000011\ttest: 0.0000012\tbest: 0.0000012 (999)\ttotal: 2.94s\tremaining: 0us\n",
"\n",
"bestTest = 1.166647188e-06\n",
"bestIteration = 999\n",
@@ -249,7 +272,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 8,
"metadata": {},
"outputs": [
{
@@ -367,7 +390,7 @@
"[70 rows x 3 columns]"
]
},
"execution_count": 11,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@@ -380,46 +403,39 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"---"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"It is also possible to use the classic fit and predict methods."
"## Fit and predict interface of pipeline"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"0:\tlearn: 0.0001722\ttest: 0.0001803\tbest: 0.0001803 (0)\ttotal: 1.89ms\tremaining: 1.89s\n",
"500:\tlearn: 0.0000015\ttest: 0.0000016\tbest: 0.0000016 (500)\ttotal: 629ms\tremaining: 626ms\n",
"999:\tlearn: 0.0000010\ttest: 0.0000012\tbest: 0.0000012 (999)\ttotal: 1.28s\tremaining: 0us\n",
"0:\tlearn: 0.0001722\ttest: 0.0001803\tbest: 0.0001803 (0)\ttotal: 9.59ms\tremaining: 9.58s\n",
"500:\tlearn: 0.0000015\ttest: 0.0000016\tbest: 0.0000016 (500)\ttotal: 1.65s\tremaining: 1.64s\n",
"999:\tlearn: 0.0000010\ttest: 0.0000012\tbest: 0.0000012 (999)\ttotal: 3.05s\tremaining: 0us\n",
"\n",
"bestTest = 1.219564131e-06\n",
"bestIteration = 999\n",
"\n",
"Fold 0:\n",
"MultiRMSE: 1.2195641307026147e-06\n",
"0:\tlearn: 0.0001757\ttest: 0.0001733\tbest: 0.0001733 (0)\ttotal: 1.56ms\tremaining: 1.56s\n",
"500:\tlearn: 0.0000015\ttest: 0.0000016\tbest: 0.0000016 (500)\ttotal: 645ms\tremaining: 642ms\n",
"999:\tlearn: 0.0000010\ttest: 0.0000012\tbest: 0.0000012 (999)\ttotal: 1.27s\tremaining: 0us\n",
"0:\tlearn: 0.0001757\ttest: 0.0001733\tbest: 0.0001733 (0)\ttotal: 6.25ms\tremaining: 6.24s\n",
"500:\tlearn: 0.0000015\ttest: 0.0000016\tbest: 0.0000016 (500)\ttotal: 2.09s\tremaining: 2.08s\n",
"999:\tlearn: 0.0000010\ttest: 0.0000012\tbest: 0.0000012 (999)\ttotal: 3.71s\tremaining: 0us\n",
"\n",
"bestTest = 1.162857853e-06\n",
"bestIteration = 999\n",
"\n",
"Fold 1:\n",
"MultiRMSE: 1.1628578528684804e-06\n",
"0:\tlearn: 0.0001768\ttest: 0.0001711\tbest: 0.0001711 (0)\ttotal: 1.8ms\tremaining: 1.8s\n",
"500:\tlearn: 0.0000015\ttest: 0.0000015\tbest: 0.0000015 (500)\ttotal: 651ms\tremaining: 648ms\n",
"999:\tlearn: 0.0000010\ttest: 0.0000011\tbest: 0.0000011 (999)\ttotal: 1.27s\tremaining: 0us\n",
"0:\tlearn: 0.0001768\ttest: 0.0001711\tbest: 0.0001711 (0)\ttotal: 3.75ms\tremaining: 3.75s\n",
"500:\tlearn: 0.0000015\ttest: 0.0000015\tbest: 0.0000015 (500)\ttotal: 1.85s\tremaining: 1.84s\n",
"999:\tlearn: 0.0000010\ttest: 0.0000011\tbest: 0.0000011 (999)\ttotal: 3.83s\tremaining: 0us\n",
"\n",
"bestTest = 1.086497806e-06\n",
"bestIteration = 999\n",
Expand All @@ -438,7 +454,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 10,
"metadata": {},
"outputs": [
{
@@ -556,7 +572,7 @@
"[70 rows x 3 columns]"
]
},
"execution_count": 13,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -565,12 +581,124 @@
"current_pred"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Working with raw time series' granularity"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Time series come in different granularities, from hourly and daily series to more complex ones such as the end of each quarter.\n",
"\n",
"If the data does not contain segments that are too short (shorter than history + horizon), `tsururu` will try to infer the series' granularity on its own. We currently support the following types:\n",
"\n",
"- Yearly (and YearlyEnd)\n",
"- Quarterly (and QuarterlyEnd)\n",
"- Monthly (and MonthlyEnd)\n",
"- Weekly\n",
"- Daily\n",
"- Hourly\n",
"- Minutely\n",
"- Secondly\n",
"- Microsecondly\n",
"\n",
"Compound granularities (10 days, 15 minutes, 32 seconds, etc.) are also supported. The correctness of the inferred granularity can be checked from the output printed after the `TSDataset` class has been created.\n",
"\n",
"However, there are tricky situations (e.g. 28 days) where a monthly granularity may be guessed incorrectly. In such cases you can set the granularity yourself using the `pd.DateOffset` class or related classes from `pandas.tseries.offsets`, passed as the `delta` parameter to the `TSDataset` class. The time column will then be processed according to these settings."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Consider a time series whose points are exactly __28 days apart__."
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": []
"source": [
"df_path_2 = \"datasets/global/simulated_data_to_check_28D.csv\"\n",
"\n",
"# Configure the features settings\n",
"columns_and_features_params_2 = {\n",
" \"target\": {\n",
" \"column\": [\"value\"],\n",
" \"type\": \"continious\",\n",
" },\n",
" \"date\": {\n",
" \"column\": [\"date\"],\n",
" \"type\": \"datetime\",\n",
" },\n",
" \"id\": {\n",
" \"column\": [\"id\"],\n",
" \"type\": \"categorical\",\n",
" }\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"freq: Month; period: 1.0\n"
]
}
],
"source": [
"dataset_2 = TSDataset(\n",
" data=pd.read_csv(df_path_2),\n",
" columns_and_features_params=columns_and_features_params_2,\n",
" history=30,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We see that the frequency of the series is incorrectly inferred as monthly. Let's pass the `delta` parameter explicitly."
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Custom OffSet: <DateOffset: days=28>\n"
]
}
],
"source": [
"dataset_2 = TSDataset(\n",
" data=pd.read_csv(df_path_2),\n",
" columns_and_features_params=columns_and_features_params_2,\n",
" history=30,\n",
" delta=pd.DateOffset(days=28),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now the granularity is detected correctly."
]
}
],
"metadata": {
@@ -592,7 +720,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0"
"version": "3.9.18"
},
"orig_nbformat": 4
},
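The 28-day case the tutorial adds can be illustrated with plain pandas, outside of `tsururu`. This is a minimal sketch under the assumption that the dataset is sampled at a fixed 28-day step, as in the notebook's `simulated_data_to_check_28D.csv` example; it shows why that spacing is ambiguous and how `pd.DateOffset` pins it down:

```python
import pandas as pd

# A hypothetical series sampled every 28 days, mimicking the
# notebook's simulated 28D dataset.
offset = pd.DateOffset(days=28)
idx = pd.date_range(start="2020-01-01", periods=5, freq=offset)

# Consecutive points are exactly 28 days apart.
deltas = idx.to_series().diff().dropna()
print(deltas.unique())

# 28 days is close to a calendar month, which is why automatic
# frequency inference can mistake this for a monthly series; passing
# an explicit pd.DateOffset(days=28) as `delta` removes the ambiguity.
```

The same `offset` object is what the notebook passes as `delta=pd.DateOffset(days=28)` to `TSDataset`.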
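Besides the granularity fix, the diff updates the tutorial's type annotations from `NDArray[np.float]` to `NDArray[np.floating]`. A minimal sketch of why this matters (the `np.float` alias was deprecated in NumPy 1.20 and removed in 1.24; `np.floating` is the abstract scalar type covering any float dtype; the `mean_of_folds` helper here is hypothetical, not from the tutorial):

```python
from typing import List

import numpy as np
from numpy.typing import NDArray


def mean_of_folds(folds: List[NDArray[np.floating]]) -> float:
    # Average the per-fold means; accepts float32 or float64 arrays,
    # which is what NDArray[np.floating] expresses.
    return float(np.mean([fold.mean() for fold in folds]))


preds = [np.array([1.0, 2.0]), np.array([3.0, 5.0])]
print(mean_of_folds(preds))  # → 2.75
```

On NumPy >= 1.24, annotating with `NDArray[np.float]` raises an `AttributeError` at import time, which is the error the changed cells avoid.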