
Commit

Merge remote-tracking branch 'origin/staging' into neuralforecast
americast committed Sep 28, 2023
2 parents 32a204b + b12e7ac commit fda2b40
Showing 35 changed files with 755 additions and 275 deletions.
33 changes: 32 additions & 1 deletion .circleci/config.yml
@@ -56,7 +56,18 @@ workflows:
ignore:
- master
- staging

################################
#### LONG INTEGRATION TESTS: PR
################################
################################
- Linux:
name: Long Integration Test (Cache) | v3.10 | Linux
mode: LONG INTEGRATION CACHE
filters:
branches:
ignore:
- master
- staging
################################
#### SHORT THIRDPARTY TESTS: PR
################################
@@ -201,6 +212,11 @@ jobs:
- restore_cache:
keys:
- v1-model_cache-{{ checksum "setup.py" }}

# Always restore testmondata from staging, python3.10, ray disabled.
- restore_cache:
keys:
- v1-testmon_cache-staging-python3.10-rayDISABLED-{{ checksum "setup.py" }}

- run:
name: Install EvaDB package from GitHub repo with all dependencies
@@ -237,6 +253,7 @@ jobs:
if [[ $PY_VERSION = "3.10" ]] || [[ $PY_VERSION = "3.11" ]]; then
export SETUPTOOLS_USE_DISTUTILS=stdlib
fi
set +e # To make sure the later cache step is not skipped.
bash script/test/test.sh -m "<< parameters.mode >>"
# Enable cache save conditionally (to avoid empty cache in Notebooks)
@@ -251,6 +268,20 @@
- /home/circleci/.cache/torch/
- /home/circleci/.cache/gpt4all/

# Collect the testmon data only for long integration tests
- when:
condition:
and:
- equal: [ LONG INTEGRATION, << parameters.mode >> ]
- equal: [ staging, << pipeline.git.branch >> ]
- equal: [ "3.10", << parameters.v >> ]
- equal: [ DISABLED, << parameters.ray >>]
steps:
- save_cache:
key: v1-testmon_cache-{{ .Branch }}-python<< parameters.v >>-ray<< parameters.ray >>-{{ checksum "setup.py" }}-{{ epoch }}
paths:
- .testmondata

- save_cache:
key: v1-pip-wheel_cache-python<< parameters.v >>-ray<< parameters.ray >>-{{ checksum "setup.py" }}
paths:
8 changes: 6 additions & 2 deletions docs/_toc.yml
@@ -36,6 +36,8 @@ parts:
title: Emotion Analysis
- file: source/usecases/homesale-forecast.rst
title: Home Sale Forecasting
- file: source/usecases/homerental-predict.rst
title: Home Rental Prediction
# - file: source/usecases/privategpt.rst
# title: PrivateGPT

Expand Down Expand Up @@ -69,8 +71,10 @@ parts:
- file: source/reference/ai/index
title: AI Engines
sections:
- file: source/reference/ai/model-train
title: Model Training
- file: source/reference/ai/model-train-ludwig
title: Model Training with Ludwig
- file: source/reference/ai/model-train-sklearn
title: Model Training with Sklearn
- file: source/reference/ai/model-forecasting
title: Time Series Forecasting
- file: source/reference/ai/hf
2 changes: 1 addition & 1 deletion docs/requirements.txt
@@ -37,4 +37,4 @@ jupytext==1.15.2
urllib3 < 2

# eva
git+https://github.com/georgia-tech-db/eva.git@master
git+https://github.com/georgia-tech-db/eva.git@staging
2 changes: 1 addition & 1 deletion docs/source/overview/model-inference.rst
@@ -43,7 +43,7 @@ In EvaDB, we can also use models in joins.
The most powerful use case is a lateral join combined with ``UNNEST``, which is very helpful for flattening the output of `one-to-many` models.
The key idea here is that a model can produce multiple outputs (e.g., bounding boxes) stored in an array. This syntax unrolls the elements of the array into multiple rows.
Typical examples are `face detectors <https://github.com/georgia-tech-db/evadb/blob/staging/evadb/functions/face_detector.py>`_ and `object detectors <https://github.com/georgia-tech-db/evadb/blob/staging/evadb/functions/fastrcnn_object_detector.py>`_.
In the below example, we use `emotion detector <https://github.com/georgia-tech-db/evadb/blob/staging/evadb/functions/emotion_detector.py>_` to detect emotions from faces in the movie, where a single scene can contain multiple faces.
In the below example, we use `emotion detector <https://github.com/georgia-tech-db/evadb/blob/staging/evadb/functions/emotion_detector.py>`_ to detect emotions from faces in the movie, where a single scene can contain multiple faces.
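A minimal sketch of what such a query can look like is shown below. The ``MovieVideo`` table name and the ``Crop`` helper are assumptions for illustration; ``FaceDetector`` and ``EmotionDetector`` refer to the functions linked above.

.. code-block:: sql

   SELECT id, bbox, EmotionDetector(Crop(data, bbox))
   FROM MovieVideo
        JOIN LATERAL UNNEST(FaceDetector(data)) AS Face(bbox, conf);

Each bounding box returned by ``FaceDetector`` becomes its own row, so ``EmotionDetector`` runs once per detected face rather than once per frame.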

2 changes: 1 addition & 1 deletion docs/source/reference/ai/model-forecasting.rst
@@ -47,7 +47,7 @@ EvaDB's default forecast framework is `statsforecast <https://nixtla.github.io/s
.. list-table:: Available Parameters
:widths: 25 75

* - PREDICT (required)
* - PREDICT (**required**)
- The name of the column we wish to forecast.
* - TIME
- The name of the column that contains the datestamp, which should be in a format expected by pandas, ideally YYYY-MM-DD for a date or YYYY-MM-DD HH:MM:SS for a timestamp. Please visit the `pandas documentation <https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html>`_ for details. If not provided, an auto-increasing ID column will be used.
65 changes: 65 additions & 0 deletions docs/source/reference/ai/model-train-ludwig.rst
@@ -0,0 +1,65 @@
.. _ludwig:

Model Training with Ludwig
==========================

1. Installation
---------------

To use the `Ludwig framework <https://ludwig.ai/latest/>`_, you need to install the extra ludwig dependency in your EvaDB virtual environment.

.. code-block:: bash

   pip install evadb[ludwig]

2. Example Query
----------------

.. code-block:: sql

   CREATE OR REPLACE FUNCTION PredictHouseRent FROM
   ( SELECT sqft, location, rental_price FROM HomeRentals )
   TYPE Ludwig
   PREDICT 'rental_price'
   TIME_LIMIT 120;

In the above query, you are creating a new customized function by automatically training a model from the ``HomeRentals`` table.
The ``rental_price`` column will be the target column for prediction, while ``sqft`` and ``location`` are the inputs.

You can also simply provide all the other columns in ``HomeRentals`` as inputs and let the underlying AutoML framework figure it out. Below is an example query:

.. code-block:: sql

   CREATE FUNCTION IF NOT EXISTS PredictHouseRent FROM
   ( SELECT * FROM HomeRentals )
   TYPE Ludwig
   PREDICT 'rental_price'
   TIME_LIMIT 120;

.. note::

   Check out our :ref:`homerental-predict` use case for a working example.

3. Model Training Parameters
----------------------------

.. list-table:: Available Parameters
:widths: 25 75

* - PREDICT (**required**)
- The name of the column we wish to predict.
* - TIME_LIMIT
- Time limit to train the model in seconds. Default: 120.
* - TUNE_FOR_MEMORY
- Whether to refine hyperopt search space for available host / GPU memory. Default: False.

Below is an example query specifying the above parameters:

.. code-block:: sql

   CREATE FUNCTION IF NOT EXISTS PredictHouseRent FROM
   ( SELECT * FROM HomeRentals )
   TYPE Ludwig
   PREDICT 'rental_price'
   TIME_LIMIT 3600
   TUNE_FOR_MEMORY True;

26 changes: 26 additions & 0 deletions docs/source/reference/ai/model-train-sklearn.rst
@@ -0,0 +1,26 @@
.. _sklearn:

Model Training with Sklearn
============================

1. Installation
---------------

To use the `Sklearn framework <https://scikit-learn.org/stable/>`_, you need to install the extra sklearn dependency in your EvaDB virtual environment.

.. code-block:: bash

   pip install evadb[sklearn]

2. Example Query
----------------

.. code-block:: sql

   CREATE OR REPLACE FUNCTION PredictHouseRent FROM
   ( SELECT number_of_rooms, number_of_bathrooms, days_on_market, rental_price FROM HomeRentals )
   TYPE Sklearn
   PREDICT 'rental_price';

In the above query, you are creating a new customized function by training a model from the ``HomeRentals`` table using the ``Sklearn`` framework.
The ``rental_price`` column will be the target column for prediction, while the remaining columns from the ``SELECT`` query are the inputs.
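Once the function has been created, it can be applied to new data like any other EvaDB function. Below is a rough sketch of such a query, assuming the trained function takes the feature columns as arguments; the exact invocation may differ.

.. code-block:: sql

   SELECT PredictHouseRent(number_of_rooms, number_of_bathrooms, days_on_market)
   FROM HomeRentals
   LIMIT 10;
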
46 changes: 0 additions & 46 deletions docs/source/reference/ai/model-train.rst

This file was deleted.

2 changes: 1 addition & 1 deletion docs/source/reference/evaql/create.rst
@@ -117,7 +117,7 @@ Where the `parameter` is ``key value`` pair.

.. note::

Go over :ref:`hf`, :ref:`predict`, and :ref:`forecast` to check examples for creating function via type.
Go over :ref:`hf`, :ref:`ludwig`, and :ref:`forecast` to check examples for creating function via type.

CREATE MATERIALIZED VIEW
------------------------
42 changes: 11 additions & 31 deletions docs/source/shared/postgresql.rst
@@ -5,34 +5,14 @@ We will assume that you have a ``PostgreSQL`` database server running locally th

EvaDB lets you connect to your favorite databases, data warehouses, data lakes, etc., via the ``CREATE DATABASE`` statement. In this query, we connect EvaDB to an existing ``PostgreSQL`` server:

.. tab-set::

    .. tab-item:: Python

        .. code-block:: python

            params = {
                "user": "eva",
                "password": "password",
                "host": "localhost",
                "port": "5432",
                "database": "evadb",
            }
            query = f"""CREATE DATABASE postgres_data
                        WITH ENGINE = 'postgres',
                        PARAMETERS = {params};"""
            cursor.query(query).df()

    .. tab-item:: SQL

        .. code-block:: text

            CREATE DATABASE postgres_data
            WITH ENGINE = 'postgres',
            PARAMETERS = {
                "user": "eva",
                "password": "password",
                "host": "localhost",
                "port": "5432",
                "database": "evadb"
            }

.. code-block:: text

   CREATE DATABASE postgres_data
   WITH ENGINE = 'postgres',
   PARAMETERS = {
       "user": "eva",
       "password": "password",
       "host": "localhost",
       "port": "5432",
       "database": "evadb"
   }