
Commit

Starting the change for XGBoost integration into EVADb. (georgia-tech-db#1232)

Co-authored-by: Jineet Desai <[email protected]>
Co-authored-by: Andy Xu <[email protected]>
3 people authored and a0x8o committed Nov 22, 2023
1 parent af9f485 commit e19f144
Showing 7 changed files with 221 additions and 3 deletions.
19 changes: 19 additions & 0 deletions docs/_toc.yml
@@ -869,11 +869,15 @@ parts:
      - file: source/reference/ai/model-train-xgboost
        title: Model Training with XGBoost
@@ -890,6 +894,10 @@ parts:
      - file: source/reference/ai/model-train-xgboost
        title: Model Training with XGBoost
      - file: source/reference/ai/index
@@ -931,10 +939,13 @@ parts:
      - file: source/reference/ai/model-train
        title: Model Training
@@ -945,15 +956,19 @@ parts:
      - file: source/reference/ai/model-train-xgboost
        title: Model Training with XGBoost
@@ -1025,6 +1040,10 @@ parts:
      - file: source/reference/ai/model-forecasting
        title: Time Series Forecasting
      - file: source/reference/ai/hf
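The new TOC entries point at source/reference/ai/model-train-xgboost. Below is a minimal sketch of the training flow that page documents, assuming the EVAQL syntax mirrors the existing sklearn training page; the HomeRentals table, its columns, and the PredictRent function name are hypothetical placeholders, not part of this commit.

# Hedged sketch: train an XGBoost regressor through EVADb's Python API.
# Table, column, and function names are hypothetical. TIME_LIMIT and
# METRIC are assumed to map to the "time_limit" and "metric" metadata
# keys read by handle_xgboost_function in the diff below.
import evadb

cursor = evadb.connect().cursor()
cursor.query(
    """
    CREATE OR REPLACE FUNCTION PredictRent FROM
        ( SELECT number_of_rooms, sqft, rental_price FROM HomeRentals )
    TYPE XGBoost
    PREDICT 'rental_price'
    TIME_LIMIT 120
    METRIC 'r2';
    """
).df()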
90 changes: 90 additions & 0 deletions evadb/executor/create_function_executor.py
@@ -170,6 +170,7 @@
    DEFAULT_SKLEARN_TRAIN_MODEL,
@@ -178,9 +179,14 @@
    DEFAULT_TRAIN_REGRESSION_METRIC,
    DEFAULT_TRAIN_TIME_LIMIT,
    DEFAULT_XGBOOST_TASK,
    SKLEARN_SUPPORTED_MODELS,
@@ -373,6 +379,7 @@
    string_comparison_case_insensitive,
    try_to_import_torch,
    try_to_import_ultralytics,
    try_to_import_xgboost,
@@ -398,6 +405,8 @@
)
from evadb.utils.logging_manager import logger

@@ -1107,12 +1116,18 @@ def handle_sklearn_function(self):
            FunctionMetadataCatalogEntry("model_path", model_path)
        )
        # Pass the prediction column name to sklearn.py
        self.node.metadata.append(
            FunctionMetadataCatalogEntry("predict_col", arg_map["predict"])
        )

        impl_path = Path(f"{self.function_dir}/sklearn.py").absolute().as_posix()
        io_list = self._resolve_function_io(None)
@@ -1130,6 +1145,7 @@ def handle_sklearn_function(self):
    def handle_xgboost_function(self):
        """Handle XGBoost functions.

        We use the FLAML AutoML library to train XGBoost models.
        """
        try_to_import_xgboost()

        assert (
            len(self.children) == 1
        ), "Create XGBoost function expects 1 child, finds {}.".format(
            len(self.children)
        )

        aggregated_batch_list = []
        child = self.children[0]
        for batch in child.exec():
            aggregated_batch_list.append(batch)
        aggregated_batch = Batch.concat(aggregated_batch_list, copy=False)
        aggregated_batch.drop_column_alias()

        arg_map = {arg.key: arg.value for arg in self.node.metadata}
        from flaml import AutoML

        model = AutoML()
        settings = {
            "time_budget": arg_map.get("time_limit", DEFAULT_TRAIN_TIME_LIMIT),
            "metric": arg_map.get("metric", DEFAULT_TRAIN_REGRESSION_METRIC),
            "estimator_list": ["xgboost"],
            "task": arg_map.get("task", DEFAULT_XGBOOST_TASK),
        }
        model.fit(
            dataframe=aggregated_batch.frames, label=arg_map["predict"], **settings
        )
        model_path = os.path.join(
            self.db.config.get_value("storage", "model_dir"), self.node.name
        )
        pickle.dump(model, open(model_path, "wb"))
        self.node.metadata.append(
            FunctionMetadataCatalogEntry("model_path", model_path)
        )
        # Pass the prediction column to xgboost.py.
        self.node.metadata.append(
            FunctionMetadataCatalogEntry("predict_col", arg_map["predict"])
        )

        impl_path = Path(f"{self.function_dir}/xgboost.py").absolute().as_posix()
        io_list = self._resolve_function_io(None)
        return (
            self.node.name,
            impl_path,
            self.node.function_type,
            io_list,
            self.node.metadata,
        )

    def handle_ultralytics_function(self):
        """Handle Ultralytics functions"""
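The handler registers evadb/functions/xgboost.py as the implementation file, but that file is not among the rendered portions of this diff. Here is a plausible sketch of its forward pass, assuming it mirrors the sklearn.py wrapper changed below; the class name and method signatures are guesses, not part of this commit.

# Hypothetical sketch of evadb/functions/xgboost.py, assumed to mirror
# the sklearn.py wrapper shown below. Class name and signatures are
# assumptions, not the committed code.
import pickle

import pandas as pd


class GenericXGBoostModel:
    def setup(self, model_path: str, predict_col: str, **kwargs):
        # Load the FLAML AutoML model pickled by handle_xgboost_function.
        self.model = pickle.load(open(model_path, "rb"))
        self.predict_col = predict_col

    def forward(self, frames: pd.DataFrame) -> pd.DataFrame:
        # Drop the target column by name before predicting, then name
        # the output column after the training target.
        features = frames.drop([self.predict_col], axis=1)
        predictions = self.model.predict(features)
        return pd.DataFrame({self.predict_col: predictions})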
@@ -3455,6 +3537,9 @@ def exec(self, *args, **kwargs):
                train_time,
            ) = self.handle_sklearn_function()
        elif string_comparison_case_insensitive(self.node.function_type, "XGBoost"):
            (
                name,
@@ -3466,12 +3551,14 @@ def exec(self, *args, **kwargs):
                best_score,
                train_time,
            ) = self.handle_xgboost_function()
        elif string_comparison_case_insensitive(self.node.function_type, "Forecasting"):
            (
                name,
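At query time, a function created with TYPE XGBoost is dispatched through handle_xgboost_function above and then invoked like any other EVADb function. A hedged usage sketch continuing the hypothetical PredictRent example from earlier; note that the target column is passed along with the features, since the wrapper's forward() drops it by name before predicting.

# Hypothetical inference call; reuses the cursor and the PredictRent
# function from the training sketch above.
result = cursor.query(
    "SELECT PredictRent(number_of_rooms, sqft, rental_price) FROM HomeRentals;"
).df()
print(result.head())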
25 changes: 22 additions & 3 deletions evadb/functions/sklearn.py
@@ -33,10 +33,13 @@ def name(self) -> str:
    def setup(self, model_path: str, predict_col: str, **kwargs):
        try_to_import_flaml_automl()

@@ -56,6 +59,7 @@ def forward(self, frames: pd.DataFrame) -> pd.DataFrame:
    def setup(self, model_path: str, predict_col: str, **kwargs):
@@ -74,36 +78,51 @@ def forward(self, frames: pd.DataFrame) -> pd.DataFrame:
        try_to_import_sklearn()

        self.model = pickle.load(open(model_path, "rb"))
        self.predict_col = predict_col

    def forward(self, frames: pd.DataFrame) -> pd.DataFrame:
        # Do not pass the prediction column to sklearn's predict method.
        frames.drop([self.predict_col], axis=1, inplace=True)
        predictions = self.model.predict(frames)
        predict_df = pd.DataFrame(predictions)
        # Rename the output column to match the predict column passed in
        # the training query.
        predict_df.rename(columns={0: self.predict_col}, inplace=True)
        return predict_df

    def to_device(self, device: str):
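The switch from positional slicing (iloc[:, :-1]) to dropping predict_col by name means the wrapper no longer assumes the target is the last column of the input frame. A small self-contained illustration of the difference; the DataFrame and column names are made up.

# Toy illustration: dropping the target by name is safer than slicing
# off the last column. All names here are illustrative only.
import pandas as pd

frames = pd.DataFrame(
    {"rental_price": [1200.0, 950.0], "sqft": [800, 650], "rooms": [3, 2]}
)

# Positional slicing keeps rental_price whenever the target is not the
# last column; the target leaks into the features (wrong):
leaky = frames.iloc[:, :-1]

# Dropping by name removes exactly the target column:
features = frames.drop(["rental_price"], axis=1)

print(leaky.columns.tolist())     # ['rental_price', 'sqft']
print(features.columns.tolist())  # ['sqft', 'rooms']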