
Commit

Starting the change for XGBoost integration into EVADb. (georgia-tech-db#1232)

Co-authored-by: Jineet Desai <[email protected]>
Co-authored-by: Andy Xu <[email protected]>
3 people authored and a0x8o committed Nov 22, 2023
1 parent af9f485 commit e19f144
Showing 7 changed files with 221 additions and 3 deletions.
19 changes: 19 additions & 0 deletions docs/_toc.yml
@@ -869,11 +869,15 @@ parts:
      - file: source/reference/ai/model-train-xgboost
        title: Model Training with XGBoost
@@ -890,6 +894,10 @@ parts:
      - file: source/reference/ai/model-train-xgboost
        title: Model Training with XGBoost
      - file: source/reference/ai/index
@@ -931,10 +939,13 @@ parts:
      - file: source/reference/ai/model-train
        title: Model Training
@@ -945,15 +956,19 @@ parts:
      - file: source/reference/ai/model-train-xgboost
        title: Model Training with XGBoost
@@ -1025,6 +1040,10 @@ parts:
      - file: source/reference/ai/model-forecasting
        title: Time Series Forecasting
      - file: source/reference/ai/hf
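The new TOC entries point at source/reference/ai/model-train-xgboost. Below is a minimal sketch of the training flow that page documents, assuming the EVAQL syntax mirrors the existing sklearn training page; the HomeRentals table, its columns, and the PredictRent function name are hypothetical placeholders, not part of this commit.

# Hedged sketch: train an XGBoost regressor through EVADb's Python API.
# Table, column, and function names are hypothetical. TIME_LIMIT and
# METRIC are assumed to map to the "time_limit" and "metric" metadata
# keys read by handle_xgboost_function in the diff below.
import evadb

cursor = evadb.connect().cursor()
cursor.query(
    """
    CREATE OR REPLACE FUNCTION PredictRent FROM
        ( SELECT number_of_rooms, sqft, rental_price FROM HomeRentals )
    TYPE XGBoost
    PREDICT 'rental_price'
    TIME_LIMIT 120
    METRIC 'r2';
    """
).df()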
90 changes: 90 additions & 0 deletions evadb/executor/create_function_executor.py
@@ -170,6 +170,7 @@
    DEFAULT_SKLEARN_TRAIN_MODEL,
@@ -178,9 +179,14 @@
    DEFAULT_TRAIN_REGRESSION_METRIC,
    DEFAULT_TRAIN_TIME_LIMIT,
    DEFAULT_XGBOOST_TASK,
    SKLEARN_SUPPORTED_MODELS,
@@ -373,6 +379,7 @@
    string_comparison_case_insensitive,
    try_to_import_torch,
    try_to_import_ultralytics,
    try_to_import_xgboost,
@@ -398,6 +405,8 @@
)
from evadb.utils.logging_manager import logger

@@ -1107,12 +1116,18 @@ def handle_sklearn_function(self):
            FunctionMetadataCatalogEntry("model_path", model_path)
        )
        # Pass the prediction column name to sklearn.py
        self.node.metadata.append(
            FunctionMetadataCatalogEntry("predict_col", arg_map["predict"])
        )

        impl_path = Path(f"{self.function_dir}/sklearn.py").absolute().as_posix()
        io_list = self._resolve_function_io(None)
@@ -1130,6 +1145,7 @@ def handle_sklearn_function(self):
    def handle_xgboost_function(self):
        """Handle XGBoost functions.

        We use the FLAML AutoML library to train XGBoost models.
        """
        try_to_import_xgboost()

        assert (
            len(self.children) == 1
        ), "Create XGBoost function expects 1 child, finds {}.".format(
            len(self.children)
        )

        aggregated_batch_list = []
        child = self.children[0]
        for batch in child.exec():
            aggregated_batch_list.append(batch)
        aggregated_batch = Batch.concat(aggregated_batch_list, copy=False)
        aggregated_batch.drop_column_alias()

        arg_map = {arg.key: arg.value for arg in self.node.metadata}
        from flaml import AutoML

        model = AutoML()
        settings = {
            "time_budget": arg_map.get("time_limit", DEFAULT_TRAIN_TIME_LIMIT),
            "metric": arg_map.get("metric", DEFAULT_TRAIN_REGRESSION_METRIC),
            "estimator_list": ["xgboost"],
            "task": arg_map.get("task", DEFAULT_XGBOOST_TASK),
        }
        model.fit(
            dataframe=aggregated_batch.frames, label=arg_map["predict"], **settings
        )
        model_path = os.path.join(
            self.db.config.get_value("storage", "model_dir"), self.node.name
        )
        pickle.dump(model, open(model_path, "wb"))
        self.node.metadata.append(
            FunctionMetadataCatalogEntry("model_path", model_path)
        )
        # Pass the prediction column to xgboost.py.
        self.node.metadata.append(
            FunctionMetadataCatalogEntry("predict_col", arg_map["predict"])
        )

        impl_path = Path(f"{self.function_dir}/xgboost.py").absolute().as_posix()
        io_list = self._resolve_function_io(None)
        return (
            self.node.name,
            impl_path,
            self.node.function_type,
            io_list,
            self.node.metadata,
        )

    def handle_ultralytics_function(self):
        """Handle Ultralytics functions"""
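The handler registers evadb/functions/xgboost.py as the implementation file, but that file is not among the rendered portions of this diff. Here is a plausible sketch of its forward pass, assuming it mirrors the sklearn.py wrapper changed below; the class name and method signatures are guesses, not part of this commit.

# Hypothetical sketch of evadb/functions/xgboost.py, assumed to mirror
# the sklearn.py wrapper shown below. Class name and signatures are
# assumptions, not the committed code.
import pickle

import pandas as pd


class GenericXGBoostModel:
    def setup(self, model_path: str, predict_col: str, **kwargs):
        # Load the FLAML AutoML model pickled by handle_xgboost_function.
        self.model = pickle.load(open(model_path, "rb"))
        self.predict_col = predict_col

    def forward(self, frames: pd.DataFrame) -> pd.DataFrame:
        # Drop the target column by name before predicting, then name
        # the output column after the training target.
        features = frames.drop([self.predict_col], axis=1)
        predictions = self.model.predict(features)
        return pd.DataFrame({self.predict_col: predictions})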
@@ -3455,6 +3537,9 @@ def exec(self, *args, **kwargs):
                train_time,
            ) = self.handle_sklearn_function()
        elif string_comparison_case_insensitive(self.node.function_type, "XGBoost"):
            (
                name,
@@ -3466,12 +3551,14 @@ def exec(self, *args, **kwargs):
                best_score,
                train_time,
            ) = self.handle_xgboost_function()
        elif string_comparison_case_insensitive(self.node.function_type, "Forecasting"):
            (
                name,
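At query time, a function created with TYPE XGBoost is dispatched through handle_xgboost_function above and then invoked like any other EVADb function. A hedged usage sketch continuing the hypothetical PredictRent example from earlier; note that the target column is passed along with the features, since the wrapper's forward() drops it by name before predicting.

# Hypothetical inference call; reuses the cursor and the PredictRent
# function from the training sketch above.
result = cursor.query(
    "SELECT PredictRent(number_of_rooms, sqft, rental_price) FROM HomeRentals;"
).df()
print(result.head())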
25 changes: 22 additions & 3 deletions evadb/functions/sklearn.py
@@ -33,10 +33,13 @@ def name(self) -> str:
    def setup(self, model_path: str, predict_col: str, **kwargs):
        try_to_import_flaml_automl()

@@ -56,6 +59,7 @@ def forward(self, frames: pd.DataFrame) -> pd.DataFrame:
    def setup(self, model_path: str, predict_col: str, **kwargs):
@@ -74,36 +78,51 @@ def forward(self, frames: pd.DataFrame) -> pd.DataFrame:
        try_to_import_sklearn()

        self.model = pickle.load(open(model_path, "rb"))
        self.predict_col = predict_col

    def forward(self, frames: pd.DataFrame) -> pd.DataFrame:
        # Do not pass the prediction column to sklearn's predict method.
        frames.drop([self.predict_col], axis=1, inplace=True)
        predictions = self.model.predict(frames)
        predict_df = pd.DataFrame(predictions)
        # Rename the output column to match the predict column passed in
        # the training query.
        predict_df.rename(columns={0: self.predict_col}, inplace=True)
        return predict_df

    def to_device(self, device: str):
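The switch from positional slicing (iloc[:, :-1]) to dropping predict_col by name means the wrapper no longer assumes the target is the last column of the input frame. A small self-contained illustration of the difference; the DataFrame and column names are made up.

# Toy illustration: dropping the target by name is safer than slicing
# off the last column. All names here are illustrative only.
import pandas as pd

frames = pd.DataFrame(
    {"rental_price": [1200.0, 950.0], "sqft": [800, 650], "rooms": [3, 2]}
)

# Positional slicing keeps rental_price whenever the target is not the
# last column; the target leaks into the features (wrong):
leaky = frames.iloc[:, :-1]

# Dropping by name removes exactly the target column:
features = frames.drop(["rental_price"], axis=1)

print(leaky.columns.tolist())     # ['rental_price', 'sqft']
print(features.columns.tolist())  # ['sqft', 'rooms']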