Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

migrate xgboost to trainer implementation #184

Merged
merged 1 commit into from
Nov 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cmd/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

data_path = "/data"
default_output_filename = "output"
default_trainer_names = [ 'PolynomialRegressionTrainer', 'GradientBoostingRegressorTrainer', 'SGDRegressorTrainer', 'KNeighborsRegressorTrainer', 'LinearRegressionTrainer','SVRRegressorTrainer']
default_trainer_names = [ 'PolynomialRegressionTrainer', 'GradientBoostingRegressorTrainer', 'SGDRegressorTrainer', 'KNeighborsRegressorTrainer', 'LinearRegressionTrainer','SVRRegressorTrainer', 'XgboostFitTrainer']
default_trainers = ",".join(default_trainer_names)
default_version = "v0.6"

Expand Down
4 changes: 3 additions & 1 deletion src/estimate/model/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,13 @@
from prom_types import TIMESTAMP_COL, valid_container_query

from scikit_model import ScikitModel
from xgboost_model import XgboostModel
# from keras_model import KerasModel

# model wrapper
MODELCLASS = {
'scikit': ScikitModel
'scikit': ScikitModel,
'xgboost': XgboostModel
# 'keras': KerasModel,
}

Expand Down
51 changes: 51 additions & 0 deletions src/estimate/model/xgboost_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import os
import sys
cur_path = os.path.join(os.path.dirname(__file__), '.')
sys.path.append(cur_path)

from estimate_common import transform_and_predict, load_model_by_pickle, load_model_by_json, is_component_model
import xgboost as xgb
import os
import sys
src_path = os.path.join(os.path.dirname(__file__), '..', '..')
sys.path.append(src_path)

from util import ModelOutputType

import collections.abc

class XgboostModel():
    """Power-estimation wrapper around one or more XGBoost regressors.

    A "component" model file is a JSON index that maps each power component
    (e.g. package, dram) to its own model metadata; in that case one nested
    XgboostModel is built per component. Otherwise a single XGBoost model is
    loaded directly from *model_file*.
    """

    def __init__(self, model_path, model_name, output_type, model_file, features, fe_files, component_init=False):
        self.name = model_name
        self.features = features
        self.output_type = ModelOutputType[output_type]

        # component_init=True marks a nested per-component model and stops
        # the recursion at one level deep.
        self.comp_type = not component_init and is_component_model(model_file)
        if self.comp_type:
            self.models = dict()
            model_info = load_model_by_json(model_path, model_file)
            for comp, model_metadata in model_info.items():
                model = XgboostModel(model_path, self.name, self.output_type.name, model_metadata['model_file'], model_metadata['features'], model_metadata['fe_files'], component_init=True)
                self.models[comp] = model
        else:
            filepath = os.path.join(model_path, model_file)
            # hyperparameters only seed the estimator object; the real state
            # comes from load_model (XGBoost JSON checkpoint)
            self.model = xgb.XGBRegressor(n_estimators=1000, learning_rate=0.1)
            self.model.load_model(filepath)
            # pickled feature-engineering transformers, applied in order
            self.fe_list = []
            for fe_filename in fe_files:
                self.fe_list += [load_model_by_pickle(model_path, fe_filename)]

    def get_power(self, request):
        """Predict power for *request*.

        Returns (results, msg): a dict of per-component prediction lists for
        component models, or the raw transform_and_predict() result for a
        single model. A non-empty msg indicates failure.
        """
        if self.comp_type:
            results = dict()
            # FIX: msg was previously unbound (NameError) when self.models
            # was empty; initialize to the "no error" value.
            msg = ""
            for comp, model in self.models.items():
                y, msg = transform_and_predict(model, request)
                if msg != "":
                    return [], msg
                # normalize scalar predictions into a list
                if not isinstance(y, collections.abc.Sequence):
                    y = [y]
                results[comp] = y
            return results, msg
        else:
            return transform_and_predict(self, request)

20 changes: 20 additions & 0 deletions src/train/trainer/XgboostFitTrainer/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
from sklearn.model_selection import RepeatedKFold, cross_val_score

import os
import sys
trainer_path = os.path.join(os.path.dirname(__file__), '..')
sys.path.append(trainer_path)

from train.trainer.xgboost_interface import XgboostTrainer

class XgboostFitTrainer(XgboostTrainer):
    """Concrete XGBoost trainer that (re)fits the regressor directly.

    When a model for the given node type/component has already been fitted,
    training continues from the existing booster (incremental fit);
    otherwise a fresh fit is performed.
    """

    def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name):
        super(XgboostFitTrainer, self).__init__(energy_components, feature_group, energy_source, node_level, pipeline_name=pipeline_name)
        # no feature-engineering artifacts for this trainer
        self.fe_files = []

    def _train(self, node_type, component, X_values, y_values):
        regressor = self.node_models[node_type][component]
        fit_kwargs = {}
        if regressor.__sklearn_is_fitted__():
            # continue training from the previously fitted booster
            fit_kwargs["xgb_model"] = regressor
        regressor.fit(X_values, y_values, **fit_kwargs)
2 changes: 1 addition & 1 deletion src/train/trainer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def process(self, data, power_labels, pipeline_lock):
self.load_model(node_type)
node_type_filtered_data = data[data[node_info_column] == node_type]
if self.node_scalers[node_type] is None:
self.print_log("fit scaler to latest data".format(node_type, self.feature_group_name))
self.print_log("fit scaler to latest data {1} for node_type={0}".format(node_type, self.feature_group_name))
# no profiled scaler
x_values = node_type_filtered_data[self.features].values
self.node_scalers[node_type] = MaxAbsScaler()
Expand Down
107 changes: 107 additions & 0 deletions src/train/trainer/xgboost_interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
from sklearn.metrics import mean_absolute_error
import os
import sys
import xgboost as xgb

util_path = os.path.join(os.path.dirname(__file__), '..', '..', 'util')
sys.path.append(util_path)

from util import save_pkl, load_pkl, load_json
from abc import abstractmethod

from . import Trainer

# identifier for this trainer family, passed to the Trainer base constructor
model_class = "xgboost"

def get_save_path(model_filepath):
    """Return the directory portion of a '/'-separated path ("" if no '/')."""
    directory, _, _ = model_filepath.rpartition("/")
    return directory

def _json_filepath(filepath):
if ".json" not in filepath:
filepath += ".json"
return filepath

class XgboostTrainer(Trainer):
    """Abstract Trainer base for XGBoost regressors.

    Handles checkpointing in XGBoost's JSON format plus optional pickled
    feature-engineering (fe) transformers; subclasses implement the actual
    fitting via _train().
    """

    def __init__(self, energy_components, feature_group, energy_source, node_level, pipeline_name, scaler_type="maxabs"):
        super(XgboostTrainer, self).__init__(model_class, energy_components, feature_group, energy_source, node_level, pipeline_name, scaler_type=scaler_type)
        # filenames of pickled feature-engineering transformers; parallels
        # self.fe when a subclass defines one
        self.fe_files = []

    def init_model(self):
        # NOTE(review): hyperparameters are duplicated in the estimator-side
        # XgboostModel — confirm they stay in sync
        return xgb.XGBRegressor(n_estimators=1000, learning_rate=0.1)

    @abstractmethod
    def _train(self, node_type, component, X_values, y_values):
        # subclass hook: fit the model stored in
        # self.node_models[node_type][component]
        return NotImplemented

    def train(self, node_type, component, X_values, y_values):
        """Apply feature-engineering transforms (if any), then delegate to _train()."""
        if hasattr(self, 'fe'):
            for index in range(len(self.fe)):
                X_values = self.fe[index].fit_transform(X_values)
        self._train(node_type, component, X_values, y_values)

    def save_checkpoint(self, model, filepath):
        """Save *model* as JSON at *filepath*, plus any fe transformers as pickles."""
        filepath = _json_filepath(filepath)
        if hasattr(self, 'fe'):
            # fe pickles are stored next to the model checkpoint
            save_path = get_save_path(filepath)
            for index in range(len(self.fe)):
                save_pkl(save_path, self.fe_files[index], self.fe[index])
        model.save_model(filepath)

    def load_local_checkpoint(self, filepath):
        """Load a checkpoint from disk.

        Returns (model, ok): the loaded model (or None) and whether loading
        succeeded. Also restores any fe transformers found on disk.
        """
        filepath = _json_filepath(filepath)
        if hasattr(self, 'fe_files'):
            save_path = get_save_path(filepath)
            # NOTE(review): assumes self.fe exists and parallels fe_files
            # whenever fe_files is non-empty — verify in subclasses
            for index in range(len(self.fe_files)):
                loaded_fe = load_pkl(save_path, self.fe_files[index])
                if loaded_fe is not None:
                    self.fe[index] = loaded_fe
        loaded_model = None
        if os.path.exists(filepath):
            loaded_model = self.init_model()
            loaded_model.load_model(filepath)
        return loaded_model, loaded_model is not None

    # TODO: decide on a real archiving policy; always archive for now
    def should_archive(self, node_type):
        return True

    def get_basic_metadata(self, node_type):
        # no extra metadata beyond what the Trainer base records
        return dict()

    def get_mae(self, node_type, component, X_test, y_test):
        """Mean absolute error of the component model on (X_test, y_test)."""
        predicted_values = self.predict(node_type, component, X_test, skip_preprocess=True)
        mae = mean_absolute_error(y_test, predicted_values)
        return mae

    def save_model(self, component_save_path, node_type, component):
        """Checkpoint the model for (node_type, component) under *component_save_path*."""
        model = self.node_models[node_type][component]
        filepath = os.path.join(component_save_path, self.component_model_filename(component))
        self.save_checkpoint(model, filepath)

    def component_model_filename(self, component):
        return component + ".json"

    def get_weight_dict(self, node_type):
        """Export per-component weights in the generic weight-dict schema.

        Returns None if any component's checkpoint cannot be loaded.
        """
        weight_dict = dict()
        for component in self.energy_components:
            scaler = self.node_scalers[node_type]
            checkpoint_filename = _json_filepath(self._checkpoint_filename(component, node_type))
            model_in_json = load_json(self.checkpoint_toppath, checkpoint_filename)
            if model_in_json is None:
                self.print_log("cannot load model in json")
                # failed to get model from local checkpoint
                return
            # NOTE(review): the entire serialized model JSON is stored as the
            # "weight" of every feature — the tree model has no per-feature
            # linear weight; confirm consumers expect this shape
            weight_dict[component] = {
                "All_Weights": {
                    "Bias_Weight": 0,
                    "Categorical_Variables": dict(),
                    "Numerical_Variables": {self.features[i]:
                                                {"scale": scaler.scale_[i],
                                                 "mean": 0,
                                                 "variance": 0,
                                                 "weight": model_in_json
                                                 }
                                            for i in range(len(self.features))},
                }
            }
        return weight_dict
3 changes: 1 addition & 2 deletions tests/trainer_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@
import pandas as pd
import threading

test_trainer_names = [ 'PolynomialRegressionTrainer', 'GradientBoostingRegressorTrainer', 'SGDRegressorTrainer', 'KNeighborsRegressorTrainer', 'LinearRegressionTrainer','SVRRegressorTrainer']

test_trainer_names = [ 'PolynomialRegressionTrainer', 'GradientBoostingRegressorTrainer', 'SGDRegressorTrainer', 'KNeighborsRegressorTrainer', 'LinearRegressionTrainer','SVRRegressorTrainer', 'XgboostFitTrainer' ]
pipeline_lock = threading.Lock()

def assert_train(trainer, data, energy_components):
Expand Down
Loading