Test for Bayesian Optimization Algo (#406)
* added tests for acquisition function and models

* added tests for global_optimizer

* added tests for boa

* minor linting

* tests for algorithm manager

* added discrete parameter to study config

* covered all parameter types

* moved python script to testing folder

* added python tests to unit tests

* remembered to uncomment existing tests

* fixed path to test script

* moved python tests to separate job in workflow

* added run command to test script
jdplatt authored and k8s-ci-robot committed Mar 11, 2019
1 parent 61451ef commit c87d583
Showing 25 changed files with 419 additions and 125 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -1,6 +1,9 @@
# python ignore files
__pycache__/
.idea/
+.coverage
+.pytest_cache
+*.egg-info

# Project specific ignore files
*.swp
File renamed without changes.
36 changes: 36 additions & 0 deletions pkg/suggestion/bayesianoptimization/src/acquisition_func.py
@@ -0,0 +1,36 @@
""" module for acquisition function"""
import numpy as np
from scipy.stats import norm


class AcquisitionFunc:
"""
Class for acquisition function with options for expected improvement,
probability of improvement, or lower confident bound.
"""

def __init__(self, model, current_optimal, mode="ei", trade_off=0.01):
"""
:param mode: pi: probability of improvement, ei: expected improvement, lcb: lower confident bound
:param trade_off: a parameter to control the trade off between exploiting and exploring
:param model_type: gp: gaussian process, rf: random forest
"""
self.model = model
self.current_optimal = current_optimal
self.mode = mode
self.trade_off = trade_off

def compute(self, X_test):
y_mean, y_std, y_variance = self.model.predict(X_test)

z = (y_mean - self.current_optimal - self.trade_off) / y_std

if self.mode == "ei":
if y_std.any() < 0.000001:
return 0, y_mean, y_variance
result = y_std * (z * norm.cdf(z) + norm.pdf(z))
elif self.mode == "pi":
result = norm.cdf(z)
else:
result = - (y_mean - self.trade_off * y_std)
return np.squeeze(result), np.squeeze(y_mean), np.squeeze(y_variance)
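
For context, a minimal usage sketch of the new module (not part of the commit). The stub surrogate below is hypothetical; any object whose predict(X) returns (mean, std, variance) fits, which is exactly the contract GaussianProcessModel and RandomForestModel provide:

import numpy as np

from pkg.suggestion.bayesianoptimization.src.acquisition_func import AcquisitionFunc


class StubModel:
    """ hypothetical surrogate: predict(X) -> (mean, std, variance) """
    def predict(self, X_test):
        n = X_test.shape[0]
        return np.full(n, 1.2), np.full(n, 0.3), np.full(n, 0.09)


aq = AcquisitionFunc(StubModel(), current_optimal=1.0, mode="ei", trade_off=0.01)
ei, y_mean, y_variance = aq.compute(np.zeros((5, 2)))
print(ei)  # expected improvement at each of the 5 test points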

This file was deleted.

35 changes: 13 additions & 22 deletions pkg/suggestion/bayesianoptimization/src/algorithm_manager.py
@@ -1,39 +1,31 @@
""" module for algorithm manager """

import numpy as np

from pkg.api.python import api_pb2
-import logging
-from logging import getLogger, StreamHandler, INFO, DEBUG
+from .utils import get_logger


def deal_with_discrete(feasible_values, current_value):
    """ function to embed the current value into the feasible discrete space"""
    diff = np.subtract(feasible_values, current_value)
    diff = np.absolute(diff)
    return feasible_values[np.argmin(diff)]


def deal_with_categorical(feasible_values, one_hot_values):
    """ function to decode a one-hot encoded categorical value back to its feasible value """
-    #index = np.argmax(one_hot_values)
-    index = one_hot_values.argmax()
+    index = np.argmax(one_hot_values)
+    #index = one_hot_values.argmax()
    return feasible_values[int(index)]


class AlgorithmManager:
    """ class for the algorithm manager
    provide some helper functions
    """
    def __init__(self, study_id, study_config, X_train, y_train, logger=None):
-        if logger == None:
-            self.logger = getLogger(__name__)
-            FORMAT = '%(asctime)-15s StudyID %(studyid)s %(message)s'
-            logging.basicConfig(format=FORMAT)
-            handler = StreamHandler()
-            handler.setLevel(DEBUG)
-            self.logger.setLevel(DEBUG)
-            self.logger.addHandler(handler)
-            self.logger.propagate = False
-        else:
-            self.logger = logger
+        self.logger = logger if (logger is not None) else get_logger()
        self._study_id = study_id
        self._study_config = study_config
        self._goal = self._study_config.optimization_type
@@ -82,7 +74,7 @@ def lower_bound(self):

    @property
    def upper_bound(self):
-        """ return the ipper bound of all the parameters """
+        """ return the upper bound of all the parameters """
        return self._upperbound

@property
@@ -118,10 +110,10 @@ def y_train(self)
    def _parse_config(self):
        """ extract info from the study configuration """
        for i, param in enumerate(self._study_config.parameter_configs.configs):
-            self._name_id[param.name]=i
+            self._name_id[param.name] = i
            self._types.append(param.parameter_type)
            self._names.append(param.name)
-            if param.parameter_type == api_pb2.DOUBLE or param.parameter_type == api_pb2.INT:
+            if param.parameter_type in [api_pb2.DOUBLE, api_pb2.INT]:
                self._dim = self._dim + 1
                self._lowerbound.append(float(param.feasible.min))
                self._upperbound.append(float(param.feasible.max))
@@ -158,15 +150,15 @@ def _mapping_params(self, parameters_list):
            for p in parameters:
                self.logger.debug("mapping: %r", p, extra={"StudyID": self._study_id})
                map_id = self._name_id[p.name]
-                if self._types[map_id] == api_pb2.DOUBLE or self._types[map_id] == api_pb2.INT or self._types[map_id] == api_pb2.DISCRETE:
+                if self._types[map_id] in [api_pb2.DOUBLE, api_pb2.INT, api_pb2.DISCRETE]:
                    maplist[map_id] = float(p.value)
                elif self._types[map_id] == api_pb2.CATEGORICAL:
                    for ci in self._categorical_info:
                        if ci["name"] == p.name:
                            maplist[map_id] = np.zeros(ci["number"])
                            for i, v in enumerate(ci["values"]):
                                if v == p.value:
-                                    maplist[map_id][i]=1
+                                    maplist[map_id][i] = 1
                                    break
            self.logger.debug("mapped: %r", maplist, extra={"StudyID": self._study_id})
            ret.append(np.hstack(maplist))
@@ -234,4 +226,3 @@ def convert_to_dict(self, x_next):
            })
            result.append(tmp)
        return result
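
A quick sketch of the two module-level helpers above (illustrative, not part of the commit): deal_with_discrete snaps a continuous suggestion onto the nearest feasible value, and deal_with_categorical decodes a one-hot block back to its category:

import numpy as np

from pkg.suggestion.bayesianoptimization.src.algorithm_manager import (
    deal_with_categorical, deal_with_discrete)

print(deal_with_discrete(np.array([2, 4, 8, 16]), 5.3))
# -> 4, the feasible value closest to 5.3

print(deal_with_categorical(["sgd", "adam", "ftrl"], np.array([0.1, 0.7, 0.2])))
# -> "adam", the category whose one-hot slot has the largest value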

@@ -2,7 +2,7 @@
import numpy as np
from sklearn.preprocessing import MinMaxScaler

-from pkg.suggestion.bayesianoptimization.src.global_optimizer.global_optimizer import GlobalOptimizer
+from .global_optimizer import GlobalOptimizer


class BOAlgorithm:
@@ -54,7 +54,7 @@ def get_suggestion(self, request_num):
        x_next_list = []
        if self.X_train is None and self.y_train is None and self.current_optimal is None:
            # randomly pick a point as the first trial
-            for i in range(request_num):
+            for _ in range(request_num):
                x_next_list.append(np.random.uniform(self.lowerbound, self.upperbound, size=(1, self.dim)))
        else:
            _, x_next_list_que = self.optimizer.direct(request_num)
@@ -2,12 +2,13 @@
DIRECT algorithm is used in this case
"""
import copy

import numpy as np
from collections import deque

-from pkg.suggestion.bayesianoptimization.src.acquisition_func.acquisition_func import AcquisitionFunc
-import logging
-from logging import getLogger, StreamHandler, INFO, DEBUG
+from .acquisition_func import AcquisitionFunc
+from .model.gp import GaussianProcessModel
+from .model.rf import RandomForestModel
+from .utils import get_logger


class RectPack:
@@ -74,37 +75,31 @@ class GlobalOptimizer:

    def __init__(self, N, l, u, scaler, X_train, y_train, current_optimal, mode, trade_off, length_scale,
                 noise, nu, kernel_type, n_estimators, max_features, model_type, logger=None):
-        if logger == None:
-            self.logger = getLogger(__name__)
-            FORMAT = '%(asctime)-15s StudyID %(studyid)s %(message)s'
-            logging.basicConfig(format=FORMAT)
-            handler = StreamHandler()
-            handler.setLevel(INFO)
-            self.logger.setLevel(INFO)
-            self.logger.addHandler(handler)
-            self.logger.propagate = False
-        else:
-            self.logger = logger

+        self.logger = logger if (logger is not None) else get_logger()
        self.N = N
        self.l = l
        self.u = u
        self.scaler = scaler
        self.buckets = []
        self.dim = None
+        if model_type == "gp":
+            model = GaussianProcessModel(
+                length_scale=length_scale,
+                noise=noise,
+                nu=nu,
+                kernel_type=kernel_type,
+            )
+        else:
+            model = RandomForestModel(
+                n_estimators=n_estimators,
+                max_features=max_features,
+            )
+        model.fit(X_train, y_train)
        self.aq_func = AcquisitionFunc(
-            X_train=X_train,
-            y_train=y_train,
+            model=model,
            current_optimal=current_optimal,
            mode=mode,
            trade_off=trade_off,
-            length_scale=length_scale,
-            noise=noise,
-            nu=nu,
-            kernel_type=kernel_type,
-            n_estimators=n_estimators,
-            max_features=max_features,
-            model_type=model_type,
        )

def potential_opt(self, f_min):
@@ -174,7 +169,7 @@ def direct(self, request_num):
        x_next = first_rect.center
        ei_min.append(f_min)

-        for t in range(self.N):
+        for _ in range(self.N):
            opt_set = self.potential_opt(f_min)

            # for bucket in self.buckets:
@@ -215,7 +210,7 @@ def sample_buckets(self, request_num):
            fc_sum -= a.fc
            bucket_index.append([-a.fc, a.center])
        bucket_index = sorted(bucket_index, key=lambda x: x[0])
-        for i in range(request_num):
+        for _ in range(request_num):
            sample = np.random.rand()
            stick = 0.0
            for b in bucket_index:
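
The truncated sample_buckets above implements roulette-wheel sampling over rectangle centers, weighted by their (negated) acquisition values. A self-contained sketch of the same idea, with hypothetical data (not part of the commit):

import numpy as np

# acquisition values of three rectangle centers; lower fc is more promising
fcs = np.array([-0.9, -0.3, -0.1])
centers = ["c0", "c1", "c2"]

weights = -fcs / (-fcs).sum()  # flip signs and normalize into a distribution

sample = np.random.rand()
stick = 0.0
for weight, center in zip(weights, centers):
    stick += weight
    if stick > sample:  # first center whose cumulative weight passes the draw wins
        print("sampled center:", center)
        break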
22 changes: 13 additions & 9 deletions pkg/suggestion/bayesianoptimization/src/model/gp.py
@@ -5,7 +5,8 @@

class GaussianProcessModel:
    """ use the gaussian process as a prior """
-    def __init__(self, length_scale, noise, nu, kernel_type):
+    def __init__(self, length_scale=0.5, noise=0.00005,
+                 nu=1.5, kernel_type="matern"):
        """
        :param length_scale: the larger the length_scale is, the smoother the gaussian prior is. If a float,
        an isotropic kernel is used. If an array, an anisotropic kernel is used where each dimension of it defines
@@ -15,20 +16,23 @@ def __init__(self, length_scale, noise, nu, kernel_type):
        approximate function is.
        :param kernel_type: "rbf": squared exponential kernel, "matern": Matern kernel.
        """
+
+        length_scale = length_scale or 0.5
+        noise = noise or 0.00005
+        nu = nu or 1.5
+        kernel_type = kernel_type or "matern"
+
        if kernel_type == "rbf":
            kernel = RBF(length_scale=length_scale)
-        else:
+        elif kernel_type == "matern":
            kernel = Matern(length_scale=length_scale, nu=nu)
-
+        else:
+            raise Exception("kernel_type must be 'rbf' or 'matern'")
        self.gp = GaussianProcessRegressor(
            kernel=kernel,
            alpha=noise,
            random_state=0,
            optimizer=None,
        )

    def fit(self, X_train, y_train):
        self.gp.fit(X_train, y_train)

    def predict(self, X_test):
        y_mean, y_std = self.gp.predict(X_test, return_std=True)
        y_variance = y_std ** 2
        return y_mean, y_std, y_variance
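
A usage sketch for the reworked model (not part of the commit). With optimizer=None the kernel hyperparameters stay fixed at the constructor arguments instead of being refit during fit():

import numpy as np

from pkg.suggestion.bayesianoptimization.src.model.gp import GaussianProcessModel

X_train = np.random.uniform(0.0, 1.0, size=(20, 2))
y_train = np.sin(X_train).sum(axis=1)

model = GaussianProcessModel(length_scale=0.5, noise=0.00005, nu=1.5, kernel_type="matern")
model.fit(X_train, y_train)
y_mean, y_std, y_variance = model.predict(np.random.uniform(0.0, 1.0, size=(5, 2)))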
19 changes: 16 additions & 3 deletions pkg/suggestion/bayesianoptimization/src/model/rf.py
@@ -1,11 +1,24 @@
import numpy as np
import forestci as fci
from sklearn.ensemble import RandomForestRegressor


class RandomForestModel:
-    def __init__(self, n_estimators, max_features):
+    def __init__(self, n_estimators=50, max_features="auto"):
+        n_estimators = n_estimators or 50
+        max_features = max_features or "auto"
        self.rf = RandomForestRegressor(
            n_estimators=n_estimators,
            max_features=max_features,
        )
+        self.X_train = None

    def fit(self, X_train, y_train):
        print(X_train.shape, y_train.shape)
+        self.X_train = X_train
        self.rf.fit(X_train, y_train)

    def predict(self, X_test):
        y_mean = self.rf.predict(X_test)
        y_variance = fci.random_forest_error(self.rf, self.X_train, X_test)
        y_std = np.sqrt(y_variance)
        return y_mean, y_std, y_variance
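
The random-forest surrogate exposes the same predict contract (sketch, not part of the commit). The training set is kept on the instance because forestci's random_forest_error needs it to estimate the predictive variance:

import numpy as np

from pkg.suggestion.bayesianoptimization.src.model.rf import RandomForestModel

X_train = np.random.uniform(0.0, 1.0, size=(50, 3))
y_train = X_train.sum(axis=1)

model = RandomForestModel(n_estimators=50, max_features="auto")
model.fit(X_train, y_train)
y_mean, y_std, y_variance = model.predict(np.random.uniform(0.0, 1.0, size=(5, 3)))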
17 changes: 17 additions & 0 deletions pkg/suggestion/bayesianoptimization/src/utils.py
@@ -0,0 +1,17 @@
import os
import logging
from logging import getLogger, StreamHandler


FORMAT = '%(asctime)-15s StudyID %(studyid)s %(message)s'
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")


def get_logger(name=__name__):
    logger = getLogger(name)
    logging.basicConfig(format=FORMAT)
    handler = StreamHandler()
    logger.setLevel(LOG_LEVEL)
    logger.addHandler(handler)
    logger.propagate = False
    return logger
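
Usage as seen elsewhere in this commit (sketch; the logged values are placeholders): callers fetch a configured logger and attach the study ID via extra, and the LOG_LEVEL environment variable (e.g. DEBUG) controls verbosity:

from pkg.suggestion.bayesianoptimization.src.utils import get_logger

logger = get_logger(__name__)
logger.debug("mapping: %r", {"learning-rate": 0.01}, extra={"StudyID": "example-study-id"})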