Skip to content

Commit

Permalink
support param map for lightGBM (intel-analytics#5712)
Browse files Browse the repository at this point in the history
  • Loading branch information
songhappy authored and ForJadeForest committed Sep 20, 2022
1 parent c2ecf64 commit cb49fd7
Show file tree
Hide file tree
Showing 3 changed files with 275 additions and 25 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,7 @@ import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMClassificationModel => M
import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMClassifier => MLightGBMClassifier}
import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRegressionModel => MLightGBMRegressionModel}
import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRegressor => MLightGBMRegressor}
import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRankerModel => MLightGBMRankerModel}
import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRanker => MLightGBMRanker}
import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMBase => MLightGBMBase}
import com.microsoft.azure.synapse.ml.lightgbm.params.{LightGBMParams => MLightGBMParams}
import org.apache.spark.ml.Model
import org.apache.spark.ml.param.{ParamMap, Params}
import org.apache.spark.ml.util.Identifiable


class XGBClassifier (val xgboostParams: Map[String, Any] = Map()) {
val sc = SparkSession.active.sparkContext
Expand Down Expand Up @@ -428,14 +422,20 @@ object XGBRegressorModel {

/**
* [[lightGBMClassifier wrapper]]
* @param lgbmParams, a map of parameters, currently supported 18 params to be set from lgbmParams:
"boostingType", "numLeaves", "maxDepth", "learningRate", "numIterations",
"binConstructSampleCnt", "objective", "minSplitGain", "minSumHessianInLeaf",
"minDataInLeaf", "baggingFraction", "baggingFreq", "featureFraction",
"lambdaL1", "lambdaL2", "numThreads", "earlyStoppingRound", "maxBin".
*/
class LightGBMClassifier {
class LightGBMClassifier (val lgbmParams: Map[String, Any] = Map()) {

val sc = SparkSession.active.sparkContext
sc.getConf.set("spark.task.cpus", Engine.coreNumber().toString)

val estimator = new MLightGBMClassifier()
private val estimator = new MLightGBMClassifier()
estimator.setNumThreads(Engine.coreNumber())
TreeModelUtils.setParams(estimator, lgbmParams)

def setLabelCol(labelColName : String) : this.type = {
estimator.setLabelCol(labelColName)
Expand Down Expand Up @@ -573,11 +573,11 @@ class LightGBMClassifier {
* @param model trained MLightGBMClassificationModel to use in prediction.
*/
class LightGBMClassifierModel private[bigdl](val model: MLightGBMClassificationModel) {
private var featuresCols: String = "features"
private var featuresCol: String = "features"
private var predictionCol: String = "prediction"

def setFeaturesCol(featuresColName: String): this.type = {
featuresCols = featuresColName
featuresCol = featuresColName
this
}

Expand All @@ -587,8 +587,8 @@ class LightGBMClassifierModel private[bigdl](val model: MLightGBMClassificationM
}

def transform(dataset: DataFrame): DataFrame = {
Log4Error.invalidInputError(featuresCols!=None, "Please set feature columns before transform")
model.setFeaturesCol(featuresCols)
Log4Error.invalidInputError(featuresCol!=None, "Please set feature columns before transform")
model.setFeaturesCol(featuresCol)
var output = model.transform(dataset)
if(predictionCol != null) {
output = output.withColumnRenamed("prediction", predictionCol)
Expand All @@ -609,14 +609,20 @@ object LightGBMClassifierModel {

/**
* [[LightGBMRegressor]] lightGBM wrapper of LightGBMRegressor.
* @param lgbmParams, a map of parameters, currently supported 18 params to be set from lgbmParams:
"boostingType", "numLeaves", "maxDepth", "learningRate", "numIterations",
"binConstructSampleCnt", "objective", "minSplitGain", "minSumHessianInLeaf",
"minDataInLeaf", "baggingFraction", "baggingFreq", "featureFraction",
"lambdaL1", "lambdaL2", "numThreads", "earlyStoppingRound", "maxBin".
*/
class LightGBMRegressor {
class LightGBMRegressor (val lgbmParams: Map[String, Any] = Map()) {

val sc = SparkSession.active.sparkContext
sc.getConf.set("spark.task.cpus", Engine.coreNumber().toString)

private val estimator = new MLightGBMRegressor()
estimator.setNumThreads(Engine.coreNumber())
TreeModelUtils.setParams(estimator, lgbmParams)

def setAlpha(value: Double): this.type = {
estimator.setAlpha(value)
Expand Down Expand Up @@ -752,7 +758,6 @@ class LightGBMRegressor {
class LightGBMRegressorModel private[bigdl](val model: MLightGBMRegressionModel) {
var predictionCol: String = null
var featuresCol: String = "features"
var featurearray: Array[String] = Array("features")
def setPredictionCol(value: String): this.type = {
predictionCol = value
this
Expand All @@ -765,16 +770,9 @@ class LightGBMRegressorModel private[bigdl](val model: MLightGBMRegressionModel)
}

def transform(dataset: DataFrame): DataFrame = {
val featureVectorAssembler = new VectorAssembler()
.setInputCols(featurearray)
.setOutputCol("featureAssembledVector")
val assembledDF = featureVectorAssembler.transform(dataset)
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.functions.{col, udf}
val asDense = udf((v: Vector) => v.toDense)
val xgbInput = assembledDF.withColumn("DenseFeatures", asDense(col("featureAssembledVector")))
model.setFeaturesCol("DenseFeatures")
var output = model.transform(xgbInput).drop("DenseFeatures", "featureAssembledVector")
Log4Error.invalidInputError(featuresCol!=None, "Please set feature columns before transform")
model.setFeaturesCol(featuresCol)
var output = model.transform(dataset)
if(predictionCol != null) {
output = output.withColumnRenamed("prediction", predictionCol)
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* Copyright 2016 The BigDL Authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.intel.analytics.bigdl.dllib.nnframes
import com.intel.analytics.bigdl.dllib.utils.Log4Error
import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMClassifier => MLightGBMClassifier}
import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRegressor => MLightGBMRegressor}


object TreeModelUtils {

def convert2CamelCase(pythonStyle: String): String = {
val data = pythonStyle.split("_")
data(0) + data.slice(1, data.size)
.map(x => x.substring(0, 1).toUpperCase() + x.substring(1)).mkString("")
}

def setParams(lgbmEstimator: Any, lgbmParams: Map[String, Any]): Unit = {

val estimator = if (lgbmEstimator.isInstanceOf[MLightGBMClassifier]) {
lgbmEstimator.asInstanceOf[MLightGBMClassifier]
}
else {
lgbmEstimator.asInstanceOf[MLightGBMRegressor]
}

lgbmParams.foreach(kv => kv._1 match {
case "boostingType" => estimator.setBoostingType(kv._2.asInstanceOf[String])
case "numLeaves" => estimator.setNumLeaves(kv._2.asInstanceOf[Int])
case "maxDepth" => estimator.setMaxDepth(kv._2.asInstanceOf[Int])
case "learningRate" => estimator.setLearningRate(kv._2.asInstanceOf[Double])
case "numIterations" => estimator.setNumIterations(kv._2.asInstanceOf[Int])
case "binConstructSampleCnt" => estimator.setBinSampleCount(kv._2.asInstanceOf[Int])
case "objective" => estimator.setObjective(kv._2.asInstanceOf[String])
case "minSplitGain" => estimator.setMinGainToSplit(kv._2.asInstanceOf[Double])
case "minSumHessianInLeaf" => estimator.setMinSumHessianInLeaf(kv._2.asInstanceOf[Double])
case "minDataInLeaf" => estimator.setMinDataInLeaf(kv._2.asInstanceOf[Int])
case "baggingFraction" => estimator.setBaggingFraction(kv._2.asInstanceOf[Double])
case "baggingFreq" => estimator.setBaggingFreq(kv._2.asInstanceOf[Int])
case "featureFraction" => estimator.setFeatureFraction(kv._2.asInstanceOf[Double])
case "lambdaL1" => estimator.setLambdaL1(kv._2.asInstanceOf[Double])
case "lambdaL2" => estimator.setLambdaL2(kv._2.asInstanceOf[Double])
case "numThreads" => estimator.setNumThreads(kv._2.asInstanceOf[Int])
case "earlyStoppingRound" => estimator.setEarlyStoppingRound(kv._2.asInstanceOf[Int])
case "maxBin" => estimator.setMaxBin(kv._2.asInstanceOf[Int])
case _ =>
Log4Error.invalidInputError(false,
s"LightGBM setParams: key ${ kv._1} is not supported by lgbmParams map",
s"try to set this parameter by calling .set${kv._1}")
})

}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.VectorAssembler
import org.apache.spark.sql.{SQLContext, SparkSession}
import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMClassifier => MLightGBMClassifier}
import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRegressor => MLightGBMRegressor}
import org.apache.spark.SparkConf


Expand All @@ -45,6 +46,7 @@ class LightGBMTrainSpec extends ZooSpecHelper {
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
Engine.init

val df = Seq(
(1.0, 2.0, 3.0, 4.0, 1),
(1.0, 3.0, 8.0, 2.0, 0)
Expand All @@ -63,6 +65,55 @@ class LightGBMTrainSpec extends ZooSpecHelper {
}
}

"LightGBMClassifer train with params" should "work" in {
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
Engine.init

val df = Seq(
(1.0, 2.0, 3.0, 4.0, 1),
(1.0, 3.0, 8.0, 2.0, 0),
(1.0, 2.0, 3.0, 4.0, 1),
(1.0, 3.0, 8.0, 2.0, 0),
(1.0, 2.0, 3.0, 4.0, 1),
(1.0, 3.0, 8.0, 2.0, 0),
(1.0, 2.0, 3.0, 4.0, 1),
(1.0, 3.0, 8.0, 2.0, 0),
(1.0, 2.0, 3.0, 4.0, 1),
(1.0, 3.0, 8.0, 2.0, 0)
).toDF("f1", "f2", "f3", "f4", "label")
val vectorAssembler = new VectorAssembler()
.setInputCols(Array("f1", "f2", "f3", "f4"))
.setOutputCol("features")
val assembledDf = vectorAssembler.transform(df).select("features", "label").cache()
if (spark.version.substring(0, 3).toDouble >= 3.1) {
val params = Map(
"boostingType" -> "gbdt",
"numLeaves" -> 2,
"maxDepth" -> 2,
"learningRate" -> 0.3,
"numIterations" -> 10,
"binConstructSampleCnt" -> 5,
"objective" -> "binary",
"minSplitGain" -> 0.1,
"minSumHessianInLeaf" -> 0.01,
"minDataInLeaf" -> 1,
"baggingFraction" -> 0.4,
"baggingFreq" -> 1,
"featureFraction" -> 0.4,
"lambdaL1" -> 0.1,
"lambdaL2" -> 0.1,
"numThreads" -> 2,
"earlyStoppingRound" -> 10,
"maxBin" -> 100)
val lightGBMclassifier = new LightGBMClassifier(params)
lightGBMclassifier.setIsUnbalance(true)
val model1 = lightGBMclassifier.fit(assembledDf)
val res1 = model1.transform(assembledDf)
TestUtils.conditionFailTest(res1.count() == 10)
}
}

"LightGBMClassifer save" should "work" in {
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
Expand Down Expand Up @@ -110,5 +161,140 @@ class LightGBMTrainSpec extends ZooSpecHelper {
TestUtils.conditionFailTest(y0_0.count() == 4)
}
}

"LightGBMRegressor train with params" should "work" in {
val spark = SparkSession.builder().getOrCreate()
import spark.implicits._
Engine.init
val df = Seq(
(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 1.0f, 2.0f, 4.0f, 8.0f, 3.0f, 116.3668f),
(1.0f, 3.0f, 8.0f, 6.0f, 5.0f, 9.0f, 5.0f, 6.0f, 7.0f, 4.0f, 116.367f),
(2.0f, 1.0f, 5.0f, 7.0f, 6.0f, 7.0f, 4.0f, 1.0f, 2.0f, 3.0f, 116.367f),
(2.0f, 1.0f, 4.0f, 3.0f, 6.0f, 1.0f, 3.0f, 2.0f, 1.0f, 3.0f, 116.3668f)
).toDF("f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "label")
val vectorAssembler = new VectorAssembler()
.setInputCols(Array("f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10"))
.setOutputCol("features")
val assembledDf = vectorAssembler.transform(df).select("features", "label").cache()
if (spark.version.substring(0, 3).toDouble >= 3.1) {
val params = Map(
"boostingType" -> "dart",
"numLeaves" -> 2,
"maxDepth" -> 2,
"learningRate" -> 0.3,
"numIterations" -> 10,
"binConstructSampleCnt" -> 5,
"objective" -> "huber",
"minSplitGain" -> 0.1,
"minSumHessianInLeaf" -> 0.01,
"minDataInLeaf" -> 1,
"baggingFraction" -> 0.4,
"baggingFreq" -> 1,
"featureFraction" -> 0.4,
"lambdaL1" -> 0.1,
"lambdaL2" -> 0.1,
"numThreads" -> 2,
"earlyStoppingRound" -> 10,
"maxBin" -> 100)
val lightGBMRegressor = new LightGBMRegressor(params)
val regressorModel0 = lightGBMRegressor.fit(assembledDf)
val y0 = regressorModel0.transform(assembledDf)
regressorModel0.saveNativeModel("/tmp/test")
val model = LightGBMRegressorModel.loadNativeModel("/tmp/test")
val y0_0 = model.transform(assembledDf)
y0_0.show(10)
TestUtils.conditionFailTest(y0.count() == 4)
TestUtils.conditionFailTest(y0_0.count() == 4)
}
}

"setParams for LightGBMClassifer " should "work" in {
Engine.init
val spark = SparkSession.builder().getOrCreate()
if (spark.version.substring(0, 3).toDouble >= 3.1) {
val params = Map(
"boostingType" -> "dart",
"numLeaves" -> 2,
"maxDepth" -> 2,
"learningRate" -> 0.3,
"numIterations" -> 10,
"binConstructSampleCnt" -> 5,
"objective" -> "huber",
"minSplitGain" -> 0.1,
"minSumHessianInLeaf" -> 0.01,
"minDataInLeaf" -> 1,
"baggingFraction" -> 0.4,
"baggingFreq" -> 1,
"featureFraction" -> 0.4,
"lambdaL1" -> 0.1,
"lambdaL2" -> 0.1,
"numThreads" -> 2,
"earlyStoppingRound" -> 10,
"maxBin" -> 100)
val mclassifier = new MLightGBMClassifier()
TreeModelUtils.setParams(mclassifier, params)
TestUtils.conditionFailTest(mclassifier.getEarlyStoppingRound == 10)
TestUtils.conditionFailTest(mclassifier.getMaxBin == 100)
}
}

"setParams for LightGBMRegressor" should "work" in {
Engine.init
val spark = SparkSession.builder().getOrCreate()
if (spark.version.substring(0, 3).toDouble >= 3.1) {
val params = Map(
"boostingType" -> "dart",
"numLeaves" -> 2,
"maxDepth" -> 2,
"learningRate" -> 0.3,
"numIterations" -> 10,
"binConstructSampleCnt" -> 5,
"objective" -> "huber",
"minSplitGain" -> 0.1,
"minSumHessianInLeaf" -> 0.01,
"minDataInLeaf" -> 1,
"baggingFraction" -> 0.4,
"baggingFreq" -> 1,
"featureFraction" -> 0.4,
"lambdaL1" -> 0.1,
"lambdaL2" -> 0.1,
"numThreads" -> 2,
"earlyStoppingRound" -> 10,
"maxBin" -> 100)
val mclassifier = new MLightGBMRegressor()
TreeModelUtils.setParams(mclassifier, params)
TestUtils.conditionFailTest(mclassifier.getEarlyStoppingRound == 10)
TestUtils.conditionFailTest(mclassifier.getMaxBin == 100)
}
}

"convertToCamelCase" should "work" in {
val paramsMap = Map(
"boosting_type" -> "boostingType",
"num_leaves" -> "numLeaves",
"max_depth" -> "maxDepth",
"learning_rate" -> "learningRate",
"num_iterations" -> "numIterations",
"bin_construct_sample_cnt" -> "binConstructSampleCnt",
"objective" -> "objective",
"min_split_gain" -> "minSplitGain",
"min_sum_hessian_in_leaf" -> "minSumHessianInLeaf",
"min_data_in_leaf" -> "minDataInLeaf",
"bagging_fraction" -> "baggingFraction",
"bagging_freq" -> "baggingFreq",
"feature_fraction" -> "featureFraction",
"lambda_l1" -> "lambdaL1",
"lambda_l2" -> "lambdaL2",
"num_threads" -> "numThreads",
"early_stopping_round" -> "earlyStoppingRound",
"max_bin" -> "maxBin",
"max_bin_by_feature" -> "maxBinByFeature")

paramsMap.foreach(kv => {
println(kv._1, TreeModelUtils.convert2CamelCase(kv._1))
TestUtils.conditionFailTest(kv._2 == TreeModelUtils.convert2CamelCase(kv._1))
})
}

}

0 comments on commit cb49fd7

Please sign in to comment.