From cb49fd7e1855f3629594be34aa57225830179ffe Mon Sep 17 00:00:00 2001 From: Guoqiong Song Date: Wed, 14 Sep 2022 09:56:08 -0700 Subject: [PATCH] support param map for lightGBM (#5712) --- .../bigdl/dllib/nnframes/TreeModel.scala | 48 +++-- .../bigdl/dllib/nnframes/TreeModelUtils.scala | 66 +++++++ .../dllib/nnframes/LightGBMTrainSpec.scala | 186 ++++++++++++++++++ 3 files changed, 275 insertions(+), 25 deletions(-) create mode 100644 scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModelUtils.scala diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModel.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModel.scala index bcd6a25e964..c245c021263 100644 --- a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModel.scala +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModel.scala @@ -25,13 +25,7 @@ import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMClassificationModel => M import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMClassifier => MLightGBMClassifier} import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRegressionModel => MLightGBMRegressionModel} import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRegressor => MLightGBMRegressor} -import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRankerModel => MLightGBMRankerModel} -import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRanker => MLightGBMRanker} -import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMBase => MLightGBMBase} -import com.microsoft.azure.synapse.ml.lightgbm.params.{LightGBMParams => MLightGBMParams} -import org.apache.spark.ml.Model -import org.apache.spark.ml.param.{ParamMap, Params} -import org.apache.spark.ml.util.Identifiable + class XGBClassifier (val xgboostParams: Map[String, Any] = Map()) { val sc = SparkSession.active.sparkContext @@ -428,14 +422,20 @@ object XGBRegressorModel { /** * [[lightGBMClassifier wrapper]] + * @param lgbmParams, a map of parameters, currently supported 18 params to be set from lgbmParams: + "boostingType", "numLeaves", "maxDepth", "learningRate", "numIterations", + "binConstructSampleCnt", "objective", "minSplitGain", "minSumHessianInLeaf", + "minDataInLeaf", "baggingFraction", "baggingFreq", "featureFraction", + "lambdaL1", "lambdaL2", "numThreads", "earlyStoppingRound", "maxBin". */ -class LightGBMClassifier { +class LightGBMClassifier (val lgbmParams: Map[String, Any] = Map()) { val sc = SparkSession.active.sparkContext sc.getConf.set("spark.task.cpus", Engine.coreNumber().toString) - val estimator = new MLightGBMClassifier() + private val estimator = new MLightGBMClassifier() estimator.setNumThreads(Engine.coreNumber()) + TreeModelUtils.setParams(estimator, lgbmParams) def setLabelCol(labelColName : String) : this.type = { estimator.setLabelCol(labelColName) @@ -573,11 +573,11 @@ class LightGBMClassifier { * @param model trained MLightGBMClassificationModel to use in prediction. */ class LightGBMClassifierModel private[bigdl](val model: MLightGBMClassificationModel) { - private var featuresCols: String = "features" + private var featuresCol: String = "features" private var predictionCol: String = "prediction" def setFeaturesCol(featuresColName: String): this.type = { - featuresCols = featuresColName + featuresCol = featuresColName this } @@ -587,8 +587,8 @@ class LightGBMClassifierModel private[bigdl](val model: MLightGBMClassificationM } def transform(dataset: DataFrame): DataFrame = { - Log4Error.invalidInputError(featuresCols!=None, "Please set feature columns before transform") - model.setFeaturesCol(featuresCols) + Log4Error.invalidInputError(featuresCol!=None, "Please set feature columns before transform") + model.setFeaturesCol(featuresCol) var output = model.transform(dataset) if(predictionCol != null) { output = output.withColumnRenamed("prediction", predictionCol) @@ -609,14 +609,20 @@ object LightGBMClassifierModel { /** * [[LightGBMRegressor]] lightGBM wrapper of LightGBMRegressor. + * @param lgbmParams, a map of parameters, currently supported 18 params to be set from lgbmParams: + "boostingType", "numLeaves", "maxDepth", "learningRate", "numIterations", + "binConstructSampleCnt", "objective", "minSplitGain", "minSumHessianInLeaf", + "minDataInLeaf", "baggingFraction", "baggingFreq", "featureFraction", + "lambdaL1", "lambdaL2", "numThreads", "earlyStoppingRound", "maxBin". */ -class LightGBMRegressor { +class LightGBMRegressor (val lgbmParams: Map[String, Any] = Map()) { val sc = SparkSession.active.sparkContext sc.getConf.set("spark.task.cpus", Engine.coreNumber().toString) private val estimator = new MLightGBMRegressor() estimator.setNumThreads(Engine.coreNumber()) + TreeModelUtils.setParams(estimator, lgbmParams) def setAlpha(value: Double): this.type = { estimator.setAlpha(value) @@ -752,7 +758,6 @@ class LightGBMRegressor { class LightGBMRegressorModel private[bigdl](val model: MLightGBMRegressionModel) { var predictionCol: String = null var featuresCol: String = "features" - var featurearray: Array[String] = Array("features") def setPredictionCol(value: String): this.type = { predictionCol = value this @@ -765,16 +770,9 @@ class LightGBMRegressorModel private[bigdl](val model: MLightGBMRegressionModel) } def transform(dataset: DataFrame): DataFrame = { - val featureVectorAssembler = new VectorAssembler() - .setInputCols(featurearray) - .setOutputCol("featureAssembledVector") - val assembledDF = featureVectorAssembler.transform(dataset) - import org.apache.spark.ml.linalg.Vector - import org.apache.spark.sql.functions.{col, udf} - val asDense = udf((v: Vector) => v.toDense) - val xgbInput = assembledDF.withColumn("DenseFeatures", asDense(col("featureAssembledVector"))) - model.setFeaturesCol("DenseFeatures") - var output = model.transform(xgbInput).drop("DenseFeatures", "featureAssembledVector") + Log4Error.invalidInputError(featuresCol!=None, "Please set feature columns before transform") + model.setFeaturesCol(featuresCol) + var output = model.transform(dataset) if(predictionCol != null) { output = output.withColumnRenamed("prediction", predictionCol) } diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModelUtils.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModelUtils.scala new file mode 100644 index 00000000000..f2e672bf64e --- /dev/null +++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModelUtils.scala @@ -0,0 +1,66 @@ +/* + * Copyright 2016 The BigDL Authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.intel.analytics.bigdl.dllib.nnframes +import com.intel.analytics.bigdl.dllib.utils.Log4Error +import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMClassifier => MLightGBMClassifier} +import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRegressor => MLightGBMRegressor} + + +object TreeModelUtils { + + def convert2CamelCase(pythonStyle: String): String = { + val data = pythonStyle.split("_") + data(0) + data.slice(1, data.size) + .map(x => x.substring(0, 1).toUpperCase() + x.substring(1)).mkString("") + } + + def setParams(lgbmEstimator: Any, lgbmParams: Map[String, Any]): Unit = { + + val estimator = if (lgbmEstimator.isInstanceOf[MLightGBMClassifier]) { + lgbmEstimator.asInstanceOf[MLightGBMClassifier] + } + else { + lgbmEstimator.asInstanceOf[MLightGBMRegressor] + } + + lgbmParams.foreach(kv => kv._1 match { + case "boostingType" => estimator.setBoostingType(kv._2.asInstanceOf[String]) + case "numLeaves" => estimator.setNumLeaves(kv._2.asInstanceOf[Int]) + case "maxDepth" => estimator.setMaxDepth(kv._2.asInstanceOf[Int]) + case "learningRate" => estimator.setLearningRate(kv._2.asInstanceOf[Double]) + case "numIterations" => estimator.setNumIterations(kv._2.asInstanceOf[Int]) + case "binConstructSampleCnt" => estimator.setBinSampleCount(kv._2.asInstanceOf[Int]) + case "objective" => estimator.setObjective(kv._2.asInstanceOf[String]) + case "minSplitGain" => estimator.setMinGainToSplit(kv._2.asInstanceOf[Double]) + case "minSumHessianInLeaf" => estimator.setMinSumHessianInLeaf(kv._2.asInstanceOf[Double]) + case "minDataInLeaf" => estimator.setMinDataInLeaf(kv._2.asInstanceOf[Int]) + case "baggingFraction" => estimator.setBaggingFraction(kv._2.asInstanceOf[Double]) + case "baggingFreq" => estimator.setBaggingFreq(kv._2.asInstanceOf[Int]) + case "featureFraction" => estimator.setFeatureFraction(kv._2.asInstanceOf[Double]) + case "lambdaL1" => estimator.setLambdaL1(kv._2.asInstanceOf[Double]) + case "lambdaL2" => estimator.setLambdaL2(kv._2.asInstanceOf[Double]) + case "numThreads" => estimator.setNumThreads(kv._2.asInstanceOf[Int]) + case "earlyStoppingRound" => estimator.setEarlyStoppingRound(kv._2.asInstanceOf[Int]) + case "maxBin" => estimator.setMaxBin(kv._2.asInstanceOf[Int]) + case _ => + Log4Error.invalidInputError(false, + s"LightGBM setParams: key ${ kv._1} is not supported by lgbmParams map", + s"try to set this parameter by calling .set${kv._1}") + }) + + } +} diff --git a/scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/nnframes/LightGBMTrainSpec.scala b/scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/nnframes/LightGBMTrainSpec.scala index aaf3ab0a047..20aa383dd4b 100644 --- a/scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/nnframes/LightGBMTrainSpec.scala +++ b/scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/nnframes/LightGBMTrainSpec.scala @@ -22,6 +22,7 @@ import org.apache.spark.SparkContext import org.apache.spark.ml.feature.VectorAssembler import org.apache.spark.sql.{SQLContext, SparkSession} import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMClassifier => MLightGBMClassifier} +import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRegressor => MLightGBMRegressor} import org.apache.spark.SparkConf @@ -45,6 +46,7 @@ class LightGBMTrainSpec extends ZooSpecHelper { val spark = SparkSession.builder().getOrCreate() import spark.implicits._ Engine.init + val df = Seq( (1.0, 2.0, 3.0, 4.0, 1), (1.0, 3.0, 8.0, 2.0, 0) @@ -63,6 +65,55 @@ class LightGBMTrainSpec extends ZooSpecHelper { } } + "LightGBMClassifer train with params" should "work" in { + val spark = SparkSession.builder().getOrCreate() + import spark.implicits._ + Engine.init + + val df = Seq( + (1.0, 2.0, 3.0, 4.0, 1), + (1.0, 3.0, 8.0, 2.0, 0), + (1.0, 2.0, 3.0, 4.0, 1), + (1.0, 3.0, 8.0, 2.0, 0), + (1.0, 2.0, 3.0, 4.0, 1), + (1.0, 3.0, 8.0, 2.0, 0), + (1.0, 2.0, 3.0, 4.0, 1), + (1.0, 3.0, 8.0, 2.0, 0), + (1.0, 2.0, 3.0, 4.0, 1), + (1.0, 3.0, 8.0, 2.0, 0) + ).toDF("f1", "f2", "f3", "f4", "label") + val vectorAssembler = new VectorAssembler() + .setInputCols(Array("f1", "f2", "f3", "f4")) + .setOutputCol("features") + val assembledDf = vectorAssembler.transform(df).select("features", "label").cache() + if (spark.version.substring(0, 3).toDouble >= 3.1) { + val params = Map( + "boostingType" -> "gbdt", + "numLeaves" -> 2, + "maxDepth" -> 2, + "learningRate" -> 0.3, + "numIterations" -> 10, + "binConstructSampleCnt" -> 5, + "objective" -> "binary", + "minSplitGain" -> 0.1, + "minSumHessianInLeaf" -> 0.01, + "minDataInLeaf" -> 1, + "baggingFraction" -> 0.4, + "baggingFreq" -> 1, + "featureFraction" -> 0.4, + "lambdaL1" -> 0.1, + "lambdaL2" -> 0.1, + "numThreads" -> 2, + "earlyStoppingRound" -> 10, + "maxBin" -> 100) + val lightGBMclassifier = new LightGBMClassifier(params) + lightGBMclassifier.setIsUnbalance(true) + val model1 = lightGBMclassifier.fit(assembledDf) + val res1 = model1.transform(assembledDf) + TestUtils.conditionFailTest(res1.count() == 10) + } + } + "LightGBMClassifer save" should "work" in { val spark = SparkSession.builder().getOrCreate() import spark.implicits._ @@ -110,5 +161,140 @@ class LightGBMTrainSpec extends ZooSpecHelper { TestUtils.conditionFailTest(y0_0.count() == 4) } } + + "LightGBMRegressor train with params" should "work" in { + val spark = SparkSession.builder().getOrCreate() + import spark.implicits._ + Engine.init + val df = Seq( + (1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 1.0f, 2.0f, 4.0f, 8.0f, 3.0f, 116.3668f), + (1.0f, 3.0f, 8.0f, 6.0f, 5.0f, 9.0f, 5.0f, 6.0f, 7.0f, 4.0f, 116.367f), + (2.0f, 1.0f, 5.0f, 7.0f, 6.0f, 7.0f, 4.0f, 1.0f, 2.0f, 3.0f, 116.367f), + (2.0f, 1.0f, 4.0f, 3.0f, 6.0f, 1.0f, 3.0f, 2.0f, 1.0f, 3.0f, 116.3668f) + ).toDF("f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "label") + val vectorAssembler = new VectorAssembler() + .setInputCols(Array("f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10")) + .setOutputCol("features") + val assembledDf = vectorAssembler.transform(df).select("features", "label").cache() + if (spark.version.substring(0, 3).toDouble >= 3.1) { + val params = Map( + "boostingType" -> "dart", + "numLeaves" -> 2, + "maxDepth" -> 2, + "learningRate" -> 0.3, + "numIterations" -> 10, + "binConstructSampleCnt" -> 5, + "objective" -> "huber", + "minSplitGain" -> 0.1, + "minSumHessianInLeaf" -> 0.01, + "minDataInLeaf" -> 1, + "baggingFraction" -> 0.4, + "baggingFreq" -> 1, + "featureFraction" -> 0.4, + "lambdaL1" -> 0.1, + "lambdaL2" -> 0.1, + "numThreads" -> 2, + "earlyStoppingRound" -> 10, + "maxBin" -> 100) + val lightGBMRegressor = new LightGBMRegressor(params) + val regressorModel0 = lightGBMRegressor.fit(assembledDf) + val y0 = regressorModel0.transform(assembledDf) + regressorModel0.saveNativeModel("/tmp/test") + val model = LightGBMRegressorModel.loadNativeModel("/tmp/test") + val y0_0 = model.transform(assembledDf) + y0_0.show(10) + TestUtils.conditionFailTest(y0.count() == 4) + TestUtils.conditionFailTest(y0_0.count() == 4) + } + } + + "setParams for LightGBMClassifer " should "work" in { + Engine.init + val spark = SparkSession.builder().getOrCreate() + if (spark.version.substring(0, 3).toDouble >= 3.1) { + val params = Map( + "boostingType" -> "dart", + "numLeaves" -> 2, + "maxDepth" -> 2, + "learningRate" -> 0.3, + "numIterations" -> 10, + "binConstructSampleCnt" -> 5, + "objective" -> "huber", + "minSplitGain" -> 0.1, + "minSumHessianInLeaf" -> 0.01, + "minDataInLeaf" -> 1, + "baggingFraction" -> 0.4, + "baggingFreq" -> 1, + "featureFraction" -> 0.4, + "lambdaL1" -> 0.1, + "lambdaL2" -> 0.1, + "numThreads" -> 2, + "earlyStoppingRound" -> 10, + "maxBin" -> 100) + val mclassifier = new MLightGBMClassifier() + TreeModelUtils.setParams(mclassifier, params) + TestUtils.conditionFailTest(mclassifier.getEarlyStoppingRound == 10) + TestUtils.conditionFailTest(mclassifier.getMaxBin == 100) + } + } + + "setParams for LightGBMRegressor" should "work" in { + Engine.init + val spark = SparkSession.builder().getOrCreate() + if (spark.version.substring(0, 3).toDouble >= 3.1) { + val params = Map( + "boostingType" -> "dart", + "numLeaves" -> 2, + "maxDepth" -> 2, + "learningRate" -> 0.3, + "numIterations" -> 10, + "binConstructSampleCnt" -> 5, + "objective" -> "huber", + "minSplitGain" -> 0.1, + "minSumHessianInLeaf" -> 0.01, + "minDataInLeaf" -> 1, + "baggingFraction" -> 0.4, + "baggingFreq" -> 1, + "featureFraction" -> 0.4, + "lambdaL1" -> 0.1, + "lambdaL2" -> 0.1, + "numThreads" -> 2, + "earlyStoppingRound" -> 10, + "maxBin" -> 100) + val mclassifier = new MLightGBMRegressor() + TreeModelUtils.setParams(mclassifier, params) + TestUtils.conditionFailTest(mclassifier.getEarlyStoppingRound == 10) + TestUtils.conditionFailTest(mclassifier.getMaxBin == 100) + } + } + + "convertToCamelCase" should "work" in { + val paramsMap = Map( + "boosting_type" -> "boostingType", + "num_leaves" -> "numLeaves", + "max_depth" -> "maxDepth", + "learning_rate" -> "learningRate", + "num_iterations" -> "numIterations", + "bin_construct_sample_cnt" -> "binConstructSampleCnt", + "objective" -> "objective", + "min_split_gain" -> "minSplitGain", + "min_sum_hessian_in_leaf" -> "minSumHessianInLeaf", + "min_data_in_leaf" -> "minDataInLeaf", + "bagging_fraction" -> "baggingFraction", + "bagging_freq" -> "baggingFreq", + "feature_fraction" -> "featureFraction", + "lambda_l1" -> "lambdaL1", + "lambda_l2" -> "lambdaL2", + "num_threads" -> "numThreads", + "early_stopping_round" -> "earlyStoppingRound", + "max_bin" -> "maxBin", + "max_bin_by_feature" -> "maxBinByFeature") + + paramsMap.foreach(kv => { + println(kv._1, TreeModelUtils.convert2CamelCase(kv._1)) + TestUtils.conditionFailTest(kv._2 == TreeModelUtils.convert2CamelCase(kv._1)) + }) + } + }