From cb49fd7e1855f3629594be34aa57225830179ffe Mon Sep 17 00:00:00 2001
From: Guoqiong Song <guoqiongsong@gmail.com>
Date: Wed, 14 Sep 2022 09:56:08 -0700
Subject: [PATCH] support param map for lightGBM (#5712)

---
 .../bigdl/dllib/nnframes/TreeModel.scala      |  48 +++--
 .../bigdl/dllib/nnframes/TreeModelUtils.scala |  66 +++++++
 .../dllib/nnframes/LightGBMTrainSpec.scala    | 186 ++++++++++++++++++
 3 files changed, 275 insertions(+), 25 deletions(-)
 create mode 100644 scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModelUtils.scala

diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModel.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModel.scala
index bcd6a25e964..c245c021263 100644
--- a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModel.scala
+++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModel.scala
@@ -25,13 +25,7 @@ import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMClassificationModel => M
 import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMClassifier => MLightGBMClassifier}
 import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRegressionModel => MLightGBMRegressionModel}
 import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRegressor => MLightGBMRegressor}
-import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRankerModel => MLightGBMRankerModel}
-import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRanker => MLightGBMRanker}
-import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMBase => MLightGBMBase}
-import com.microsoft.azure.synapse.ml.lightgbm.params.{LightGBMParams => MLightGBMParams}
-import org.apache.spark.ml.Model
-import org.apache.spark.ml.param.{ParamMap, Params}
-import org.apache.spark.ml.util.Identifiable
+
 
 class XGBClassifier (val xgboostParams: Map[String, Any] = Map()) {
   val sc = SparkSession.active.sparkContext
@@ -428,14 +422,20 @@ object XGBRegressorModel {
 
 /**
  * [[lightGBMClassifier wrapper]]
+ * @param lgbmParams, a map of parameters, currently supported 18 params to be set from lgbmParams:
+        "boostingType", "numLeaves", "maxDepth", "learningRate", "numIterations",
+        "binConstructSampleCnt", "objective", "minSplitGain", "minSumHessianInLeaf",
+        "minDataInLeaf", "baggingFraction", "baggingFreq", "featureFraction",
+        "lambdaL1", "lambdaL2", "numThreads", "earlyStoppingRound", "maxBin".
  */
-class LightGBMClassifier {
+class LightGBMClassifier (val lgbmParams: Map[String, Any] = Map()) {
 
   val sc = SparkSession.active.sparkContext
   sc.getConf.set("spark.task.cpus", Engine.coreNumber().toString)
 
-  val estimator = new MLightGBMClassifier()
+  private val estimator = new MLightGBMClassifier()
   estimator.setNumThreads(Engine.coreNumber())
+  TreeModelUtils.setParams(estimator, lgbmParams)
 
   def setLabelCol(labelColName : String) : this.type = {
     estimator.setLabelCol(labelColName)
@@ -573,11 +573,11 @@ class LightGBMClassifier {
  * @param model trained MLightGBMClassificationModel to use in prediction.
  */
 class LightGBMClassifierModel private[bigdl](val model: MLightGBMClassificationModel) {
-  private var featuresCols: String = "features"
+  private var featuresCol: String = "features"
   private var predictionCol: String = "prediction"
 
   def setFeaturesCol(featuresColName: String): this.type = {
-    featuresCols = featuresColName
+    featuresCol = featuresColName
     this
   }
 
@@ -587,8 +587,8 @@ class LightGBMClassifierModel private[bigdl](val model: MLightGBMClassificationM
   }
 
   def transform(dataset: DataFrame): DataFrame = {
-    Log4Error.invalidInputError(featuresCols!=None, "Please set feature columns before transform")
-    model.setFeaturesCol(featuresCols)
+    Log4Error.invalidInputError(featuresCol!=None, "Please set feature columns before transform")
+    model.setFeaturesCol(featuresCol)
     var output = model.transform(dataset)
     if(predictionCol != null) {
       output = output.withColumnRenamed("prediction", predictionCol)
@@ -609,14 +609,20 @@ object LightGBMClassifierModel {
 
 /**
  * [[LightGBMRegressor]] lightGBM wrapper of LightGBMRegressor.
+ * @param lgbmParams, a map of parameters, currently supported 18 params to be set from lgbmParams:
+        "boostingType", "numLeaves", "maxDepth", "learningRate", "numIterations",
+        "binConstructSampleCnt", "objective", "minSplitGain", "minSumHessianInLeaf",
+        "minDataInLeaf", "baggingFraction", "baggingFreq", "featureFraction",
+        "lambdaL1", "lambdaL2", "numThreads", "earlyStoppingRound", "maxBin".
  */
-class LightGBMRegressor {
+class LightGBMRegressor (val lgbmParams: Map[String, Any] = Map()) {
 
   val sc = SparkSession.active.sparkContext
   sc.getConf.set("spark.task.cpus", Engine.coreNumber().toString)
 
   private val estimator = new MLightGBMRegressor()
   estimator.setNumThreads(Engine.coreNumber())
+  TreeModelUtils.setParams(estimator, lgbmParams)
 
   def setAlpha(value: Double): this.type = {
     estimator.setAlpha(value)
@@ -752,7 +758,6 @@ class LightGBMRegressor {
 class LightGBMRegressorModel private[bigdl](val model: MLightGBMRegressionModel) {
   var predictionCol: String = null
   var featuresCol: String = "features"
-  var featurearray: Array[String] = Array("features")
   def setPredictionCol(value: String): this.type = {
     predictionCol = value
     this
@@ -765,16 +770,9 @@ class LightGBMRegressorModel private[bigdl](val model: MLightGBMRegressionModel)
   }
 
   def transform(dataset: DataFrame): DataFrame = {
-    val featureVectorAssembler = new VectorAssembler()
-      .setInputCols(featurearray)
-      .setOutputCol("featureAssembledVector")
-    val assembledDF = featureVectorAssembler.transform(dataset)
-    import org.apache.spark.ml.linalg.Vector
-    import org.apache.spark.sql.functions.{col, udf}
-    val asDense = udf((v: Vector) => v.toDense)
-    val xgbInput = assembledDF.withColumn("DenseFeatures", asDense(col("featureAssembledVector")))
-    model.setFeaturesCol("DenseFeatures")
-    var output = model.transform(xgbInput).drop("DenseFeatures", "featureAssembledVector")
+    Log4Error.invalidInputError(featuresCol!=None, "Please set feature columns before transform")
+    model.setFeaturesCol(featuresCol)
+    var output = model.transform(dataset)
     if(predictionCol != null) {
       output = output.withColumnRenamed("prediction", predictionCol)
     }
diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModelUtils.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModelUtils.scala
new file mode 100644
index 00000000000..f2e672bf64e
--- /dev/null
+++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModelUtils.scala
@@ -0,0 +1,66 @@
+/*
+ * Copyright 2016 The BigDL Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.analytics.bigdl.dllib.nnframes
+import com.intel.analytics.bigdl.dllib.utils.Log4Error
+import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMClassifier => MLightGBMClassifier}
+import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRegressor => MLightGBMRegressor}
+
+
+object TreeModelUtils {
+
+  def convert2CamelCase(pythonStyle: String): String = {
+    val data = pythonStyle.split("_")
+    data(0) + data.slice(1, data.size)
+      .map(x => x.substring(0, 1).toUpperCase() + x.substring(1)).mkString("")
+  }
+
+  def setParams(lgbmEstimator: Any, lgbmParams: Map[String, Any]): Unit = {
+
+    val estimator = if (lgbmEstimator.isInstanceOf[MLightGBMClassifier]) {
+      lgbmEstimator.asInstanceOf[MLightGBMClassifier]
+      }
+    else {
+      lgbmEstimator.asInstanceOf[MLightGBMRegressor]
+    }
+
+    lgbmParams.foreach(kv => kv._1 match {
+      case "boostingType" => estimator.setBoostingType(kv._2.asInstanceOf[String])
+      case "numLeaves" => estimator.setNumLeaves(kv._2.asInstanceOf[Int])
+      case "maxDepth" => estimator.setMaxDepth(kv._2.asInstanceOf[Int])
+      case "learningRate" => estimator.setLearningRate(kv._2.asInstanceOf[Double])
+      case "numIterations" => estimator.setNumIterations(kv._2.asInstanceOf[Int])
+      case "binConstructSampleCnt" => estimator.setBinSampleCount(kv._2.asInstanceOf[Int])
+      case "objective" => estimator.setObjective(kv._2.asInstanceOf[String])
+      case "minSplitGain" => estimator.setMinGainToSplit(kv._2.asInstanceOf[Double])
+      case "minSumHessianInLeaf" => estimator.setMinSumHessianInLeaf(kv._2.asInstanceOf[Double])
+      case "minDataInLeaf" => estimator.setMinDataInLeaf(kv._2.asInstanceOf[Int])
+      case "baggingFraction" => estimator.setBaggingFraction(kv._2.asInstanceOf[Double])
+      case "baggingFreq" => estimator.setBaggingFreq(kv._2.asInstanceOf[Int])
+      case "featureFraction" => estimator.setFeatureFraction(kv._2.asInstanceOf[Double])
+      case "lambdaL1" => estimator.setLambdaL1(kv._2.asInstanceOf[Double])
+      case "lambdaL2" => estimator.setLambdaL2(kv._2.asInstanceOf[Double])
+      case "numThreads" => estimator.setNumThreads(kv._2.asInstanceOf[Int])
+      case "earlyStoppingRound" => estimator.setEarlyStoppingRound(kv._2.asInstanceOf[Int])
+      case "maxBin" => estimator.setMaxBin(kv._2.asInstanceOf[Int])
+      case _ =>
+        Log4Error.invalidInputError(false,
+          s"LightGBM setParams: key ${ kv._1} is not supported by lgbmParams map",
+          s"try to set this parameter by calling .set${kv._1}")
+    })
+
+  }
+}
diff --git a/scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/nnframes/LightGBMTrainSpec.scala b/scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/nnframes/LightGBMTrainSpec.scala
index aaf3ab0a047..20aa383dd4b 100644
--- a/scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/nnframes/LightGBMTrainSpec.scala
+++ b/scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/nnframes/LightGBMTrainSpec.scala
@@ -22,6 +22,7 @@ import org.apache.spark.SparkContext
 import org.apache.spark.ml.feature.VectorAssembler
 import org.apache.spark.sql.{SQLContext, SparkSession}
 import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMClassifier => MLightGBMClassifier}
+import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRegressor => MLightGBMRegressor}
 import org.apache.spark.SparkConf
 
 
@@ -45,6 +46,7 @@ class LightGBMTrainSpec extends ZooSpecHelper {
     val spark = SparkSession.builder().getOrCreate()
     import spark.implicits._
     Engine.init
+
     val df = Seq(
       (1.0, 2.0, 3.0, 4.0, 1),
       (1.0, 3.0, 8.0, 2.0, 0)
@@ -63,6 +65,55 @@ class LightGBMTrainSpec extends ZooSpecHelper {
     }
   }
 
+  "LightGBMClassifer train with params" should "work" in {
+    val spark = SparkSession.builder().getOrCreate()
+    import spark.implicits._
+    Engine.init
+
+    val df = Seq(
+      (1.0, 2.0, 3.0, 4.0, 1),
+      (1.0, 3.0, 8.0, 2.0, 0),
+      (1.0, 2.0, 3.0, 4.0, 1),
+      (1.0, 3.0, 8.0, 2.0, 0),
+      (1.0, 2.0, 3.0, 4.0, 1),
+      (1.0, 3.0, 8.0, 2.0, 0),
+      (1.0, 2.0, 3.0, 4.0, 1),
+      (1.0, 3.0, 8.0, 2.0, 0),
+      (1.0, 2.0, 3.0, 4.0, 1),
+      (1.0, 3.0, 8.0, 2.0, 0)
+    ).toDF("f1", "f2", "f3", "f4", "label")
+    val vectorAssembler = new VectorAssembler()
+      .setInputCols(Array("f1", "f2", "f3", "f4"))
+      .setOutputCol("features")
+    val assembledDf = vectorAssembler.transform(df).select("features", "label").cache()
+    if (spark.version.substring(0, 3).toDouble >= 3.1) {
+      val params = Map(
+        "boostingType" -> "gbdt",
+        "numLeaves" -> 2,
+        "maxDepth" -> 2,
+        "learningRate" -> 0.3,
+        "numIterations" -> 10,
+        "binConstructSampleCnt" -> 5,
+        "objective" -> "binary",
+        "minSplitGain" -> 0.1,
+        "minSumHessianInLeaf" -> 0.01,
+        "minDataInLeaf" -> 1,
+        "baggingFraction" -> 0.4,
+        "baggingFreq" -> 1,
+        "featureFraction" -> 0.4,
+        "lambdaL1" -> 0.1,
+        "lambdaL2" -> 0.1,
+        "numThreads" -> 2,
+        "earlyStoppingRound" -> 10,
+        "maxBin" -> 100)
+      val lightGBMclassifier = new LightGBMClassifier(params)
+      lightGBMclassifier.setIsUnbalance(true)
+      val model1 = lightGBMclassifier.fit(assembledDf)
+      val res1 = model1.transform(assembledDf)
+      TestUtils.conditionFailTest(res1.count() == 10)
+    }
+  }
+
   "LightGBMClassifer save" should "work" in {
     val spark = SparkSession.builder().getOrCreate()
     import spark.implicits._
@@ -110,5 +161,140 @@ class LightGBMTrainSpec extends ZooSpecHelper {
       TestUtils.conditionFailTest(y0_0.count() == 4)
     }
   }
+
+  "LightGBMRegressor train with params" should "work" in {
+    val spark = SparkSession.builder().getOrCreate()
+    import spark.implicits._
+    Engine.init
+    val df = Seq(
+      (1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 1.0f, 2.0f, 4.0f, 8.0f, 3.0f, 116.3668f),
+      (1.0f, 3.0f, 8.0f, 6.0f, 5.0f, 9.0f, 5.0f, 6.0f, 7.0f, 4.0f, 116.367f),
+      (2.0f, 1.0f, 5.0f, 7.0f, 6.0f, 7.0f, 4.0f, 1.0f, 2.0f, 3.0f, 116.367f),
+      (2.0f, 1.0f, 4.0f, 3.0f, 6.0f, 1.0f, 3.0f, 2.0f, 1.0f, 3.0f, 116.3668f)
+    ).toDF("f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "label")
+    val vectorAssembler = new VectorAssembler()
+      .setInputCols(Array("f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10"))
+      .setOutputCol("features")
+    val assembledDf = vectorAssembler.transform(df).select("features", "label").cache()
+    if (spark.version.substring(0, 3).toDouble >= 3.1) {
+      val params = Map(
+        "boostingType" -> "dart",
+        "numLeaves" -> 2,
+        "maxDepth" -> 2,
+        "learningRate" -> 0.3,
+        "numIterations" -> 10,
+        "binConstructSampleCnt" -> 5,
+        "objective" -> "huber",
+        "minSplitGain" -> 0.1,
+        "minSumHessianInLeaf" -> 0.01,
+        "minDataInLeaf" -> 1,
+        "baggingFraction" -> 0.4,
+        "baggingFreq" -> 1,
+        "featureFraction" -> 0.4,
+        "lambdaL1" -> 0.1,
+        "lambdaL2" -> 0.1,
+        "numThreads" -> 2,
+        "earlyStoppingRound" -> 10,
+        "maxBin" -> 100)
+      val lightGBMRegressor = new LightGBMRegressor(params)
+      val regressorModel0 = lightGBMRegressor.fit(assembledDf)
+      val y0 = regressorModel0.transform(assembledDf)
+      regressorModel0.saveNativeModel("/tmp/test")
+      val model = LightGBMRegressorModel.loadNativeModel("/tmp/test")
+      val y0_0 = model.transform(assembledDf)
+      y0_0.show(10)
+      TestUtils.conditionFailTest(y0.count() == 4)
+      TestUtils.conditionFailTest(y0_0.count() == 4)
+    }
+  }
+
+  "setParams for LightGBMClassifer " should "work" in {
+    Engine.init
+    val spark = SparkSession.builder().getOrCreate()
+    if (spark.version.substring(0, 3).toDouble >= 3.1) {
+      val params = Map(
+        "boostingType" -> "dart",
+        "numLeaves" -> 2,
+        "maxDepth" -> 2,
+        "learningRate" -> 0.3,
+        "numIterations" -> 10,
+        "binConstructSampleCnt" -> 5,
+        "objective" -> "huber",
+        "minSplitGain" -> 0.1,
+        "minSumHessianInLeaf" -> 0.01,
+        "minDataInLeaf" -> 1,
+        "baggingFraction" -> 0.4,
+        "baggingFreq" -> 1,
+        "featureFraction" -> 0.4,
+        "lambdaL1" -> 0.1,
+        "lambdaL2" -> 0.1,
+        "numThreads" -> 2,
+        "earlyStoppingRound" -> 10,
+        "maxBin" -> 100)
+      val mclassifier = new MLightGBMClassifier()
+      TreeModelUtils.setParams(mclassifier, params)
+      TestUtils.conditionFailTest(mclassifier.getEarlyStoppingRound == 10)
+      TestUtils.conditionFailTest(mclassifier.getMaxBin == 100)
+    }
+  }
+
+  "setParams for LightGBMRegressor" should "work" in {
+    Engine.init
+    val spark = SparkSession.builder().getOrCreate()
+    if (spark.version.substring(0, 3).toDouble >= 3.1) {
+      val params = Map(
+        "boostingType" -> "dart",
+        "numLeaves" -> 2,
+        "maxDepth" -> 2,
+        "learningRate" -> 0.3,
+        "numIterations" -> 10,
+        "binConstructSampleCnt" -> 5,
+        "objective" -> "huber",
+        "minSplitGain" -> 0.1,
+        "minSumHessianInLeaf" -> 0.01,
+        "minDataInLeaf" -> 1,
+        "baggingFraction" -> 0.4,
+        "baggingFreq" -> 1,
+        "featureFraction" -> 0.4,
+        "lambdaL1" -> 0.1,
+        "lambdaL2" -> 0.1,
+        "numThreads" -> 2,
+        "earlyStoppingRound" -> 10,
+        "maxBin" -> 100)
+      val mclassifier = new MLightGBMRegressor()
+      TreeModelUtils.setParams(mclassifier, params)
+      TestUtils.conditionFailTest(mclassifier.getEarlyStoppingRound == 10)
+      TestUtils.conditionFailTest(mclassifier.getMaxBin == 100)
+    }
+  }
+
+  "convertToCamelCase" should "work" in {
+    val paramsMap = Map(
+      "boosting_type" -> "boostingType",
+      "num_leaves" -> "numLeaves",
+      "max_depth" -> "maxDepth",
+      "learning_rate" -> "learningRate",
+      "num_iterations" -> "numIterations",
+      "bin_construct_sample_cnt" -> "binConstructSampleCnt",
+      "objective" -> "objective",
+      "min_split_gain" -> "minSplitGain",
+      "min_sum_hessian_in_leaf" -> "minSumHessianInLeaf",
+      "min_data_in_leaf" -> "minDataInLeaf",
+      "bagging_fraction" -> "baggingFraction",
+      "bagging_freq" -> "baggingFreq",
+      "feature_fraction" -> "featureFraction",
+      "lambda_l1" -> "lambdaL1",
+      "lambda_l2" -> "lambdaL2",
+      "num_threads" -> "numThreads",
+      "early_stopping_round" -> "earlyStoppingRound",
+      "max_bin" -> "maxBin",
+      "max_bin_by_feature" -> "maxBinByFeature")
+
+    paramsMap.foreach(kv => {
+      println(kv._1, TreeModelUtils.convert2CamelCase(kv._1))
+      TestUtils.conditionFailTest(kv._2 == TreeModelUtils.convert2CamelCase(kv._1))
+    })
+  }
+
 }