diff --git a/scala/dllib/pom.xml b/scala/dllib/pom.xml
index 5f3b6f97163..626cc2ae76b 100644
--- a/scala/dllib/pom.xml
+++ b/scala/dllib/pom.xml
@@ -332,6 +332,11 @@
       <version>1.7.16</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>com.microsoft.azure</groupId>
+      <artifactId>synapseml-lightgbm_2.12</artifactId>
+      <version>0.9.5</version>
+    </dependency>
@@ -374,6 +379,14 @@
                 <exclude>META-INF/maven/com.google.protobuf/protobuf-java/*</exclude>
               </excludes>
             </filter>
+            <filter>
+              <artifact>*:*</artifact>
+              <excludes>
+                <exclude>META-INF/*.SF</exclude>
+                <exclude>META-INF/*.DSA</exclude>
+                <exclude>META-INF/*.RSA</exclude>
+              </excludes>
+            </filter>
diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/NNClassifier.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/NNClassifier.scala
index b44e7354205..24e5a26cbbf 100644
--- a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/NNClassifier.scala
+++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/NNClassifier.scala
@@ -322,396 +322,3 @@ object NNClassifierModel extends MLReadable[NNClassifierModel[_]] {
     new NNClassifierModel.NNClassifierModelReader
   }
 }
-
-class XGBClassifier (val xgboostParams: Map[String, Any] = Map()) {
-  val sc = SparkContext.getOrCreate()
-  sc.getConf.set("spark.task.cpus", Engine.coreNumber().toString)
-  private val model = new XGBoostClassifier(xgboostParams)
-  model.setNthread(Engine.coreNumber())
-  model.setNumWorkers(Engine.nodeNumber())
-  model.setMaxBins(256)
-
-  def setFeaturesCol(featuresColName: String): this.type = {
-    model.setFeaturesCol(featuresColName)
-    this
-  }
-
-  def fit(df: DataFrame): XGBClassifierModel = {
-    df.repartition(Engine.nodeNumber())
-    val xgbmodel = model.fit(df)
-    new XGBClassifierModel(xgbmodel)
-  }
-
-  def setNthread(value: Int): this.type = {
-    model.setNthread(value)
-    this
-  }
-
-  def setNumRound(value: Int): this.type = {
-    model.setNumRound(value)
-    this
-  }
-
-  def setNumWorkers(value: Int): this.type = {
-    model.setNumWorkers(value)
-    this
-  }
-
-  def setEta(value: Double): this.type = {
-    model.setEta(value)
-    this
-  }
-
-  def setGamma(value: Int): this.type = {
-    model.setGamma(value)
-    this
-  }
-
-  def setMaxDepth(value: Int): this.type = {
-    model.setMaxDepth(value)
-    this
-  }
-
-  def setMissing(value: Float): this.type = {
-    model.setMissing(value)
-    this
-  }
-
-  def setLabelCol(labelColName: String): this.type = {
-    model.setLabelCol(labelColName)
-    this
-  }
-  def setTreeMethod(value: String): this.type = {
-    model.setTreeMethod(value)
-    this
-  }
-
-  def setObjective(value: String): this.type = {
-    model.setObjective(value)
-    this
-  }
-
-  def setNumClass(value: Int): this.type = {
-    model.setNumClass(value)
-    this
-  }
-
-  def setTimeoutRequestWorkers(value: Long): this.type = {
-    model.setTimeoutRequestWorkers(value)
-    this
-  }
-}
-/**
- * [[XGBClassifierModel]] is a trained XGBoost classification model.
- * The prediction column will have the prediction results.
- *
- * @param model trained XGBoostClassificationModel to use in prediction.
- */
-class XGBClassifierModel private[bigdl](
-    val model: XGBoostClassificationModel) {
-  private var featuresCols: String = null
-  private var predictionCol: String = null
-
-  def setFeaturesCol(featuresColName: String): this.type = {
-    featuresCols = featuresColName
-    this
-  }
-
-  def setPredictionCol(value: String): this.type = {
-    predictionCol = value
-    this
-  }
-
-  def setInferBatchSize(value: Int): this.type = {
-    model.setInferBatchSize(value)
-    this
-  }
-
-  def transform(dataset: DataFrame): DataFrame = {
-    Log4Error.invalidInputError(featuresCols!=None, "Please set feature columns before transform")
-    model.setFeaturesCol(featuresCols)
-    var output = model.transform(dataset)
-    if(predictionCol != null) {
-      output = output.withColumnRenamed("prediction", predictionCol)
-    }
-    output
-  }
-
-  def save(path: String): Unit = {
-    model.write.overwrite().save(path)
-  }
-
-}
-
-object XGBClassifierModel {
-  def load(path: String, numClass: Int): XGBClassifierModel = {
-    new XGBClassifierModel(XGBoostHelper.load(path, numClass))
-  }
-
-  def load(path: String): XGBClassifierModel = {
-    new XGBClassifierModel(XGBoostClassificationModel.load(path))
-  }
-
-}
-
-/**
- * [[XGBRegressor]] xgboost wrapper of XGBRegressor.
- */
-class XGBRegressor () {
-
-  private val model = new XGBoostRegressor()
-  model.setNthread(Engine.coreNumber())
-  model.setMaxBins(256)
-
-  def setLabelCol(labelColName : String) : this.type = {
-    model.setLabelCol(labelColName)
-    this
-  }
-
-  def setFeaturesCol(featuresColName: String): this.type = {
-    model.setFeaturesCol(featuresColName)
-    this
-  }
-
-  def fit(df: DataFrame): XGBRegressorModel = {
-    df.repartition(Engine.nodeNumber())
-    val xgbModel = model.fit(df)
-    new XGBRegressorModel(xgbModel)
-  }
-
-  def setNumRound(value: Int): this.type = {
-    model.setNumRound(value)
-    this
-  }
-
-  def setNumWorkers(value: Int): this.type = {
-    model.setNumWorkers(value)
-    this
-  }
-
-  def setNthread(value: Int): this.type = {
-    model.setNthread(value)
-    this
-  }
-
-  def setSilent(value: Int): this.type = {
-    model.setSilent(value)
-    this
-  }
-
-  def setMissing(value: Float): this.type = {
-    model.setMissing(value)
-    this
-  }
-
-  def setCheckpointPath(value: String): this.type = {
-    model.setCheckpointPath(value)
-    this
-  }
-
-  def setCheckpointInterval(value: Int): this.type = {
-    model.setCheckpointInterval(value)
-    this
-  }
-
-  def setSeed(value: Long): this.type = {
-    model.setSeed(value)
-    this
-  }
-
-  def setEta(value: Double): this.type = {
-    model.setEta(value)
-    this
-  }
-
-  def setGamma(value: Double): this.type = {
-    model.setGamma(value)
-    this
-  }
-
-  def setMaxDepth(value: Int): this.type = {
-    model.setMaxDepth(value)
-    this
-  }
-
-  def setMinChildWeight(value: Double): this.type = {
-    model.setMinChildWeight(value)
-    this
-  }
-
-  def setMaxDeltaStep(value: Double): this.type = {
-    model.setMaxDeltaStep(value)
-    this
-  }
-
-  def setColsampleBytree(value: Double): this.type = {
-    model.setColsampleBytree(value)
-    this
-  }
-
-  def setColsampleBylevel(value: Double): this.type = {
-    model.setColsampleBylevel(value)
-    this
-  }
-
-  def setLambda(value: Double): this.type = {
-    model.setLambda(value)
-    this
-  }
-
-  def setAlpha(value: Double): this.type = {
-    model.setAlpha(value)
-    this
-  }
-
-  def setTreeMethod(value: String): this.type = {
-    model.setTreeMethod(value)
-    this
-  }
-
-  def setGrowPolicy(value: String): this.type = {
-    model.setGrowPolicy(value)
-    this
-  }
-
-  def setMaxBins(value: Int): this.type = {
-    model.setMaxBins(value)
-    this
-  }
-
-  def setMaxLeaves(value: Int): this.type = {
-    model.setMaxLeaves(value)
-    this
-  }
-
-  def setSketchEps(value: Double): this.type = {
-    model.setSketchEps(value)
-    this
-  }
-
-  def setScalePosWeight(value: Double): this.type = {
-    model.setScalePosWeight(value)
-    this
-  }
-
-  def setSampleType(value: String): this.type = {
-    model.setSampleType(value)
-    this
-  }
-
-  def setNormalizeType(value: String): this.type = {
-    model.setNormalizeType(value)
-    this
-  }
-
-  def setRateDrop(value: Double): this.type = {
-    model.setRateDrop(value)
-    this
-  }
-
-  def setSkipDrop(value: Double): this.type = {
-    model.setSkipDrop(value)
-    this
-  }
-
-  def setLambdaBias(value: Double): this.type = {
-    model.setLambdaBias(value)
-    this
-  }
-
-  def setObjective(value: String): this.type = {
-    model.setObjective(value)
-    this
-  }
-
-  def setObjectiveType(value: String): this.type = {
-    model.setObjectiveType(value)
-    this
-  }
-
-  def setSubsample(value: Double): this.type = {
-    model.setSubsample(value)
-    this
-  }
-
-  def setBaseScore(value: Double): this.type = {
-    model.setBaseScore(value)
-    this
-  }
-
-  def setEvalMetric(value: String): this.type = {
-    model.setEvalMetric(value)
-    this
-  }
-
-  def setNumEarlyStoppingRounds(value: Int): this.type = {
-    model.setNumEarlyStoppingRounds(value)
-    this
-  }
-
-  def setMaximizeEvaluationMetrics(value: Boolean): this.type = {
-    model.setMaximizeEvaluationMetrics(value)
-    this
-  }
-}
-
-/**
- * [[XGBRegressorModel]] xgboost wrapper of XGBRegressorModel.
- */
-class XGBRegressorModel private[bigdl](val model: XGBoostRegressionModel) {
-  var predictionCol: String = null
-  var featuresCol: String = "features"
-  var featurearray: Array[String] = Array("features")
-  def setPredictionCol(value: String): this.type = {
-    predictionCol = value
-    this
-  }
-
-  def setInferBatchSize(value: Int): this.type = {
-    model.setInferBatchSize(value)
-    this
-  }
-
-  def setFeaturesCol(value: String): this.type = {
-    model.setFeaturesCol(value)
-    featuresCol = value
-    this
-  }
-
-  def transform(dataset: DataFrame): DataFrame = {
-    val featureVectorAssembler = new VectorAssembler()
-      .setInputCols(featurearray)
-      .setOutputCol("featureAssembledVector")
-    val assembledDF = featureVectorAssembler.transform(dataset)
-    import org.apache.spark.sql.functions.{col, udf}
-    import org.apache.spark.ml.linalg.Vector
-    val asDense = udf((v: Vector) => v.toDense)
-    val xgbInput = assembledDF.withColumn("DenseFeatures", asDense(col("featureAssembledVector")))
-    model.setFeaturesCol("DenseFeatures")
-    var output = model.transform(xgbInput).drop("DenseFeatures", "featureAssembledVector")
-    if(predictionCol != null) {
-      output = output.withColumnRenamed("prediction", predictionCol)
-    }
-    output
-  }
-
-  def save(path: String): Unit = {
-    model.write.overwrite().save(path)
-  }
-}
-
-object XGBRegressorModel {
-  /**
-   * Load pretrained Zoo XGBRegressorModel.
-   */
-  def load(path: String): XGBRegressorModel = {
-    new XGBRegressorModel(XGBoostRegressionModel.load(path))
-  }
-
-  /**
-   * Load pretrained xgboost XGBoostRegressionModel.
- */
-  def loadFromXGB(path: String): XGBRegressorModel = {
-    new XGBRegressorModel(XGBoostHelper.load(path))
-  }
-}
diff --git a/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModel.scala b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModel.scala
new file mode 100644
index 00000000000..bcd6a25e964
--- /dev/null
+++ b/scala/dllib/src/main/scala/com/intel/analytics/bigdl/dllib/nnframes/TreeModel.scala
@@ -0,0 +1,797 @@
+/*
+ * Copyright 2016 The BigDL Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.analytics.bigdl.dllib.nnframes
+
+import com.intel.analytics.bigdl.dllib.utils.{Engine, Log4Error}
+import ml.dmlc.xgboost4j.scala.spark._
+import org.apache.spark.ml.feature.VectorAssembler
+import org.apache.spark.sql.{DataFrame, SparkSession}
+import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMClassificationModel => MLightGBMClassificationModel}
+import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMClassifier => MLightGBMClassifier}
+import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRegressionModel => MLightGBMRegressionModel}
+import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMRegressor => MLightGBMRegressor}
+
+class XGBClassifier(val xgboostParams: Map[String, Any] = Map()) {
+  val sc = SparkSession.active.sparkContext
+  sc.getConf.set("spark.task.cpus", Engine.coreNumber().toString)
+  private val estimator = new XGBoostClassifier(xgboostParams)
+  estimator.setNthread(Engine.coreNumber())
+  estimator.setNumWorkers(Engine.nodeNumber())
+  estimator.setMaxBins(256)
+
+  def setFeaturesCol(featuresColName: String): this.type = {
+    estimator.setFeaturesCol(featuresColName)
+    this
+  }
+
+  def fit(df: DataFrame): XGBClassifierModel = {
+    // repartition returns a new DataFrame; fit on it, otherwise the call is a no-op
+    val repartitioned = df.repartition(Engine.nodeNumber())
+    val xgbmodel = estimator.fit(repartitioned)
+    new XGBClassifierModel(xgbmodel)
+  }
+
+  def setNthread(value: Int): this.type = {
+    estimator.setNthread(value)
+    this
+  }
+
+  def setNumRound(value: Int): this.type = {
+    estimator.setNumRound(value)
+    this
+  }
+
+  def setNumWorkers(value: Int): this.type = {
+    estimator.setNumWorkers(value)
+    this
+  }
+
+  def setEta(value: Double): this.type = {
+    estimator.setEta(value)
+    this
+  }
+
+  def setGamma(value: Int): this.type = {
+    estimator.setGamma(value)
+    this
+  }
+
+  def setMaxDepth(value: Int): this.type = {
+    estimator.setMaxDepth(value)
+    this
+  }
+
+  def setMissing(value: Float): this.type = {
+    estimator.setMissing(value)
+    this
+  }
+
+  def setLabelCol(labelColName: String): this.type = {
+    estimator.setLabelCol(labelColName)
+    this
+  }
+
+  def setTreeMethod(value: String): this.type = {
+    estimator.setTreeMethod(value)
+    this
+  }
+
+  def setObjective(value: String): this.type = {
+    estimator.setObjective(value)
+    this
+  }
+
+  def setNumClass(value: Int): this.type = {
+    estimator.setNumClass(value)
+    this
+  }
+
+  def setTimeoutRequestWorkers(value: Long): this.type = {
+    estimator.setTimeoutRequestWorkers(value)
+    this
+  }
+}
+
+/**
+ * [[XGBClassifierModel]] is a trained XGBoost classification model.
+ * The prediction column will have the prediction results.
+ *
+ * @param model trained XGBoostClassificationModel to use in prediction.
+ */
+class XGBClassifierModel private[bigdl](
+    val model: XGBoostClassificationModel) {
+  private var featuresCols: String = null
+  private var predictionCol: String = null
+
+  def setFeaturesCol(featuresColName: String): this.type = {
+    featuresCols = featuresColName
+    this
+  }
+
+  def setPredictionCol(value: String): this.type = {
+    predictionCol = value
+    this
+  }
+
+  def setInferBatchSize(value: Int): this.type = {
+    model.setInferBatchSize(value)
+    this
+  }
+
+  def transform(dataset: DataFrame): DataFrame = {
+    // featuresCols is a String, so check against null rather than None
+    Log4Error.invalidInputError(featuresCols != null,
+      "Please set feature columns before transform")
+    model.setFeaturesCol(featuresCols)
+    var output = model.transform(dataset)
+    if (predictionCol != null) {
+      output = output.withColumnRenamed("prediction", predictionCol)
+    }
+    output
+  }
+
+  def save(path: String): Unit = {
+    model.write.overwrite().save(path)
+  }
+}
+
+object XGBClassifierModel {
+  def load(path: String, numClass: Int): XGBClassifierModel = {
+    new XGBClassifierModel(XGBoostHelper.load(path, numClass))
+  }
+
+  def load(path: String): XGBClassifierModel = {
+    new XGBClassifierModel(XGBoostClassificationModel.load(path))
+  }
+}
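For orientation, a minimal usage sketch of the classifier wrapper above. It assumes `Engine.init` has been called and `trainDf` is a DataFrame with an assembled vector column "features" and an integer "label" column; column names, paths, and parameter values are illustrative, not part of this patch:

    // hedged sketch: trainDf and all literals below are placeholders
    val classifier = new XGBClassifier()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setObjective("multi:softprob")
      .setNumClass(2)
      .setNumRound(10)
      .setNthread(1)
    val model: XGBClassifierModel = classifier.fit(trainDf)
    val scored = model.setPredictionCol("pred").transform(trainDf)
    model.save("/tmp/xgb_classifier")  // Spark ML writer format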
+
+/**
+ * [[XGBRegressor]] xgboost wrapper of XGBRegressor.
+ */
+class XGBRegressor() {
+
+  private val estimator = new XGBoostRegressor()
+  estimator.setNthread(Engine.coreNumber())
+  estimator.setMaxBins(256)
+
+  def setLabelCol(labelColName: String): this.type = {
+    estimator.setLabelCol(labelColName)
+    this
+  }
+
+  def setFeaturesCol(featuresColName: String): this.type = {
+    estimator.setFeaturesCol(featuresColName)
+    this
+  }
+
+  def fit(df: DataFrame): XGBRegressorModel = {
+    // repartition returns a new DataFrame; fit on it, otherwise the call is a no-op
+    val repartitioned = df.repartition(Engine.nodeNumber())
+    val xgbModel = estimator.fit(repartitioned)
+    new XGBRegressorModel(xgbModel)
+  }
+
+  def setNumRound(value: Int): this.type = {
+    estimator.setNumRound(value)
+    this
+  }
+
+  def setNumWorkers(value: Int): this.type = {
+    estimator.setNumWorkers(value)
+    this
+  }
+
+  def setNthread(value: Int): this.type = {
+    estimator.setNthread(value)
+    this
+  }
+
+  def setSilent(value: Int): this.type = {
+    estimator.setSilent(value)
+    this
+  }
+
+  def setMissing(value: Float): this.type = {
+    estimator.setMissing(value)
+    this
+  }
+
+  def setCheckpointPath(value: String): this.type = {
+    estimator.setCheckpointPath(value)
+    this
+  }
+
+  def setCheckpointInterval(value: Int): this.type = {
+    estimator.setCheckpointInterval(value)
+    this
+  }
+
+  def setSeed(value: Long): this.type = {
+    estimator.setSeed(value)
+    this
+  }
+
+  def setEta(value: Double): this.type = {
+    estimator.setEta(value)
+    this
+  }
+
+  def setGamma(value: Double): this.type = {
+    estimator.setGamma(value)
+    this
+  }
+
+  def setMaxDepth(value: Int): this.type = {
+    estimator.setMaxDepth(value)
+    this
+  }
+
+  def setMinChildWeight(value: Double): this.type = {
+    estimator.setMinChildWeight(value)
+    this
+  }
+
+  def setMaxDeltaStep(value: Double): this.type = {
+    estimator.setMaxDeltaStep(value)
+    this
+  }
+
+  def setColsampleBytree(value: Double): this.type = {
+    estimator.setColsampleBytree(value)
+    this
+  }
+
+  def setColsampleBylevel(value: Double): this.type = {
+    estimator.setColsampleBylevel(value)
+    this
+  }
+
+  def setLambda(value: Double): this.type = {
+    estimator.setLambda(value)
+    this
+  }
+
+  def setAlpha(value: Double): this.type = {
+    estimator.setAlpha(value)
+    this
+  }
+
+  def setTreeMethod(value: String): this.type = {
+    estimator.setTreeMethod(value)
+    this
+  }
+
+  def setGrowPolicy(value: String): this.type = {
+    estimator.setGrowPolicy(value)
+    this
+  }
+
+  def setMaxBins(value: Int): this.type = {
+    estimator.setMaxBins(value)
+    this
+  }
+
+  def setMaxLeaves(value: Int): this.type = {
+    estimator.setMaxLeaves(value)
+    this
+  }
+
+  def setSketchEps(value: Double): this.type = {
+    estimator.setSketchEps(value)
+    this
+  }
+
+  def setScalePosWeight(value: Double): this.type = {
+    estimator.setScalePosWeight(value)
+    this
+  }
+
+  def setSampleType(value: String): this.type = {
+    estimator.setSampleType(value)
+    this
+  }
+
+  def setNormalizeType(value: String): this.type = {
+    estimator.setNormalizeType(value)
+    this
+  }
+
+  def setRateDrop(value: Double): this.type = {
+    estimator.setRateDrop(value)
+    this
+  }
+
+  def setSkipDrop(value: Double): this.type = {
+    estimator.setSkipDrop(value)
+    this
+  }
+
+  def setLambdaBias(value: Double): this.type = {
+    estimator.setLambdaBias(value)
+    this
+  }
+
+  def setObjective(value: String): this.type = {
+    estimator.setObjective(value)
+    this
+  }
+
+  def setObjectiveType(value: String): this.type = {
+    estimator.setObjectiveType(value)
+    this
+  }
+
+  def setSubsample(value: Double): this.type = {
+    estimator.setSubsample(value)
+    this
+  }
+
+  def setBaseScore(value: Double): this.type = {
+    estimator.setBaseScore(value)
+    this
+  }
+
+  def setEvalMetric(value: String): this.type = {
+    estimator.setEvalMetric(value)
+    this
+  }
+
+  def setNumEarlyStoppingRounds(value: Int): this.type = {
+    estimator.setNumEarlyStoppingRounds(value)
+    this
+  }
+
+  def setMaximizeEvaluationMetrics(value: Boolean): this.type = {
+    estimator.setMaximizeEvaluationMetrics(value)
+    this
+  }
+}
+
+/**
+ * [[XGBRegressorModel]] xgboost wrapper of XGBRegressorModel.
+ */
+class XGBRegressorModel private[bigdl](val model: XGBoostRegressionModel) {
+  var predictionCol: String = null
+  var featuresCol: String = "features"
+  var featurearray: Array[String] = Array("features")
+
+  def setPredictionCol(value: String): this.type = {
+    predictionCol = value
+    this
+  }
+
+  def setInferBatchSize(value: Int): this.type = {
+    model.setInferBatchSize(value)
+    this
+  }
+
+  def setFeaturesCol(value: String): this.type = {
+    model.setFeaturesCol(value)
+    featuresCol = value
+    this
+  }
+
+  def transform(dataset: DataFrame): DataFrame = {
+    val featureVectorAssembler = new VectorAssembler()
+      .setInputCols(featurearray)
+      .setOutputCol("featureAssembledVector")
+    val assembledDF = featureVectorAssembler.transform(dataset)
+    import org.apache.spark.ml.linalg.Vector
+    import org.apache.spark.sql.functions.{col, udf}
+    val asDense = udf((v: Vector) => v.toDense)
+    val xgbInput = assembledDF.withColumn("DenseFeatures", asDense(col("featureAssembledVector")))
+    model.setFeaturesCol("DenseFeatures")
+    var output = model.transform(xgbInput).drop("DenseFeatures", "featureAssembledVector")
+    if (predictionCol != null) {
+      output = output.withColumnRenamed("prediction", predictionCol)
+    }
+    output
+  }
+
+  def save(path: String): Unit = {
+    model.write.overwrite().save(path)
+  }
+}
+
+object XGBRegressorModel {
+  /**
+   * Load a pretrained Zoo XGBRegressorModel.
+   */
+  def load(path: String): XGBRegressorModel = {
+    new XGBRegressorModel(XGBoostRegressionModel.load(path))
+  }
+
+  /**
+   * Load a pretrained xgboost XGBoostRegressionModel.
+   */
+  def loadFromXGB(path: String): XGBRegressorModel = {
+    new XGBRegressorModel(XGBoostHelper.load(path))
+  }
+}
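Similarly, a hedged sketch of the regressor wrapper and its save/load round trip, under the same assumptions as above (`trainDf` with a "features" vector column; the path is illustrative):

    // hedged sketch: not part of this patch
    val regressor = new XGBRegressor()
      .setLabelCol("label")
      .setFeaturesCol("features")
      .setNumRound(10)
      .setNumWorkers(1)
    val model = regressor.fit(trainDf)
    model.save("/tmp/xgb_regressor")  // Spark ML writer format
    val reloaded = XGBRegressorModel.load("/tmp/xgb_regressor")
    val predictions = reloaded.setPredictionCol("pred").transform(trainDf)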
+
+/**
+ * [[LightGBMClassifier]] lightGBM wrapper of LightGBMClassifier.
+ */
+class LightGBMClassifier {
+
+  val sc = SparkSession.active.sparkContext
+  sc.getConf.set("spark.task.cpus", Engine.coreNumber().toString)
+
+  val estimator = new MLightGBMClassifier()
+  estimator.setNumThreads(Engine.coreNumber())
+
+  def setLabelCol(labelColName: String): this.type = {
+    estimator.setLabelCol(labelColName)
+    this
+  }
+
+  def setFeaturesCol(featuresColName: String): this.type = {
+    estimator.setFeaturesCol(featuresColName)
+    this
+  }
+
+  def setBoostingType(value: String): this.type = {
+    estimator.setBoostingType(value)
+    this
+  }
+
+  // regularization parameters
+  def setMaxBin(value: Int): this.type = {
+    estimator.setMaxBin(value)
+    this
+  }
+
+  def setNumLeaves(value: Int): this.type = {
+    estimator.setNumLeaves(value)
+    this
+  }
+
+  def setMinDataInLeaf(value: Int): this.type = {
+    estimator.setMinDataInLeaf(value)
+    this
+  }
+
+  def setMinSumHessianInLeaf(value: Int): this.type = {
+    estimator.setMinSumHessianInLeaf(value)
+    this
+  }
+
+  def setBaggingFraction(value: Double): this.type = {
+    estimator.setBaggingFraction(value)
+    this
+  }
+
+  def setBaggingFreq(value: Int): this.type = {
+    estimator.setBaggingFreq(value)
+    this
+  }
+
+  def setFeatureFraction(value: Double): this.type = {
+    estimator.setFeatureFraction(value)
+    this
+  }
+
+  def setLambdaL1(value: Double): this.type = {
+    estimator.setLambdaL1(value)
+    this
+  }
+
+  def setLambdaL2(value: Double): this.type = {
+    estimator.setLambdaL2(value)
+    this
+  }
+
+  def setMaxDepth(value: Int): this.type = {
+    estimator.setMaxDepth(value)
+    this
+  }
+
+  def setMinGainToSplit(value: Double): this.type = {
+    estimator.setMinGainToSplit(value)
+    this
+  }
+
+  def setMaxDeltaStep(value: Double): this.type = {
+    estimator.setMaxDeltaStep(value)
+    this
+  }
+
+  def setSkipDrop(value: Double): this.type = {
+    estimator.setSkipDrop(value)
+    this
+  }
+
+  // training parameters
+  def setNumIterations(value: Int): this.type = {
+    estimator.setNumIterations(value)
+    this
+  }
+
+  def getNumIterations: Int = {
+    estimator.getNumIterations
+  }
+
+  def setLearningRate(value: Double): this.type = {
+    estimator.setLearningRate(value)
+    this
+  }
+
+  def setEarlyStoppingRound(value: Int): this.type = {
+    estimator.setEarlyStoppingRound(value)
+    this
+  }
+
+  def setCategoricalSlotNames(value: Array[String]): this.type = {
+    estimator.setCategoricalSlotNames(value)
+    this
+  }
+
+  def setCategoricalSlotIndexes(value: Array[Int]): this.type = {
+    estimator.setCategoricalSlotIndexes(value)
+    this
+  }
+
+  def setObjective(value: String): this.type = {
+    estimator.setObjective(value)
+    this
+  }
+
+  def setIsUnbalance(value: Boolean): this.type = {
+    estimator.setIsUnbalance(value)
+    this
+  }
+
+  def setNumThreads(value: Int): this.type = {
+    estimator.setNumThreads(value)
+    this
+  }
+
+  def fit(df: DataFrame): LightGBMClassifierModel = {
+    // repartition returns a new DataFrame; fit on it, otherwise the call is a no-op
+    val repartitioned = df.repartition(Engine.nodeNumber())
+    val lightgbmmodel = estimator.fit(repartitioned)
+    new LightGBMClassifierModel(lightgbmmodel)
+  }
+}
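A minimal sketch of the intended call pattern for this wrapper, assuming `Engine.init` and an assembled "features" column (`LightGBMClassifierModel` is defined immediately below; names and values are illustrative):

    // hedged sketch: not part of this patch
    val classifier = new LightGBMClassifier()
      .setFeaturesCol("features")
      .setLabelCol("label")
      .setObjective("binary")
      .setNumIterations(100)
    val model = classifier.fit(trainDf)            // LightGBMClassifierModel
    val scored = model.transform(trainDf)
    model.saveNativeModel("/tmp/lgbm_classifier")  // illustrative path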
+
+/**
+ * [[LightGBMClassifierModel]] is a trained LightGBM classification model.
+ * The prediction column will have the prediction results.
+ *
+ * @param model trained MLightGBMClassificationModel to use in prediction.
+ */
+class LightGBMClassifierModel private[bigdl](val model: MLightGBMClassificationModel) {
+  private var featuresCols: String = "features"
+  private var predictionCol: String = "prediction"
+
+  def setFeaturesCol(featuresColName: String): this.type = {
+    featuresCols = featuresColName
+    this
+  }
+
+  def setPredictionCol(value: String): this.type = {
+    predictionCol = value
+    this
+  }
+
+  def transform(dataset: DataFrame): DataFrame = {
+    // featuresCols is a String, so check against null rather than None
+    Log4Error.invalidInputError(featuresCols != null,
+      "Please set feature columns before transform")
+    model.setFeaturesCol(featuresCols)
+    var output = model.transform(dataset)
+    if (predictionCol != null) {
+      output = output.withColumnRenamed("prediction", predictionCol)
+    }
+    output
+  }
+
+  def saveNativeModel(path: String): Unit = {
+    model.saveNativeModel(path, overwrite = true)
+  }
+}
+
+object LightGBMClassifierModel {
+  def loadNativeModel(path: String): LightGBMClassifierModel = {
+    new LightGBMClassifierModel(MLightGBMClassificationModel.loadNativeModelFromFile(path))
+  }
+}
+
+/**
+ * [[LightGBMRegressor]] lightGBM wrapper of LightGBMRegressor.
+ */
+class LightGBMRegressor {
+
+  val sc = SparkSession.active.sparkContext
+  sc.getConf.set("spark.task.cpus", Engine.coreNumber().toString)
+
+  private val estimator = new MLightGBMRegressor()
+  estimator.setNumThreads(Engine.coreNumber())
+
+  def setAlpha(value: Double): this.type = {
+    estimator.setAlpha(value)
+    this
+  }
+
+  def setLabelCol(labelColName: String): this.type = {
+    estimator.setLabelCol(labelColName)
+    this
+  }
+
+  def setFeaturesCol(featuresColName: String): this.type = {
+    estimator.setFeaturesCol(featuresColName)
+    this
+  }
+
+  def setBoostingType(value: String): this.type = {
+    estimator.setBoostingType(value)
+    this
+  }
+
+  // regularization parameters
+  def setMaxBin(value: Int): this.type = {
+    estimator.setMaxBin(value)
+    this
+  }
+
+  def setNumLeaves(value: Int): this.type = {
+    estimator.setNumLeaves(value)
+    this
+  }
+
+  def setMinDataInLeaf(value: Int): this.type = {
+    estimator.setMinDataInLeaf(value)
+    this
+  }
+
+  def setMinSumHessianInLeaf(value: Int): this.type = {
+    estimator.setMinSumHessianInLeaf(value)
+    this
+  }
+
+  def setBaggingFraction(value: Double): this.type = {
+    estimator.setBaggingFraction(value)
+    this
+  }
+
+  def setBaggingFreq(value: Int): this.type = {
+    estimator.setBaggingFreq(value)
+    this
+  }
+
+  def setFeatureFraction(value: Double): this.type = {
+    estimator.setFeatureFraction(value)
+    this
+  }
+
+  def setLambdaL1(value: Double): this.type = {
+    estimator.setLambdaL1(value)
+    this
+  }
+
+  def setLambdaL2(value: Double): this.type = {
+    estimator.setLambdaL2(value)
+    this
+  }
+
+  def setMaxDepth(value: Int): this.type = {
+    estimator.setMaxDepth(value)
+    this
+  }
+
+  def setMinGainToSplit(value: Double): this.type = {
+    estimator.setMinGainToSplit(value)
+    this
+  }
+
+  def setMaxDeltaStep(value: Double): this.type = {
+    estimator.setMaxDeltaStep(value)
+    this
+  }
+
+  def setSkipDrop(value: Double): this.type = {
+    estimator.setSkipDrop(value)
+    this
+  }
+
+  // training parameters
+  def setNumIterations(value: Int): this.type = {
+    estimator.setNumIterations(value)
+    this
+  }
+
+  def setLearningRate(value: Double): this.type = {
+    estimator.setLearningRate(value)
+    this
+  }
+
+  def setEarlyStoppingRound(value: Int): this.type = {
+    estimator.setEarlyStoppingRound(value)
+    this
+  }
+
+  def setCategoricalSlotNames(value: Array[String]): this.type = {
+    estimator.setCategoricalSlotNames(value)
+    this
+  }
+
+  def setCategoricalSlotIndexes(value: Array[Int]): this.type = {
+    estimator.setCategoricalSlotIndexes(value)
+    this
+  }
+
+  def setObjective(value: String): this.type = {
+    estimator.setObjective(value)
+    this
+  }
+
+  def setNumThreads(value: Int): this.type = {
+    estimator.setNumThreads(value)
+    this
+  }
+
+  def fit(df: DataFrame): LightGBMRegressorModel = {
+    // repartition returns a new DataFrame; fit on it, otherwise the call is a no-op
+    val repartitioned = df.repartition(Engine.nodeNumber())
+    val lightgbmmodel = estimator.fit(repartitioned)
+    new LightGBMRegressorModel(lightgbmmodel)
+  }
+}
+
+/**
+ * [[LightGBMRegressorModel]] lightGBM wrapper of LightGBMRegressorModel.
+ */
+class LightGBMRegressorModel private[bigdl](val model: MLightGBMRegressionModel) {
+  var predictionCol: String = null
+  var featuresCol: String = "features"
+  var featurearray: Array[String] = Array("features")
+
+  def setPredictionCol(value: String): this.type = {
+    predictionCol = value
+    this
+  }
+
+  def setFeaturesCol(value: String): this.type = {
+    model.setFeaturesCol(value)
+    featuresCol = value
+    this
+  }
+
+  def transform(dataset: DataFrame): DataFrame = {
+    val featureVectorAssembler = new VectorAssembler()
+      .setInputCols(featurearray)
+      .setOutputCol("featureAssembledVector")
+    val assembledDF = featureVectorAssembler.transform(dataset)
+    import org.apache.spark.ml.linalg.Vector
+    import org.apache.spark.sql.functions.{col, udf}
+    val asDense = udf((v: Vector) => v.toDense)
+    val lgbmInput = assembledDF.withColumn("DenseFeatures", asDense(col("featureAssembledVector")))
+    model.setFeaturesCol("DenseFeatures")
+    var output = model.transform(lgbmInput).drop("DenseFeatures", "featureAssembledVector")
+    if (predictionCol != null) {
+      output = output.withColumnRenamed("prediction", predictionCol)
+    }
+    output
+  }
+
+  def saveNativeModel(path: String): Unit = {
+    model.saveNativeModel(path, overwrite = true)
+  }
+}
+
+object LightGBMRegressorModel {
+  /**
+   * Load a pretrained LightGBMRegressorModel from a native model file.
+   */
+  def loadNativeModel(path: String): LightGBMRegressorModel = {
+    new LightGBMRegressorModel(MLightGBMRegressionModel.loadNativeModelFromFile(path))
+  }
+}
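And the corresponding regressor round trip, sketched under the same assumptions (paths, parameter values, and `trainDf` are illustrative):

    // hedged sketch: not part of this patch
    val regressor = new LightGBMRegressor()
      .setLabelCol("label")
      .setObjective("regression")
      .setNumIterations(100)
    val model = regressor.fit(trainDf)
    model.saveNativeModel("/tmp/lgbm_regressor")
    val reloaded = LightGBMRegressorModel.loadNativeModel("/tmp/lgbm_regressor")
    val predictions = reloaded.setPredictionCol("pred").transform(trainDf)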
diff --git a/scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/nnframes/LightGBMTrainSpec.scala b/scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/nnframes/LightGBMTrainSpec.scala
new file mode 100644
index 00000000000..aaf3ab0a047
--- /dev/null
+++ b/scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/nnframes/LightGBMTrainSpec.scala
@@ -0,0 +1,114 @@
+/*
+ * Copyright 2016 The BigDL Authors.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.intel.analytics.bigdl.dllib.nnframes
+
+import com.intel.analytics.bigdl.dllib.keras.ZooSpecHelper
+import com.intel.analytics.bigdl.dllib.utils.{Engine, TestUtils}
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.ml.feature.VectorAssembler
+import org.apache.spark.sql.{SQLContext, SparkSession}
+import com.microsoft.azure.synapse.ml.lightgbm.{LightGBMClassifier => MLightGBMClassifier}
+
+class LightGBMTrainSpec extends ZooSpecHelper {
+  var sc: SparkContext = _
+  var sqlContext: SQLContext = _
+
+  override def doBefore(): Unit = {
+    val conf = new SparkConf().setAppName("Test lightGBM").setMaster("local[1]")
+    sc = SparkContext.getOrCreate(conf)
+    sqlContext = new SQLContext(sc)
+  }
+
+  override def doAfter(): Unit = {
+    if (sc != null) {
+      sc.stop()
+    }
+  }
+
+  "LightGBMClassifier train" should "work" in {
+    val spark = SparkSession.builder().getOrCreate()
+    import spark.implicits._
+    Engine.init
+    val df = Seq(
+      (1.0, 2.0, 3.0, 4.0, 1),
+      (1.0, 3.0, 8.0, 2.0, 0)
+    ).toDF("f1", "f2", "f3", "f4", "label")
+    val vectorAssembler = new VectorAssembler()
+      .setInputCols(Array("f1", "f2", "f3", "f4"))
+      .setOutputCol("features")
+    val assembledDf = vectorAssembler.transform(df).select("features", "label").cache()
+    if (spark.version.substring(0, 3).toDouble >= 3.1) {
+      val lightGBMclassifier = new LightGBMClassifier()
+      val classifier = new MLightGBMClassifier()
+      val model1 = lightGBMclassifier.fit(assembledDf)
+      val model = classifier.fit(assembledDf)
+      val res1 = model1.transform(assembledDf)
+      TestUtils.conditionFailTest(res1.count() == 2)
+    }
+  }
+
+  "LightGBMClassifier save" should "work" in {
+    val spark = SparkSession.builder().getOrCreate()
+    import spark.implicits._
+    Engine.init
+    val df = Seq(
+      (1.0, 2.0, 3.0, 4.0, 1),
+      (1.0, 3.0, 8.0, 2.0, 0)
+    ).toDF("f1", "f2", "f3", "f4", "label")
+    val vectorAssembler = new VectorAssembler()
+      .setInputCols(Array("f1", "f2", "f3", "f4"))
+      .setOutputCol("features")
+    val assembledDf = vectorAssembler.transform(df).select("features", "label").cache()
+    if (spark.version.substring(0, 3).toDouble >= 3.1) {
+      val lightGBMclassifier = new LightGBMClassifier()
+      val model = lightGBMclassifier.fit(assembledDf)
+      model.saveNativeModel("/tmp/lightgbm/classifier1")
+      val model2 = LightGBMClassifierModel.loadNativeModel("/tmp/lightgbm/classifier1")
+      val res2 = model2.transform(assembledDf)
+      TestUtils.conditionFailTest(res2.count() == 2)
+    }
+  }
+
+  "LightGBMRegressor train" should "work" in {
+    val spark = SparkSession.builder().getOrCreate()
+    import spark.implicits._
+    Engine.init
+    val df = Seq(
+      (1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 1.0f, 2.0f, 4.0f, 8.0f, 3.0f, 116.3668f),
+      (1.0f, 3.0f, 8.0f, 6.0f, 5.0f, 9.0f, 5.0f, 6.0f, 7.0f, 4.0f, 116.367f),
+      (2.0f, 1.0f, 5.0f, 7.0f, 6.0f, 7.0f, 4.0f, 1.0f, 2.0f, 3.0f, 116.367f),
+      (2.0f, 1.0f, 4.0f, 3.0f, 6.0f, 1.0f, 3.0f, 2.0f, 1.0f, 3.0f, 116.3668f)
+    ).toDF("f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10", "label")
+    val vectorAssembler = new VectorAssembler()
+      .setInputCols(Array("f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8", "f9", "f10"))
+      .setOutputCol("features")
+    val assembledDf = vectorAssembler.transform(df).select("features", "label").cache()
+    if (spark.version.substring(0, 3).toDouble >= 3.1) {
+      val lightGBMRegressor = new LightGBMRegressor()
+      val regressorModel0 = lightGBMRegressor.fit(assembledDf)
+      val y0 = regressorModel0.transform(assembledDf)
+      regressorModel0.saveNativeModel("/tmp/test")
+      val model = LightGBMRegressorModel.loadNativeModel("/tmp/test")
+      val y0_0 = model.transform(assembledDf)
+      TestUtils.conditionFailTest(y0.count() == 4)
+      TestUtils.conditionFailTest(y0_0.count() == 4)
+    }
+  }
+}
diff --git a/scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/nnframes/XgboostTrainSpec.scala b/scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/nnframes/XgboostTrainSpec.scala
index 5c138ad3d21..ec5c3a62a4e 100644
--- a/scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/nnframes/XgboostTrainSpec.scala
+++ b/scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/nnframes/XgboostTrainSpec.scala
@@ -16,10 +16,10 @@
 
 package com.intel.analytics.bigdl.dllib.nnframes
 
-import com.intel.analytics.bigdl.dllib.utils.Engine
+import com.intel.analytics.bigdl.dllib.utils.{Engine, TestUtils}
 import com.intel.analytics.bigdl.dllib.keras.ZooSpecHelper
 import org.apache.spark.SparkContext
-import org.apache.spark.ml.feature.{VectorAssembler}
+import org.apache.spark.ml.feature.VectorAssembler
 import org.apache.spark.sql.{SQLContext, SparkSession}
 
 class XgboostTrainSpec extends ZooSpecHelper {
@@ -38,7 +38,7 @@
     }
   }
 
-/*
+
   "XGBClassifer train" should "work" in {
     if (!(scala.util.Properties.isMac || scala.util.Properties.isWin)) {
       val spark = SparkSession.builder().getOrCreate()
@@ -57,13 +57,9 @@
       xgbCf0.setNumRound(10)
       xgbCf0.setNthread(1)
       val model = xgbCf0.fit(assembledDf)
-
-      model.setFeaturesCol(Array("f1", "f2", "f3", "f4"))
-      // testdf = df.cache()
-      println("the df is: ")
-      df.show()
-      val res = model.transform(df)
-      print(res)
+      model.setFeaturesCol("features")
+      model.save("/tmp/xgboost")
+      val res = model.transform(assembledDf)
       res.show()
     }
   }
@@ -90,9 +86,9 @@
       xgbRegressorModel0.save("/tmp/test")
       val model = XGBRegressorModel.load("/tmp/test")
       val y0_0 = model.transform(assembledDf)
-      TestUtils.conditionFailTest(y0_0.except(y0).count()==0)
+      y0_0.show()
+      TestUtils.conditionFailTest(y0_0.except(y0).count() == 0)
     }
   }
 
-*/
 }
diff --git a/scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/parameters/FP16ParameterSpec.scala b/scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/optim/parameters/FP16ParameterSpec.scala
similarity index 100%
rename from scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/parameters/FP16ParameterSpec.scala
rename to scala/dllib/src/test/scala/com/intel/analytics/bigdl/dllib/optim/parameters/FP16ParameterSpec.scala