v1.1.0 - added 6 optimised algorithms (#18)
Add KMeans, DecisionTree, LinearRegression, LogisticRegression, PCA, and SVD test cases
J493339298 authored Dec 24, 2020
1 parent 92288c1 commit 36e654a
Showing 58 changed files with 7,040 additions and 130 deletions.
55 changes: 50 additions & 5 deletions ml-accelerator/pom.xml
@@ -2,40 +2,49 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>sophon-ml</artifactId>
<version>1.0.0</version>
<version>1.1.0</version>
</parent>

<modelVersion>4.0.0</modelVersion>
<artifactId>sophon-ml-acc_2.11</artifactId>
<version>1.0.0</version>
<version>1.1.0</version>
<name>${project.artifactId}</name>
<description>Spark ml algo accelerator</description>

<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>sophon-ml-core_2.11</artifactId>
<version>1.0.0</version>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>sophon-ml-kernel_2.11</artifactId>
<version>${project.version}</version>
<classifier>${os.detected.arch}</classifier>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>sophon-ml-kernel-client_2.11</artifactId>
<version>1.0.0</version>
<version>${project.version}</version>
<scope>compile</scope>
</dependency>
</dependencies>

<build>
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<plugin>
<!-- see http://davidb.github.com/scala-maven-plugin -->
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.0</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<args>
@@ -46,6 +55,42 @@
</execution>
</executions>
</plugin>
<!-- ScalaTest plugin for unit tests -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.6</version>
<configuration>
<skipTests>true</skipTests>
</configuration>
</plugin>
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
<version>1.0</version>
<configuration>
<reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
<junitxml>.</junitxml>
<filereports>TestSuite.txt</filereports>
</configuration>
<executions>
<execution>
<id>test</id>
<goals>
<goal>test</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>

<extensions>
<extension>
<groupId>kr.motd.maven</groupId>
<artifactId>os-maven-plugin</artifactId>
<version>1.6.1</version>
</extension>
</extensions>

</build>
</project>
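
With surefire skipped and ScalaTest bound to the test phase, any suite under src/test/scala gets picked up and reported into target/surefire-reports. For reference, a minimal suite the plugin would discover might look like the sketch below; the suite name and assertion are illustrative, not part of this commit.

import org.scalatest.FunSuite

// Illustrative suite: the scalatest-maven-plugin execution above discovers
// any ScalaTest class under src/test/scala and writes JUnit-style XML plus
// TestSuite.txt into target/surefire-reports.
class ExampleSuite extends FunSuite {
  test("breeze dense vector dot product") {
    import breeze.linalg.DenseVector
    val x = DenseVector(1.0, 2.0)
    assert((x dot x) === 5.0)
  }
}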
@@ -1,34 +1,14 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


// scalastyle:off header.matches
package breeze.optimize

import java.util.Date
import scala.language.implicitConversions

import FirstOrderMinimizerX.ConvergenceCheck
import breeze.linalg.norm
import breeze.math.{MutableEnumeratedCoordinateField, MutableFiniteCoordinateField, MutableInnerProductModule, NormedModule}
import breeze.stats.distributions.{RandBasis, ThreadLocalRandomGenerator}
import breeze.math.{MutableInnerProductModule, NormedModule}
import breeze.util.Implicits._
import breeze.util.SerializableLogging



/**
*
* @author dlwh
34 changes: 15 additions & 19 deletions ml-accelerator/src/main/scala/breeze/optimize/LBFGSX.scala
@@ -1,19 +1,4 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// scalastyle:off header.matches
package breeze.optimize

/*
@@ -33,12 +18,23 @@ package breeze.optimize
*/

import breeze.linalg._
import breeze.linalg.operators.OpMulMatrix
import breeze.math.MutableInnerProductModule
import breeze.optimize.FirstOrderMinimizerX.{ConvergenceCheck, ConvergenceReason}
import breeze.optimize.linear.PowerMethod
import breeze.optimize.FirstOrderMinimizerX.ConvergenceCheck
import breeze.util.SerializableLogging

/**
* Port of LBFGS to Scala.
*
* Special note for LBFGS:
* If you use it in published work, you must cite one of:
* * J. Nocedal. Updating Quasi-Newton Matrices with Limited Storage
* (1980), Mathematics of Computation 35, pp. 773-782.
* * D.C. Liu and J. Nocedal. On the Limited Memory BFGS Method for Large
* Scale Optimization (1989), Mathematical Programming B, 45, 3,
* pp. 503-528.
*
* @param m: The memory of the search. 3 to 7 is usually sufficient.
*/
class LBFGSX[T](convergenceCheck: ConvergenceCheck[T], m: Int)
(implicit space: MutableInnerProductModule[T, Double]) extends
FirstOrderMinimizerX[T, DiffFunction[T]](convergenceCheck) with SerializableLogging {
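
As a usage sketch for LBFGSX: the (maxIter, m, tolerance) convenience constructor is taken on trust from the OWLQNX subclass in the next file, and minimize is assumed to be inherited from FirstOrderMinimizerX the way Breeze's LBFGS inherits it; the quadratic objective is illustrative.

import breeze.linalg.DenseVector
import breeze.optimize.{DiffFunction, LBFGSX}

// Illustrative objective: f(x) = ||x - 3||^2 with gradient 2(x - 3),
// minimised at (3, 3, 3).
val f = new DiffFunction[DenseVector[Double]] {
  def calculate(x: DenseVector[Double]): (Double, DenseVector[Double]) = {
    val d = x - 3.0
    (d dot d, d * 2.0)
  }
}
val lbfgs = new LBFGSX[DenseVector[Double]](100, 7, tolerance = 1e-9)
val xOpt = lbfgs.minimize(f, DenseVector.zeros[Double](3))
// xOpt should approach DenseVector(3.0, 3.0, 3.0)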
27 changes: 9 additions & 18 deletions ml-accelerator/src/main/scala/breeze/optimize/OWLQNX.scala
@@ -1,27 +1,18 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// scalastyle:off header.matches
package breeze.optimize

import breeze.math._
import breeze.numerics._
import breeze.util._


/**
* Implements the Orthant-wise Limited Memory Quasi-Newton method,
* which is a variant of LBFGS that handles L1 regularization.
*
* Paper is Andrew and Gao (2007) Scalable Training of L1-Regularized Log-Linear Models
*
* @author dlwh
*/
class OWLQNX[K, T](maxIter: Int, m: Int, l1reg: K => Double, tolerance: Double)
(implicit space: MutableEnumeratedCoordinateField[T, K, Double])
extends LBFGSX[T](maxIter, m, tolerance = tolerance) with SerializableLogging {
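
A hypothetical driver for OWLQNX, assuming it inherits minimize from FirstOrderMinimizerX the way Breeze's OWLQN does from FirstOrderMinimizer; the quadratic objective and the flat per-coordinate L1 weight of 0.5 are illustrative.

import breeze.linalg.DenseVector
import breeze.optimize.{DiffFunction, OWLQNX}

// Illustrative objective: the smooth part is ||x - 3||^2; OWLQNX adds the
// L1 term itself via l1reg, here a flat weight of 0.5 on every coordinate
// (K = Int indexes the coordinates of the DenseVector).
val smooth = new DiffFunction[DenseVector[Double]] {
  def calculate(x: DenseVector[Double]): (Double, DenseVector[Double]) = {
    val d = x - 3.0
    (d dot d, d * 2.0)
  }
}
val owlqn = new OWLQNX[Int, DenseVector[Double]](100, 7, (_: Int) => 0.5, 1e-9)
val xOpt = owlqn.minimize(smooth, DenseVector.zeros[Double](3))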
@@ -0,0 +1,154 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.ml.classification

import org.apache.spark.annotation.Since
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tree._
import org.apache.spark.ml.tree.impl.DecisionForest
import org.apache.spark.ml.util._
import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo, Strategy => OldStrategy}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Dataset


/**
* Decision tree learning algorithm (http://en.wikipedia.org/wiki/Decision_tree_learning)
* for classification.
* It supports both binary and multiclass labels, as well as both continuous and categorical
* features.
*/
@Since("1.4.0")
class DecisionTreeClassifier @Since("1.4.0") (
@Since("1.4.0") override val uid: String)
extends ProbabilisticClassifier[Vector, DecisionTreeClassifier, DecisionTreeClassificationModel]
with DecisionTreeClassifierParams with DefaultParamsWritable {

@Since("1.4.0")
def this() = this(Identifiable.randomUID("dtc"))

// Override parameter setters from parent trait for Java API compatibility.

/** @group setParam */
@Since("1.4.0")
override def setMaxDepth(value: Int): this.type = set(maxDepth, value)

/** @group setParam */
@Since("1.4.0")
override def setMaxBins(value: Int): this.type = set(maxBins, value)

/** @group setParam */
@Since("1.4.0")
override def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value)

/** @group setParam */
@Since("1.4.0")
override def setMinInfoGain(value: Double): this.type = set(minInfoGain, value)

/** @group expertSetParam */
@Since("1.4.0")
override def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value)

/** @group expertSetParam */
@Since("1.4.0")
override def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value)

/**
* Specifies how often to checkpoint the cached node IDs.
* E.g. 10 means that the cache will get checkpointed every 10 iterations.
* This is only used if cacheNodeIds is true and if the checkpoint directory is set in
* [[org.apache.spark.SparkContext]].
* Must be at least 1.
* (default = 10)
* @group setParam
*/
@Since("1.4.0")
override def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value)

/** @group setParam */
@Since("1.4.0")
override def setImpurity(value: String): this.type = set(impurity, value)

/** @group setParam */
@Since("1.6.0")
override def setSeed(value: Long): this.type = set(seed, value)

override protected def train(dataset: Dataset[_]): DecisionTreeClassificationModel = {
val categoricalFeatures: Map[Int, Int] =
MetadataUtils.getCategoricalFeatures(dataset.schema($(featuresCol)))
val numClasses: Int = getNumClasses(dataset)

if (isDefined(thresholds)) {
require($(thresholds).length == numClasses, this.getClass.getSimpleName +
".train() called with non-matching numClasses and thresholds.length." +
s" numClasses=$numClasses, but thresholds has length ${$(thresholds).length}")
}

val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset, numClasses)
val strategy = getOldStrategy(categoricalFeatures, numClasses)

val instr = Instrumentation.create(this, oldDataset)
instr.logParams(params: _*)

val trees = DecisionForest.run(oldDataset, strategy, numTrees = 1,
featureSubsetStrategy = "all",
seed = $(seed), instr = Some(instr), parentUID = Some(uid))

val m = trees.head.asInstanceOf[DecisionTreeClassificationModel]
instr.logSuccess(m)
m
}

/** (private[ml]) Train a decision tree on an RDD */
private[ml] def train(data: RDD[LabeledPoint],
oldStrategy: OldStrategy): DecisionTreeClassificationModel = {
val instr = Instrumentation.create(this, data)
instr.logParams(params: _*)

val trees = DecisionForest.run(data, oldStrategy, numTrees = 1,
featureSubsetStrategy = "all",
seed = 0L, instr = Some(instr), parentUID = Some(uid))

val m = trees.head.asInstanceOf[DecisionTreeClassificationModel]
instr.logSuccess(m)
m
}

/** (private[ml]) Create a Strategy instance to use with the old API. */
private[ml] def getOldStrategy(
categoricalFeatures: Map[Int, Int],
numClasses: Int): OldStrategy = {
super.getOldStrategy(categoricalFeatures, numClasses, OldAlgo.Classification, getOldImpurity,
subsamplingRate = 1.0)
}

@Since("1.4.1")
override def copy(extra: ParamMap): DecisionTreeClassifier = defaultCopy(extra)
}

@Since("1.4.0")
object DecisionTreeClassifier extends DefaultParamsReadable[DecisionTreeClassifier] {
/** Accessor for supported impurities: entropy, gini */
@Since("1.4.0")
final val supportedImpurities: Array[String] = TreeClassifierParams.supportedImpurities

@Since("2.0.0")
override def load(path: String): DecisionTreeClassifier = super.load(path)
}
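
Since this class keeps the public Spark ML parameter surface (the setters and @Since tags above), standard pipeline code should drive it unchanged; the data path and hyperparameters in this sketch are illustrative, not from the commit.

import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("dtc-sketch").getOrCreate()
// Illustrative path; any DataFrame with "label" and "features" columns works.
val data = spark.read.format("libsvm").load("data/sample_libsvm_data.txt")
val Array(train, test) = data.randomSplit(Array(0.8, 0.2), seed = 42L)
val model = new DecisionTreeClassifier()
  .setMaxDepth(5)
  .setImpurity("gini")
  .setSeed(42L)
  .fit(train)
model.transform(test).select("label", "prediction").show(5)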
8 changes: 3 additions & 5 deletions ml-accelerator/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
100755 → 100644
@@ -19,18 +19,16 @@ package org.apache.spark.ml.classification

import scala.collection.mutable

import breeze.linalg.{norm, DenseVector => BDV}
import breeze.linalg.{DenseVector => BDV}
import breeze.optimize.{CachedDiffFunction, OWLQNX => BreezeOWLQN}

import org.apache.spark.SparkException
import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.optim.aggregator.{HingeAggregator, HingeAggregatorX}
import org.apache.spark.ml.optim.loss.{L2Regularization, RDDLossFunction, RDDLossFunctionX}
import org.apache.spark.ml.optim.aggregator.HingeAggregatorX
import org.apache.spark.ml.optim.loss.{L2Regularization, RDDLossFunctionX}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util._
import org.apache.spark.mllib.linalg.VectorImplicits._
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
