v1.1.0 - added 6 optimised algorithms (#18)
Add KMeans, DecisionTree, LinearRegression, LogisticRegression, PCA, and SVD test cases
J493339298 authored Dec 24, 2020
1 parent 92288c1 commit 36e654a
Showing 58 changed files with 7,040 additions and 130 deletions.
55 changes: 50 additions & 5 deletions ml-accelerator/pom.xml
@@ -2,40 +2,49 @@
<parent>
<groupId>org.apache.spark</groupId>
<artifactId>sophon-ml</artifactId>
<version>1.0.0</version>
<version>1.1.0</version>
</parent>

<modelVersion>4.0.0</modelVersion>
<artifactId>sophon-ml-acc_2.11</artifactId>
<version>1.0.0</version>
<version>1.1.0</version>
<name>${project.artifactId}</name>
<description>Spark ml algo accelerator</description>

<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>sophon-ml-core_2.11</artifactId>
<version>1.0.0</version>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>sophon-ml-kernel_2.11</artifactId>
<version>${project.version}</version>
<classifier>${os.detected.arch}</classifier>
<scope>test</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>sophon-ml-kernel-client_2.11</artifactId>
<version>1.0.0</version>
<version>${project.version}</version>
<scope>compile</scope>
</dependency>
</dependencies>

<build>
<sourceDirectory>src/main/scala</sourceDirectory>
<testSourceDirectory>src/test/scala</testSourceDirectory>
<plugins>
<plugin>
<!-- see http://davidb.github.com/scala-maven-plugin -->
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>3.2.0</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
<configuration>
<args>
@@ -46,6 +55,42 @@
</execution>
</executions>
</plugin>
<!-- ScalaTest plugin for unit tests -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.6</version>
<configuration>
<skipTests>true</skipTests>
</configuration>
</plugin>
<plugin>
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
<version>1.0</version>
<configuration>
<reportsDirectory>${project.build.directory}/surefire-reports</reportsDirectory>
<junitxml>.</junitxml>
<filereports>TestSuite.txt</filereports>
</configuration>
<executions>
<execution>
<id>test</id>
<goals>
<goal>test</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>

<extensions>
<extension>
<groupId>kr.motd.maven</groupId>
<artifactId>os-maven-plugin</artifactId>
<version>1.6.1</version>
</extension>
</extensions>

</build>
</project>
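
With surefire skipped and ScalaTest bound to the test phase, any suite under src/test/scala gets picked up and reported into target/surefire-reports. For reference, a minimal suite the plugin would discover might look like the sketch below; the suite name and assertion are illustrative, not part of this commit.

import org.scalatest.FunSuite

// Illustrative suite: the scalatest-maven-plugin execution above discovers
// any ScalaTest class under src/test/scala and writes JUnit-style XML plus
// TestSuite.txt into target/surefire-reports.
class ExampleSuite extends FunSuite {
  test("breeze dense vector dot product") {
    import breeze.linalg.DenseVector
    val x = DenseVector(1.0, 2.0)
    assert((x dot x) === 5.0)
  }
}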
@@ -1,34 +1,14 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/


// scalastyle:off header.matches
package breeze.optimize

import java.util.Date
import scala.language.implicitConversions

import FirstOrderMinimizerX.ConvergenceCheck
import breeze.linalg.norm
import breeze.math.{MutableEnumeratedCoordinateField, MutableFiniteCoordinateField, MutableInnerProductModule, NormedModule}
import breeze.stats.distributions.{RandBasis, ThreadLocalRandomGenerator}
import breeze.math.{MutableInnerProductModule, NormedModule}
import breeze.util.Implicits._
import breeze.util.SerializableLogging



/**
*
* @author dlwh
34 changes: 15 additions & 19 deletions ml-accelerator/src/main/scala/breeze/optimize/LBFGSX.scala
@@ -1,19 +1,4 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// scalastyle:off header.matches
package breeze.optimize

/*
@@ -33,12 +18,23 @@ package breeze.optimize
*/

import breeze.linalg._
import breeze.linalg.operators.OpMulMatrix
import breeze.math.MutableInnerProductModule
import breeze.optimize.FirstOrderMinimizerX.{ConvergenceCheck, ConvergenceReason}
import breeze.optimize.linear.PowerMethod
import breeze.optimize.FirstOrderMinimizerX.ConvergenceCheck
import breeze.util.SerializableLogging

/**
* Port of LBFGS to Scala.
*
* Special note for LBFGS:
* If you use it in published work, you must cite one of:
* * J. Nocedal. Updating Quasi-Newton Matrices with Limited Storage
* (1980), Mathematics of Computation 35, pp. 773-782.
* * D.C. Liu and J. Nocedal. On the Limited Memory BFGS Method for Large
* Scale Optimization (1989), Mathematical Programming B, 45, 3,
* pp. 503-528.
*
* @param m: The memory of the search. 3 to 7 is usually sufficient.
*/
class LBFGSX[T](convergenceCheck: ConvergenceCheck[T], m: Int)
(implicit space: MutableInnerProductModule[T, Double]) extends
FirstOrderMinimizerX[T, DiffFunction[T]](convergenceCheck) with SerializableLogging {
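
As a usage sketch for LBFGSX: the (maxIter, m, tolerance) convenience constructor is taken on trust from the OWLQNX subclass in the next file, and minimize is assumed to be inherited from FirstOrderMinimizerX the way Breeze's LBFGS inherits it; the quadratic objective is illustrative.

import breeze.linalg.DenseVector
import breeze.optimize.{DiffFunction, LBFGSX}

// Illustrative objective: f(x) = ||x - 3||^2 with gradient 2(x - 3),
// minimised at (3, 3, 3).
val f = new DiffFunction[DenseVector[Double]] {
  def calculate(x: DenseVector[Double]): (Double, DenseVector[Double]) = {
    val d = x - 3.0
    (d dot d, d * 2.0)
  }
}
val lbfgs = new LBFGSX[DenseVector[Double]](100, 7, tolerance = 1e-9)
val xOpt = lbfgs.minimize(f, DenseVector.zeros[Double](3))
// xOpt should approach DenseVector(3.0, 3.0, 3.0)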
27 changes: 9 additions & 18 deletions ml-accelerator/src/main/scala/breeze/optimize/OWLQNX.scala
@@ -1,27 +1,18 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// scalastyle:off header.matches
package breeze.optimize

import breeze.math._
import breeze.numerics._
import breeze.util._


/**
* Implements the Orthant-wise Limited Memory Quasi-Newton method,
* which is a variant of LBFGS that handles L1 regularization.
*
* Paper is Andrew and Gao (2007) Scalable Training of L1-Regularized Log-Linear Models
*
* @author dlwh
*/
class OWLQNX[K, T](maxIter: Int, m: Int, l1reg: K => Double, tolerance: Double)
(implicit space: MutableEnumeratedCoordinateField[T, K, Double])
extends LBFGSX[T](maxIter, m, tolerance = tolerance) with SerializableLogging {
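
A hypothetical driver for OWLQNX, assuming it inherits minimize from FirstOrderMinimizerX the way Breeze's OWLQN does from FirstOrderMinimizer; the quadratic objective and the flat per-coordinate L1 weight of 0.5 are illustrative.

import breeze.linalg.DenseVector
import breeze.optimize.{DiffFunction, OWLQNX}

// Illustrative objective: the smooth part is ||x - 3||^2; OWLQNX adds the
// L1 term itself via l1reg, here a flat weight of 0.5 on every coordinate
// (K = Int indexes the coordinates of the DenseVector).
val smooth = new DiffFunction[DenseVector[Double]] {
  def calculate(x: DenseVector[Double]): (Double, DenseVector[Double]) = {
    val d = x - 3.0
    (d dot d, d * 2.0)
  }
}
val owlqn = new OWLQNX[Int, DenseVector[Double]](100, 7, (_: Int) => 0.5, 1e-9)
val xOpt = owlqn.minimize(smooth, DenseVector.zeros[Double](3))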
@@ -0,0 +1,154 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.ml.classification

import org.apache.spark.annotation.Since
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.ml.tree._
import org.apache.spark.ml.tree.impl.DecisionForest
import org.apache.spark.ml.util._
import org.apache.spark.mllib.tree.configuration.{Algo => OldAlgo, Strategy => OldStrategy}
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Dataset


/**
* Decision tree learning algorithm (http://en.wikipedia.org/wiki/Decision_tree_learning)
* for classification.
* It supports both binary and multiclass labels, as well as both continuous and categorical
* features.
*/
@Since("1.4.0")
class DecisionTreeClassifier @Since("1.4.0") (
@Since("1.4.0") override val uid: String)
extends ProbabilisticClassifier[Vector, DecisionTreeClassifier, DecisionTreeClassificationModel]
with DecisionTreeClassifierParams with DefaultParamsWritable {

@Since("1.4.0")
def this() = this(Identifiable.randomUID("dtc"))

// Override parameter setters from parent trait for Java API compatibility.

/** @group setParam */
@Since("1.4.0")
override def setMaxDepth(value: Int): this.type = set(maxDepth, value)

/** @group setParam */
@Since("1.4.0")
override def setMaxBins(value: Int): this.type = set(maxBins, value)

/** @group setParam */
@Since("1.4.0")
override def setMinInstancesPerNode(value: Int): this.type = set(minInstancesPerNode, value)

/** @group setParam */
@Since("1.4.0")
override def setMinInfoGain(value: Double): this.type = set(minInfoGain, value)

/** @group expertSetParam */
@Since("1.4.0")
override def setMaxMemoryInMB(value: Int): this.type = set(maxMemoryInMB, value)

/** @group expertSetParam */
@Since("1.4.0")
override def setCacheNodeIds(value: Boolean): this.type = set(cacheNodeIds, value)

/**
* Specifies how often to checkpoint the cached node IDs.
* E.g. 10 means that the cache will get checkpointed every 10 iterations.
* This is only used if cacheNodeIds is true and if the checkpoint directory is set in
* [[org.apache.spark.SparkContext]].
* Must be at least 1.
* (default = 10)
* @group setParam
*/
@Since("1.4.0")
override def setCheckpointInterval(value: Int): this.type = set(checkpointInterval, value)

/** @group setParam */
@Since("1.4.0")
override def setImpurity(value: String): this.type = set(impurity, value)

/** @group setParam */
@Since("1.6.0")
override def setSeed(value: Long): this.type = set(seed, value)

override protected def train(dataset: Dataset[_]): DecisionTreeClassificationModel = {
val categoricalFeatures: Map[Int, Int] =
MetadataUtils.getCategoricalFeatures(dataset.schema($(featuresCol)))
val numClasses: Int = getNumClasses(dataset)

if (isDefined(thresholds)) {
require($(thresholds).length == numClasses, this.getClass.getSimpleName +
".train() called with non-matching numClasses and thresholds.length." +
s" numClasses=$numClasses, but thresholds has length ${$(thresholds).length}")
}

val oldDataset: RDD[LabeledPoint] = extractLabeledPoints(dataset, numClasses)
val strategy = getOldStrategy(categoricalFeatures, numClasses)

val instr = Instrumentation.create(this, oldDataset)
instr.logParams(params: _*)

val trees = DecisionForest.run(oldDataset, strategy, numTrees = 1,
featureSubsetStrategy = "all",
seed = $(seed), instr = Some(instr), parentUID = Some(uid))

val m = trees.head.asInstanceOf[DecisionTreeClassificationModel]
instr.logSuccess(m)
m
}

/** (private[ml]) Train a decision tree on an RDD */
private[ml] def train(data: RDD[LabeledPoint],
oldStrategy: OldStrategy): DecisionTreeClassificationModel = {
val instr = Instrumentation.create(this, data)
instr.logParams(params: _*)

val trees = DecisionForest.run(data, oldStrategy, numTrees = 1,
featureSubsetStrategy = "all",
seed = 0L, instr = Some(instr), parentUID = Some(uid))

val m = trees.head.asInstanceOf[DecisionTreeClassificationModel]
instr.logSuccess(m)
m
}

/** (private[ml]) Create a Strategy instance to use with the old API. */
private[ml] def getOldStrategy(
categoricalFeatures: Map[Int, Int],
numClasses: Int): OldStrategy = {
super.getOldStrategy(categoricalFeatures, numClasses, OldAlgo.Classification, getOldImpurity,
subsamplingRate = 1.0)
}

@Since("1.4.1")
override def copy(extra: ParamMap): DecisionTreeClassifier = defaultCopy(extra)
}

@Since("1.4.0")
object DecisionTreeClassifier extends DefaultParamsReadable[DecisionTreeClassifier] {
/** Accessor for supported impurities: entropy, gini */
@Since("1.4.0")
final val supportedImpurities: Array[String] = TreeClassifierParams.supportedImpurities

@Since("2.0.0")
override def load(path: String): DecisionTreeClassifier = super.load(path)
}
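
Since this class keeps the public Spark ML parameter surface (the setters and @Since tags above), standard pipeline code should drive it unchanged; the data path and hyperparameters in this sketch are illustrative, not from the commit.

import org.apache.spark.ml.classification.DecisionTreeClassifier
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().appName("dtc-sketch").getOrCreate()
// Illustrative path; any DataFrame with "label" and "features" columns works.
val data = spark.read.format("libsvm").load("data/sample_libsvm_data.txt")
val Array(train, test) = data.randomSplit(Array(0.8, 0.2), seed = 42L)
val model = new DecisionTreeClassifier()
  .setMaxDepth(5)
  .setImpurity("gini")
  .setSeed(42L)
  .fit(train)
model.transform(test).select("label", "prediction").show(5)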
8 changes: 3 additions & 5 deletions ml-accelerator/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala
100755 → 100644
@@ -19,18 +19,16 @@ package org.apache.spark.ml.classification

import scala.collection.mutable

import breeze.linalg.{norm, DenseVector => BDV}
import breeze.linalg.{DenseVector => BDV}
import breeze.optimize.{CachedDiffFunction, OWLQNX => BreezeOWLQN}

import org.apache.spark.SparkException
import org.apache.spark.annotation.{Experimental, Since}
import org.apache.spark.internal.Logging
import org.apache.spark.ml.feature.Instance
import org.apache.spark.ml.linalg._
import org.apache.spark.ml.optim.aggregator.{HingeAggregator, HingeAggregatorX}
import org.apache.spark.ml.optim.loss.{L2Regularization, RDDLossFunction, RDDLossFunctionX}
import org.apache.spark.ml.optim.aggregator.HingeAggregatorX
import org.apache.spark.ml.optim.loss.{L2Regularization, RDDLossFunctionX}
import org.apache.spark.ml.param._
import org.apache.spark.ml.param.shared._
import org.apache.spark.ml.util._
import org.apache.spark.mllib.linalg.VectorImplicits._
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
