Skip to content

Commit

Permalink
[ML-53] [CPU] Add Linear & Ridge Regression (#75)
Browse files Browse the repository at this point in the history
* linear regression using DAL

* Coding done

* linear regression trainwithDAL pass

* Remove naive bayes files

* fix conflicts

* Fix crash due to DAL LiR return n+1 weights columns (w0...wn)

* Add linear regression example

* Add condition to enable oap and switch linear & ridge regressions
Add ridgeParameter as input
Code cleanup

* nit

* nit

* move LinearRegression.scala to fit in new dir structure

* Add 3.0.x and 3.1.1 support

* turn spark.oap.mllib.enabled true by default, ignore tests not passed, will be fixed later

Co-authored-by: Jiang, Bo <[email protected]>
  • Loading branch information
xwu99 and bobjiang82 authored Jun 22, 2021
1 parent 01d5278 commit 279980e
Show file tree
Hide file tree
Showing 19 changed files with 6,804 additions and 9 deletions.
501 changes: 501 additions & 0 deletions examples/data/sample_linear_regression_data.txt

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions examples/linear-regression/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
#!/usr/bin/env bash

mvn clean package
94 changes: 94 additions & 0 deletions examples/linear-regression/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>

<groupId>com.intel.oap</groupId>
<artifactId>oap-mllib-examples</artifactId>
<version>${oap.version}-with-spark-${spark.version}</version>
<packaging>jar</packaging>

<name>LinearRegressionExample</name>
<url>https://github.com/oap-project/oap-mllib.git</url>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<oap.version>1.1.0</oap.version>
<scala.version>2.12.10</scala.version>
<scala.binary.version>2.12</scala.binary.version>
<spark.version>3.0.0</spark.version>
</properties>

<dependencies>

<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.12.10</version>
</dependency>

<dependency>
<groupId>com.github.scopt</groupId>
<artifactId>scopt_2.12</artifactId>
<version>3.7.0</version>
</dependency>

<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.12</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>

<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.12</artifactId>
<version>${spark.version}</version>
<scope>provided</scope>
</dependency>

</dependencies>

<build>
<plugins>
<plugin>
<groupId>org.scala-tools</groupId>
<artifactId>maven-scala-plugin</artifactId>
<version>2.15.2</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
<goal>testCompile</goal>
</goals>
</execution>
</executions>
<configuration>
<scalaVersion>${scala.version}</scalaVersion>
<args>
<arg>-target:jvm-1.8</arg>
</args>
</configuration>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.0.0</version>
<configuration>
<appendAssemblyId>false</appendAssemblyId>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>

</project>
28 changes: 28 additions & 0 deletions examples/linear-regression/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/usr/bin/env bash

source ../../conf/env.sh

APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION-with-spark-3.0.0.jar
APP_CLASS=org.apache.spark.examples.ml.LinearRegressionExample
DATA_FILE=data/sample_linear_regression_data.txt

OAP_MLLIB_ENABLED=true

time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
--num-executors $SPARK_NUM_EXECUTORS \
--driver-memory $SPARK_DRIVER_MEMORY \
--executor-cores $SPARK_EXECUTOR_CORES \
--executor-memory $SPARK_EXECUTOR_MEMORY \
--conf "spark.oap.mllib.enabled=$OAP_MLLIB_ENABLED" \
--conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
--conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
--conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
--conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
--conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
--conf "spark.shuffle.reduceLocality.enabled=false" \
--conf "spark.network.timeout=1200s" \
--conf "spark.task.maxFailures=1" \
--jars $OAP_MLLIB_JAR \
--class $APP_CLASS \
$APP_JAR $DATA_FILE \
2>&1 | tee LinearRegression-$(date +%m%d_%H_%M_%S).log
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

// scalastyle:off println
package org.apache.spark.examples.ml

import scopt.OptionParser

import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.sql.{DataFrame, SparkSession}

/**
* An example runner for linear regression with elastic-net (mixing L1/L2) regularization.
* Run with
* {{{
* bin/run-example ml.LinearRegressionExample [options]
* }}}
* A synthetic dataset can be found at `data/mllib/sample_linear_regression_data.txt` which can be
* trained by
* {{{
* bin/run-example ml.LinearRegressionExample --regParam 0.15 --elasticNetParam 1.0 \
* data/mllib/sample_linear_regression_data.txt
* }}}
* If you use it as a template to create your own app, please use `spark-submit` to submit your app.
*/
object LinearRegressionExample {

case class Params(
input: String = null,
testInput: String = "",
dataFormat: String = "libsvm",
regParam: Double = 0.0,
elasticNetParam: Double = 0.0,
maxIter: Int = 100,
tol: Double = 1E-6,
fracTest: Double = 0.2)

def main(args: Array[String]): Unit = {
val defaultParams = Params()

val parser = new OptionParser[Params]("LinearRegressionExample") {
head("LinearRegressionExample: an example Linear Regression with Elastic-Net app.")
opt[Double]("regParam")
.text(s"regularization parameter, default: ${defaultParams.regParam}")
.action((x, c) => c.copy(regParam = x))
opt[Double]("elasticNetParam")
.text(s"ElasticNet mixing parameter. For alpha = 0, the penalty is an L2 penalty. " +
s"For alpha = 1, it is an L1 penalty. For 0 < alpha < 1, the penalty is a combination of " +
s"L1 and L2, default: ${defaultParams.elasticNetParam}")
.action((x, c) => c.copy(elasticNetParam = x))
opt[Int]("maxIter")
.text(s"maximum number of iterations, default: ${defaultParams.maxIter}")
.action((x, c) => c.copy(maxIter = x))
opt[Double]("tol")
.text(s"the convergence tolerance of iterations, Smaller value will lead " +
s"to higher accuracy with the cost of more iterations, default: ${defaultParams.tol}")
.action((x, c) => c.copy(tol = x))
opt[Double]("fracTest")
.text(s"fraction of data to hold out for testing. If given option testInput, " +
s"this option is ignored. default: ${defaultParams.fracTest}")
.action((x, c) => c.copy(fracTest = x))
opt[String]("testInput")
.text(s"input path to test dataset. If given, option fracTest is ignored." +
s" default: ${defaultParams.testInput}")
.action((x, c) => c.copy(testInput = x))
opt[String]("dataFormat")
.text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
.action((x, c) => c.copy(dataFormat = x))
arg[String]("<input>")
.text("input path to labeled examples")
.required()
.action((x, c) => c.copy(input = x))
checkConfig { params =>
if (params.fracTest < 0 || params.fracTest >= 1) {
failure(s"fracTest ${params.fracTest} value incorrect; should be in [0,1).")
} else {
success
}
}
}

parser.parse(args, defaultParams) match {
case Some(params) => run(params)
case _ => sys.exit(1)
}
}

def run(params: Params): Unit = {
val spark = SparkSession
.builder
.appName(s"LinearRegressionExample with $params")
.getOrCreate()

println(s"LinearRegressionExample with parameters:\n$params")

val training = spark.read.format("libsvm")
.load(params.input)

val lir = new LinearRegression()
.setFeaturesCol("features")
.setLabelCol("label")
.setRegParam(params.regParam)
.setElasticNetParam(params.elasticNetParam)
.setMaxIter(params.maxIter)
.setTol(params.tol)

// Train the model
val startTime = System.nanoTime()
val lirModel = lir.fit(training)
val elapsedTime = (System.nanoTime() - startTime) / 1e9
println(s"Training time: $elapsedTime seconds")

// Print the coefficients and intercept for linear regression
println(s"Coefficients: ${lirModel.coefficients} Intercept: ${lirModel.intercept}")

// Summarize the model over the training set and print out some metrics
val trainingSummary = lirModel.summary
println(s"numIterations: ${trainingSummary.totalIterations}")
println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]")
trainingSummary.residuals.show()
println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
println(s"r2: ${trainingSummary.r2}")

spark.stop()
}
}
// scalastyle:on println
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
/*******************************************************************************
* Copyright 2020 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

package org.apache.spark.ml.regression;

public class LiRResult {
public long coeffNumericTable; // first element of coeff is actually intercept
}
Loading

0 comments on commit 279980e

Please sign in to comment.