[ML-53] [CPU] Add Linear & Ridge Regression (#75)

* linear regression using DAL * Coding done * linear regression trainwithDAL pass * Remove naive bayes files * fix conflicts * Fix crash due to DAL LiR return n+1 weights columns (w0...wn) * Add linear regression example * Add condition to enable oap and switch linear & ridge regressions Add ridgeParameter as input Code cleanup * nit * nit * move LinearRegression.scala to fit in new dir structure * Add 3.0.x and 3.1.1 support * turn spark.oap.mllib.enabled true by default, ignore tests not passed, will be fixed later Co-authored-by: Jiang, Bo <[email protected]>
oap-project · Jun 22, 2021 · 279980e · 279980e
1 parent 01d5278
commit 279980e
Show file tree

Hide file tree

Showing 19 changed files with 6,804 additions and 9 deletions.
diff --git a/examples/data/sample_linear_regression_data.txt b/examples/data/sample_linear_regression_data.txt
diff --git a/examples/linear-regression/build.sh b/examples/linear-regression/build.sh
@@ -0,0 +1,3 @@
+#!/usr/bin/env bash
+
+mvn clean package
diff --git a/examples/linear-regression/pom.xml b/examples/linear-regression/pom.xml
@@ -0,0 +1,94 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" 
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <groupId>com.intel.oap</groupId>
+  <artifactId>oap-mllib-examples</artifactId>
+  <version>${oap.version}-with-spark-${spark.version}</version>  
+  <packaging>jar</packaging>
+
+  <name>LinearRegressionExample</name>
+  <url>https://github.com/oap-project/oap-mllib.git</url>
+
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    <oap.version>1.1.0</oap.version>
+    <scala.version>2.12.10</scala.version>
+    <scala.binary.version>2.12</scala.binary.version>
+    <spark.version>3.0.0</spark.version>
+  </properties>
+
+  <dependencies>
+
+    <dependency>
+      <groupId>org.scala-lang</groupId>
+      <artifactId>scala-library</artifactId>
+      <version>2.12.10</version>
+    </dependency>
+
+    <dependency>
+      <groupId>com.github.scopt</groupId>
+      <artifactId>scopt_2.12</artifactId>
+      <version>3.7.0</version>      
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-sql_2.12</artifactId>
+      <version>${spark.version}</version>
+      <scope>provided</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.spark</groupId>
+      <artifactId>spark-mllib_2.12</artifactId>
+      <version>${spark.version}</version>
+      <scope>provided</scope>
+    </dependency>
+
+  </dependencies>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.scala-tools</groupId>
+        <artifactId>maven-scala-plugin</artifactId>
+        <version>2.15.2</version>
+        <executions>
+          <execution>
+            <goals>
+              <goal>compile</goal>
+              <goal>testCompile</goal>
+            </goals>
+          </execution>
+        </executions>
+        <configuration>
+          <scalaVersion>${scala.version}</scalaVersion>
+          <args>
+            <arg>-target:jvm-1.8</arg>
+          </args>
+        </configuration>
+      </plugin>
+      <plugin>
+        <artifactId>maven-assembly-plugin</artifactId>
+        <version>3.0.0</version>
+        <configuration>
+          <appendAssemblyId>false</appendAssemblyId>
+          <descriptorRefs>
+            <descriptorRef>jar-with-dependencies</descriptorRef>
+          </descriptorRefs>
+        </configuration>
+        <executions>
+          <execution>
+            <id>assembly</id>
+            <phase>package</phase>
+            <goals>
+              <goal>single</goal>
+            </goals>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+  </build>
+
+</project>
diff --git a/examples/linear-regression/run.sh b/examples/linear-regression/run.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+
+source ../../conf/env.sh
+
+APP_JAR=target/oap-mllib-examples-$OAP_MLLIB_VERSION-with-spark-3.0.0.jar
+APP_CLASS=org.apache.spark.examples.ml.LinearRegressionExample
+DATA_FILE=data/sample_linear_regression_data.txt
+
+OAP_MLLIB_ENABLED=true
+
+time $SPARK_HOME/bin/spark-submit --master $SPARK_MASTER -v \
+    --num-executors $SPARK_NUM_EXECUTORS \
+    --driver-memory $SPARK_DRIVER_MEMORY \
+    --executor-cores $SPARK_EXECUTOR_CORES \
+    --executor-memory $SPARK_EXECUTOR_MEMORY \
+    --conf "spark.oap.mllib.enabled=$OAP_MLLIB_ENABLED" \
+    --conf "spark.serializer=org.apache.spark.serializer.KryoSerializer" \
+    --conf "spark.default.parallelism=$SPARK_DEFAULT_PARALLELISM" \
+    --conf "spark.sql.shuffle.partitions=$SPARK_DEFAULT_PARALLELISM" \
+    --conf "spark.driver.extraClassPath=$SPARK_DRIVER_CLASSPATH" \
+    --conf "spark.executor.extraClassPath=$SPARK_EXECUTOR_CLASSPATH" \
+    --conf "spark.shuffle.reduceLocality.enabled=false" \
+    --conf "spark.network.timeout=1200s" \
+    --conf "spark.task.maxFailures=1" \
+    --jars $OAP_MLLIB_JAR \
+    --class $APP_CLASS \
+    $APP_JAR $DATA_FILE \
+    2>&1 | tee LinearRegression-$(date +%m%d_%H_%M_%S).log
diff --git a/...near-regression/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala b/...near-regression/src/main/scala/org/apache/spark/examples/ml/LinearRegressionExample.scala
@@ -0,0 +1,141 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// scalastyle:off println
+package org.apache.spark.examples.ml
+
+import scopt.OptionParser
+
+import org.apache.spark.ml.regression.LinearRegression
+import org.apache.spark.sql.{DataFrame, SparkSession}
+
+/**
+ * An example runner for linear regression with elastic-net (mixing L1/L2) regularization.
+ * Run with
+ * {{{
+ * bin/run-example ml.LinearRegressionExample [options]
+ * }}}
+ * A synthetic dataset can be found at `data/mllib/sample_linear_regression_data.txt` which can be
+ * trained by
+ * {{{
+ * bin/run-example ml.LinearRegressionExample --regParam 0.15 --elasticNetParam 1.0 \
+ *   data/mllib/sample_linear_regression_data.txt
+ * }}}
+ * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
+ */
+object LinearRegressionExample {
+
+  case class Params(
+      input: String = null,
+      testInput: String = "",
+      dataFormat: String = "libsvm",
+      regParam: Double = 0.0,
+      elasticNetParam: Double = 0.0,
+      maxIter: Int = 100,
+      tol: Double = 1E-6,
+      fracTest: Double = 0.2)
+
+  def main(args: Array[String]): Unit = {
+    val defaultParams = Params()
+
+    val parser = new OptionParser[Params]("LinearRegressionExample") {
+      head("LinearRegressionExample: an example Linear Regression with Elastic-Net app.")
+      opt[Double]("regParam")
+        .text(s"regularization parameter, default: ${defaultParams.regParam}")
+        .action((x, c) => c.copy(regParam = x))
+      opt[Double]("elasticNetParam")
+        .text(s"ElasticNet mixing parameter. For alpha = 0, the penalty is an L2 penalty. " +
+        s"For alpha = 1, it is an L1 penalty. For 0 < alpha < 1, the penalty is a combination of " +
+        s"L1 and L2, default: ${defaultParams.elasticNetParam}")
+        .action((x, c) => c.copy(elasticNetParam = x))
+      opt[Int]("maxIter")
+        .text(s"maximum number of iterations, default: ${defaultParams.maxIter}")
+        .action((x, c) => c.copy(maxIter = x))
+      opt[Double]("tol")
+        .text(s"the convergence tolerance of iterations, Smaller value will lead " +
+        s"to higher accuracy with the cost of more iterations, default: ${defaultParams.tol}")
+        .action((x, c) => c.copy(tol = x))
+      opt[Double]("fracTest")
+        .text(s"fraction of data to hold out for testing. If given option testInput, " +
+        s"this option is ignored. default: ${defaultParams.fracTest}")
+        .action((x, c) => c.copy(fracTest = x))
+      opt[String]("testInput")
+        .text(s"input path to test dataset. If given, option fracTest is ignored." +
+        s" default: ${defaultParams.testInput}")
+        .action((x, c) => c.copy(testInput = x))
+      opt[String]("dataFormat")
+        .text("data format: libsvm (default), dense (deprecated in Spark v1.1)")
+        .action((x, c) => c.copy(dataFormat = x))
+      arg[String]("<input>")
+        .text("input path to labeled examples")
+        .required()
+        .action((x, c) => c.copy(input = x))
+      checkConfig { params =>
+        if (params.fracTest < 0 || params.fracTest >= 1) {
+          failure(s"fracTest ${params.fracTest} value incorrect; should be in [0,1).")
+        } else {
+          success
+        }
+      }
+    }
+
+    parser.parse(args, defaultParams) match {
+      case Some(params) => run(params)
+      case _ => sys.exit(1)
+    }
+  }
+
+  def run(params: Params): Unit = {
+    val spark = SparkSession
+      .builder
+      .appName(s"LinearRegressionExample with $params")
+      .getOrCreate()
+
+    println(s"LinearRegressionExample with parameters:\n$params")
+
+    val training = spark.read.format("libsvm")
+      .load(params.input)
+
+    val lir = new LinearRegression()
+      .setFeaturesCol("features")
+      .setLabelCol("label")
+      .setRegParam(params.regParam)
+      .setElasticNetParam(params.elasticNetParam)
+      .setMaxIter(params.maxIter)
+      .setTol(params.tol)
+
+    // Train the model
+    val startTime = System.nanoTime()
+    val lirModel = lir.fit(training)
+    val elapsedTime = (System.nanoTime() - startTime) / 1e9
+    println(s"Training time: $elapsedTime seconds")    
+
+    // Print the coefficients and intercept for linear regression
+    println(s"Coefficients: ${lirModel.coefficients} Intercept: ${lirModel.intercept}")
+
+    // Summarize the model over the training set and print out some metrics
+    val trainingSummary = lirModel.summary
+    println(s"numIterations: ${trainingSummary.totalIterations}")
+    println(s"objectiveHistory: [${trainingSummary.objectiveHistory.mkString(",")}]")
+    trainingSummary.residuals.show()
+    println(s"RMSE: ${trainingSummary.rootMeanSquaredError}")
+    println(s"r2: ${trainingSummary.r2}")    
+
+    spark.stop()
+  }
+}
+// scalastyle:on println
diff --git a/mllib-dal/src/main/java/org/apache/spark/ml/regression/LiRResult.java b/mllib-dal/src/main/java/org/apache/spark/ml/regression/LiRResult.java
@@ -0,0 +1,21 @@
+/*******************************************************************************
+ * Copyright 2020 Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *******************************************************************************/
+
+package org.apache.spark.ml.regression;
+
+public class LiRResult {
+    public long coeffNumericTable;    // first element of coeff is actually intercept
+}