Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SPARK-1094 Support MiMa for reporting binary compatibility across versions. #207

Closed
wants to merge 9 commits into from
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
sbt/*.jar
.settings
.cache
.mima-excludes
/build/
work/
out/
Expand Down
1 change: 1 addition & 0 deletions bin/compute-classpath.sh
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ if [ -f "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*-dep
CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/tools/target/scala-$SCALA_VERSION/classes"

DEPS_ASSEMBLY_JAR=`ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*-deps.jar`
CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
Expand Down
3 changes: 1 addition & 2 deletions bin/spark-class
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,7 @@ fi

# Compute classpath using external script
CLASSPATH=`$FWDIR/bin/compute-classpath.sh`

if [ "$1" == "org.apache.spark.tools.JavaAPICompletenessChecker" ]; then
if [[ "$1" =~ org.apache.spark.tools.* ]]; then
CLASSPATH="$CLASSPATH:$SPARK_TOOLS_JAR"
fi

Expand Down
7 changes: 7 additions & 0 deletions dev/run-tests
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,10 @@ if [ -z "$PYSPARK_PYTHON" ]; then
export PYSPARK_PYTHON=/usr/local/bin/python2.7
fi
./python/run-tests

echo "========================================================================="
echo "Detecting binary incompatibilites with MiMa"
echo "========================================================================="
./bin/spark-class org.apache.spark.tools.GenerateMIMAIgnore
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of curiosity, about how long does it take to run this? I'm just wondering if it will lengthen our test time significantly.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line takes about 5 seconds. I think including the subsequent check it's less than 1 minute.

sbt/sbt mima-report-binary-issues

62 changes: 62 additions & 0 deletions project/MimaBuild.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
import com.typesafe.tools.mima.plugin.MimaKeys.{binaryIssueFilters, previousArtifact}
import com.typesafe.tools.mima.plugin.MimaPlugin.mimaDefaultSettings
import sbt._

/**
 * MiMa configuration for the build: computes the binary-compatibility problem filters and
 * exposes the sbt settings that install them.
 */
object MimaBuild {

  /**
   * Builds the problem filters that MiMa should ignore.
   *
   * The result is the concatenation of the default excludes, the filters generated for every
   * class name listed in `<base>/.mima-excludes` (when that file exists), and the excludes
   * registered for the current `SparkBuild.SPARK_VERSION`.
   *
   * @param base directory expected to contain the `.mima-excludes` file
   * @return the combined sequence of problem filters
   */
  def ignoredABIProblems(base: File) = {
    import com.typesafe.tools.mima.core._
    import com.typesafe.tools.mima.core.ProblemFilters._

    // Excludes placed here will be used for all Spark versions
    val defaultExcludes = Seq()

    // Read package-private excludes from file
    val excludeFilePath = (base.getAbsolutePath + "/.mima-excludes")
    val excludeFile = file(excludeFilePath)
    val packagePrivateList: Seq[String] =
      if (!excludeFile.exists()) {
        Seq()
      } else {
        // An empty file still splits into one empty string, and blank lines or "\r\n" line
        // endings leave empty or padded names. Trim every entry and drop the empty ones so
        // that no filter is ever generated for an empty class name.
        IO.read(excludeFile).split("\n").map(_.trim).filter(_.nonEmpty).toSeq
      }

    // Filters covering a class and its "$"-suffixed counterpart.
    def excludeClass(className: String) = {
      Seq(
        excludePackage(className),
        ProblemFilters.exclude[MissingClassProblem](className),
        ProblemFilters.exclude[MissingTypesProblem](className),
        excludePackage(className + "$"),
        ProblemFilters.exclude[MissingClassProblem](className + "$"),
        ProblemFilters.exclude[MissingTypesProblem](className + "$")
      )
    }
    // Same as excludeClass, for a name given relative to "org.apache.spark.".
    def excludeSparkClass(className: String) = excludeClass("org.apache.spark." + className)

    val packagePrivateExcludes = packagePrivateList.flatMap(excludeClass)

    /* Excludes specific to a given version of Spark. When comparing the given version against
       its immediate predecessor, the excludes listed here will be applied. */
    val versionExcludes =
      SparkBuild.SPARK_VERSION match {
        case v if v.startsWith("1.0") =>
          Seq(excludePackage("org.apache.spark.api.java")) ++
          excludeSparkClass("rdd.ClassTags") ++
          excludeSparkClass("util.XORShiftRandom") ++
          excludeSparkClass("mllib.recommendation.MFDataGenerator") ++
          excludeSparkClass("mllib.optimization.SquaredGradient") ++
          excludeSparkClass("mllib.regression.RidgeRegressionWithSGD") ++
          excludeSparkClass("mllib.regression.LassoWithSGD") ++
          excludeSparkClass("mllib.regression.LinearRegressionWithSGD")
        case _ => Seq()
      }

    defaultExcludes ++ packagePrivateExcludes ++ versionExcludes
  }

  /**
   * MiMa settings: the plugin defaults, no previous artifact by default, and the filters
   * computed by `ignoredABIProblems`.
   *
   * @param sparkHome directory passed to `ignoredABIProblems` to locate `.mima-excludes`
   */
  def mimaSettings(sparkHome: File) = mimaDefaultSettings ++ Seq(
    previousArtifact := None,
    binaryIssueFilters ++= ignoredABIProblems(sparkHome)
  )

}
29 changes: 24 additions & 5 deletions project/SparkBuild.scala
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,14 @@ import sbtassembly.Plugin._
import AssemblyKeys._
import scala.util.Properties
import org.scalastyle.sbt.ScalastylePlugin.{Settings => ScalaStyleSettings}
import com.typesafe.tools.mima.plugin.MimaKeys.previousArtifact

// For Sonatype publishing
//import com.jsuereth.pgp.sbtplugin.PgpKeys._

object SparkBuild extends Build {
val SPARK_VERSION = "1.0.0-incubating-SNAPSHOT"

// Hadoop version to build against. For example, "1.0.4" for Apache releases, or
// "2.0.0-mr1-cdh4.2.0" for Cloudera Hadoop. Note that these variables can be set
// through the environment variables SPARK_HADOOP_VERSION and SPARK_YARN.
Expand Down Expand Up @@ -121,9 +124,9 @@ object SparkBuild extends Build {

lazy val allProjects = packageProjects ++ allExternalRefs ++ Seq[ProjectReference](examples, tools, assemblyProj)

def sharedSettings = Defaults.defaultSettings ++ Seq(
def sharedSettings = Defaults.defaultSettings ++ MimaBuild.mimaSettings(file(sparkHome)) ++ Seq(
organization := "org.apache.spark",
version := "1.0.0-incubating-SNAPSHOT",
version := SPARK_VERSION,
scalaVersion := "2.10.3",
scalacOptions := Seq("-Xmax-classfile-name", "120", "-unchecked", "-deprecation",
"-target:" + SCALAC_JVM_VERSION),
Expand Down Expand Up @@ -244,13 +247,19 @@ object SparkBuild extends Build {
val excludeAsm = ExclusionRule(organization = "asm")
val excludeSnappy = ExclusionRule(organization = "org.xerial.snappy")

/**
 * Builds the coordinates of the artifact that a module is compared against for binary
 * compatibility.
 *
 * @param id base artifact name, e.g. "spark-core"
 * @param organization organization of the artifact
 * @param version version of the artifact to compare with
 * @param crossVersion suffix appended to `id` as "_<crossVersion>"; an empty string leaves
 *                     `id` unchanged
 * @return the module ID, always wrapped in `Some`
 */
def sparkPreviousArtifact(id: String, organization: String = "org.apache.spark",
    version: String = "0.9.0-incubating", crossVersion: String = "2.10"): Option[sbt.ModuleID] = {
  val suffix = if (crossVersion.isEmpty) "" else "_" + crossVersion
  val artifactName = id + suffix
  // the artifact to compare binary compatibility with
  val previous = organization % artifactName % version
  Some(previous)
}

def coreSettings = sharedSettings ++ Seq(
name := "spark-core",
resolvers ++= Seq(
"JBoss Repository" at "http://repository.jboss.org/nexus/content/repositories/releases/",
"Cloudera Repository" at "https://repository.cloudera.com/artifactory/cloudera-repos/"
),

previousArtifact := sparkPreviousArtifact("spark-core"),
libraryDependencies ++= Seq(
"com.google.guava" % "guava" % "14.0.1",
"com.google.code.findbugs" % "jsr305" % "1.3.9",
Expand Down Expand Up @@ -289,7 +298,7 @@ object SparkBuild extends Build {
publish := {}
)

def replSettings = sharedSettings ++ Seq(
def replSettings = sharedSettings ++ Seq(
name := "spark-repl",
libraryDependencies <+= scalaVersion(v => "org.scala-lang" % "scala-compiler" % v ),
libraryDependencies <+= scalaVersion(v => "org.scala-lang" % "jline" % v ),
Expand All @@ -298,6 +307,7 @@ object SparkBuild extends Build {

def examplesSettings = sharedSettings ++ Seq(
name := "spark-examples",
previousArtifact := sparkPreviousArtifact("spark-examples"),
libraryDependencies ++= Seq(
"com.twitter" %% "algebird-core" % "0.1.11",
"org.apache.hbase" % "hbase" % "0.94.6" excludeAll(excludeNetty, excludeAsm),
Expand All @@ -321,24 +331,28 @@ object SparkBuild extends Build {

// sbt settings for the spark-graphx module: the shared settings followed by the module name,
// the previous artifact to compare binary compatibility with, and the jblas dependency.
// `previousArtifact` is listed after `sharedSettings` because the shared settings also
// assign that key (to None, through MimaBuild.mimaSettings).
def graphxSettings = sharedSettings ++ Seq(
name := "spark-graphx",
previousArtifact := sparkPreviousArtifact("spark-graphx"),
libraryDependencies ++= Seq(
"org.jblas" % "jblas" % "1.2.3"
)
)

def bagelSettings = sharedSettings ++ Seq(
name := "spark-bagel"
name := "spark-bagel",
previousArtifact := sparkPreviousArtifact("spark-bagel")
)

// sbt settings for the spark-mllib module: the shared settings followed by the module name,
// the previous artifact to compare binary compatibility with, and the jblas dependency.
// `previousArtifact` is listed after `sharedSettings` because the shared settings also
// assign that key (to None, through MimaBuild.mimaSettings).
def mllibSettings = sharedSettings ++ Seq(
name := "spark-mllib",
previousArtifact := sparkPreviousArtifact("spark-mllib"),
libraryDependencies ++= Seq(
"org.jblas" % "jblas" % "1.2.3"
)
)

def streamingSettings = sharedSettings ++ Seq(
name := "spark-streaming",
previousArtifact := sparkPreviousArtifact("spark-streaming"),
libraryDependencies ++= Seq(
"commons-io" % "commons-io" % "2.4"
)
Expand Down Expand Up @@ -403,13 +417,15 @@ object SparkBuild extends Build {

// sbt settings for the spark-streaming-twitter module: the shared settings followed by the
// module name, the previous artifact to compare binary compatibility with, and the
// twitter4j-stream dependency (with the Netty exclusion rule applied).
def twitterSettings() = sharedSettings ++ Seq(
name := "spark-streaming-twitter",
previousArtifact := sparkPreviousArtifact("spark-streaming-twitter"),
libraryDependencies ++= Seq(
"org.twitter4j" % "twitter4j-stream" % "3.0.3" excludeAll(excludeNetty)
)
)

def kafkaSettings() = sharedSettings ++ Seq(
name := "spark-streaming-kafka",
previousArtifact := sparkPreviousArtifact("spark-streaming-kafka"),
libraryDependencies ++= Seq(
"com.github.sgroschupf" % "zkclient" % "0.1" excludeAll(excludeNetty),
"org.apache.kafka" %% "kafka" % "0.8.0"
Expand All @@ -422,20 +438,23 @@ object SparkBuild extends Build {

// sbt settings for the spark-streaming-flume module: the shared settings followed by the
// module name, the previous artifact to compare binary compatibility with, and the
// flume-ng-sdk dependency in the "compile" configuration (with the Netty and Snappy
// exclusion rules applied).
def flumeSettings() = sharedSettings ++ Seq(
name := "spark-streaming-flume",
previousArtifact := sparkPreviousArtifact("spark-streaming-flume"),
libraryDependencies ++= Seq(
"org.apache.flume" % "flume-ng-sdk" % "1.2.0" % "compile" excludeAll(excludeNetty, excludeSnappy)
)
)

// sbt settings for the spark-streaming-zeromq module: the shared settings followed by the
// module name, the previous artifact to compare binary compatibility with, and the
// akka-zeromq dependency (with the Netty exclusion rule applied).
def zeromqSettings() = sharedSettings ++ Seq(
name := "spark-streaming-zeromq",
previousArtifact := sparkPreviousArtifact("spark-streaming-zeromq"),
libraryDependencies ++= Seq(
"org.spark-project.akka" %% "akka-zeromq" % "2.2.3-shaded-protobuf" excludeAll(excludeNetty)
)
)

// sbt settings for the spark-streaming-mqtt module: the module name, the previous artifact
// to compare binary compatibility with, the Eclipse Paho resolver and the mqtt-client
// dependency, appended to `streamingSettings`.
// NOTE(review): the sibling connector settings visible here start from `sharedSettings`;
// this one starts from `streamingSettings`, so `name` and `previousArtifact` set there are
// overridden by the assignments below. Confirm this is intentional.
def mqttSettings() = streamingSettings ++ Seq(
name := "spark-streaming-mqtt",
previousArtifact := sparkPreviousArtifact("spark-streaming-mqtt"),
resolvers ++= Seq("Eclipse Repo" at "https://repo.eclipse.org/content/repositories/paho-releases/"),
libraryDependencies ++= Seq("org.eclipse.paho" % "mqtt-client" % "0.4.0")
)
Expand Down
1 change: 1 addition & 0 deletions project/plugins.sbt
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ addSbtPlugin("net.virtual-void" % "sbt-dependency-graph" % "0.7.4")

addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "0.4.0")

addSbtPlugin("com.typesafe" % "sbt-mima-plugin" % "0.1.6")
131 changes: 131 additions & 0 deletions tools/src/main/scala/org/apache/spark/tools/GenerateMIMAIgnore.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.tools

import java.io.File
import java.util.jar.JarFile

import scala.collection.mutable
import scala.collection.JavaConversions._
import scala.reflect.runtime.universe.runtimeMirror

/**
* A tool for generating classes to be excluded during binary checking with MIMA. It is expected
* that this tool is run with ./spark-class.
*
* MIMA itself only supports JVM-level visibility and doesn't account for package-private classes.
* This tool looks at all currently package-private classes and generates exclusions for them. Note
* that this approach is not sound. It can lead to false positives if we move or rename a previously
* package-private class. It can lead to false negatives if someone explicitly makes a class
* package-private that wasn't before. This exists only to help catch certain classes of changes
* which might be difficult to catch during review.
*/
object GenerateMIMAIgnore {
private val classLoader = Thread.currentThread().getContextClassLoader
private val mirror = runtimeMirror(classLoader)

/**
 * Collects the names of the classes found under `packageName` that are treated as
 * package-private.
 *
 * A class qualifies when its own visibility is restricted to a scope whose full name starts
 * with `packageName`, or when the same holds for its outermost enclosing class (the part of
 * the name before the first '$'). Every selected name is returned twice: as-is and with each
 * '$' replaced by '#'.
 */
private def classesPrivateWithin(packageName: String): Set[String] = {

  val candidates = getClasses(packageName, classLoader)

  // Looks the name up both as a class and as a module, since it is not known up front which
  // of the two it is. Any failure during the lookup is reported on stdout and the name is
  // then treated as not package-private.
  def isPackagePrivate(className: String) = {
    try {
      val classScope = mirror.staticClass(className).privateWithin.fullName
      val moduleScope = mirror.staticModule(className).privateWithin.fullName
      Seq(classScope, moduleScope).exists(_.startsWith(packageName))
    } catch {
      case _: Throwable =>
        println("Error determining visibility: " + className)
        false
    }
  }

  /* Inner classes defined within a private[spark] class or object are effectively
     invisible, so we account for them as package private. */
  def hasPrivateOuter(className: String) = {
    val maybeOuter = className.takeWhile(_ != '$')
    maybeOuter != className && isPackagePrivate(maybeOuter)
  }

  val privateClasses = candidates.filter { className =>
    // Both checks are evaluated for every class (no short-circuit), in this order.
    val directlyPrivateSpark = isPackagePrivate(className)
    val indirectlyPrivateSpark = hasPrivateOuter(className)
    directlyPrivateSpark || indirectlyPrivateSpark
  }

  privateClasses.flatMap(name => Seq(name, name.replace("$", "#")))
}

/**
 * Entry point: writes the names produced by `classesPrivateWithin("org.apache.spark")`,
 * one per line, to `.mima-excludes` in the current directory and prints a confirmation.
 *
 * @param args command-line arguments (not used)
 */
def main(args: Array[String]): Unit = {
  val outputFile = scala.tools.nsc.io.File(".mima-excludes")
  val excludedNames = classesPrivateWithin("org.apache.spark")
  outputFile.writeAll(excludedNames.mkString("\n"))
  println("Created : .mima-excludes in current directory.")
}


// Returns true for class names that should be dropped from the scan: names containing
// "anon", names ending in "$class", and names containing "$sp".
// NOTE(review): these presumably correspond to anonymous classes, trait implementation
// classes and specialized variants emitted by the Scala compiler - confirm. The "anon"
// check is a plain substring match, so any class whose name merely contains "anon" is
// dropped as well.
private def shouldExclude(name: String) = {
// Heuristic to remove JVM classes that do not correspond to user-facing classes in Scala
name.contains("anon") ||
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I keep trying to come up with a valid class name that contains "anon". In the dictionary, there seems to be Canon, Lebanon, and, of course, anonymous. We're probably safe?

name.endsWith("$class") ||
name.contains("$sp")
}

/**
 * Scans the jars visible to the given class loader for classes that belong to the given
 * package or its subpackages, and returns their names.
 *
 * Only resources with the `jar` protocol are scanned; classes that are available solely from
 * directories on the classpath are not picked up. Names rejected by `shouldExclude` are
 * dropped from the result.
 *
 * @param packageName dotted package name, e.g. "org.apache.spark"
 * @param classLoader class loader used to locate the package; defaults to the context class
 *                    loader of the current thread
 * @return the fully qualified names of the classes found
 */
private def getClasses(packageName: String,
    classLoader: ClassLoader = Thread.currentThread().getContextClassLoader): Set[String] = {
  val path = packageName.replace('.', '/')
  val resources = classLoader.getResources(path)

  val jars = resources.filter(_.getProtocol == "jar").map { url =>
    // For a jar URL, getFile looks like "file:/dir/my.jar!/org/apache/spark". Keep the part
    // before "!/" and resolve it as a URI rather than splitting on ':', so that paths
    // containing a colon (e.g. a Windows drive letter) or URL-escaped characters (e.g. "%20")
    // map to the right file.
    val spec = url.getFile
    val separator = spec.indexOf("!/")
    val jarLocation = if (separator >= 0) spec.substring(0, separator) else spec
    new File(new java.net.URI(jarLocation)).getPath
  }.toSeq
  val classesFromJars = jars.flatMap(getClassesFromJar(_, path))

  classesFromJars.map(_.getName).filterNot(shouldExclude).toSet
}

/**
 * Get all classes in a package from a jar file.
 *
 * Every jar entry whose name starts with `packageName` and ends with ".class" is loaded by
 * name through the context class loader of the current thread. Classes are loaded without
 * being initialized, so scanning does not run their static initializers. The jar is closed
 * before returning, which is why the result is fully materialized.
 *
 * @param jarPath file system path of the jar to scan
 * @param packageName package prefix in entry form, i.e. '/'-separated ("org/apache/spark")
 * @return the classes found, in jar entry order
 */
private def getClassesFromJar(jarPath: String, packageName: String) = {
  val jar = new JarFile(new File(jarPath))
  try {
    val loader = Thread.currentThread().getContextClassLoader
    jar.entries()
      .map(_.getName)
      .filter(entry => entry.startsWith(packageName) && entry.endsWith(".class"))
      .map(entry => Class.forName(entry.replace('/', '.').stripSuffix(".class"), false, loader))
      .toList // force evaluation while the jar is still open
  } finally {
    jar.close()
  }
}
}