[WIP] Use SPARK_HIVE to determine if we include Hive in packaging
Previously, we decided whether to include the datanucleus jars based on
the existence of a spark-hive-assembly jar, which was incidentally built
whenever "sbt assembly" was run. This meant that a typical, previously
supported build pathway would start using Hive jars.

This patch has the following features/bug fixes:

- Use of SPARK_HIVE (default false) to determine if we should include Hive
  in the assembly jar.
- Analogous feature in Maven with -Phive (see the usage sketch below).
- assemble-deps fixed since we no longer use a different ASSEMBLY_DIR

Still TODO before mergeable:
- We need to download the datanucleus jars outside of sbt. Perhaps we can have
  spark-class download them if SPARK_HIVE is set, similar to how the sbt
  launcher downloads itself.
- Spark SQL documentation updates.
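
A minimal usage sketch of the opt-in paths described above (the launcher
invocations and Maven goals are illustrative assumptions, not part of this
patch):

    # sbt: opt in via the environment variable (defaults to false)
    SPARK_HIVE=true sbt/sbt assembly

    # Maven: the analogous opt-in is the new "hive" profile
    mvn -Phive -DskipTests clean package

    # At runtime, compute-classpath.sh keys off the same variable and adds
    # the datanucleus jars from lib_managed/jars to the classpath
    SPARK_HIVE=true ./bin/spark-shell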
aarondav committed Apr 6, 2014
1 parent 6e88583 commit a9269b5
Showing 7 changed files with 38 additions and 19 deletions.
10 changes: 10 additions & 0 deletions assembly/pom.xml
@@ -163,6 +163,16 @@
         </dependency>
       </dependencies>
     </profile>
+    <profile>
+      <id>hive</id>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.spark</groupId>
+          <artifactId>spark-hive_${scala.binary.version}</artifactId>
+          <version>${project.version}</version>
+        </dependency>
+      </dependencies>
+    </profile>
     <profile>
       <id>spark-ganglia-lgpl</id>
       <dependencies>
11 changes: 5 additions & 6 deletions bin/compute-classpath.sh
@@ -35,17 +35,16 @@ CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"
 # include it in the classpath if the user has explicitly requested it by running "sbt hive/assembly"
 # Hopefully we will find a way to avoid uber-jars entirely and deploy only the needed packages in
 # the future.
-if [ -f "$FWDIR"/sql/hive/target/scala-$SCALA_VERSION/spark-hive-assembly-*.jar ]; then
+if [ "$SPARK_HIVE" = "true" ]; then
+  echo 1>&2 "SPARK_HIVE is set, including Hive support."
 
   # Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
   DATANUCLEUSJARS=$(JARS=("$FWDIR/lib_managed/jars"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}")
   CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
-
-  ASSEMBLY_DIR="$FWDIR/sql/hive/target/scala-$SCALA_VERSION/"
-else
-  ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
 fi
 
+ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
+
 # First check if we have a dependencies jar. If so, include binary classes with the deps jar
 if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/classes"
@@ -59,7 +58,7 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
 
-  DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*-deps.jar`
+  DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar`
   CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
 else
   # Else use spark-assembly jar from either RELEASE or assembly directory
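
The DATANUCLEUSJARS assignment above uses a compact bash join idiom:
expanding "${JARS[*]}" while IFS is ":" joins the array elements with
colons, and doing so inside $( ... ) keeps the IFS change from leaking
into the rest of the script. A standalone sketch, with a hypothetical jar
directory:

    #!/usr/bin/env bash
    # Glob the datanucleus jars into an array, then join them with ':'.
    LIB_DIR="/tmp/lib_managed/jars"   # hypothetical location
    JOINED=$(JARS=("$LIB_DIR"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}")
    # e.g. /tmp/lib_managed/jars/datanucleus-core-*.jar:/tmp/lib_managed/jars/datanucleus-rdbms-*.jar
    echo "$JOINED"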
2 changes: 0 additions & 2 deletions bin/spark-class
@@ -154,5 +154,3 @@ if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
 fi
 
 exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"
-
-
4 changes: 2 additions & 2 deletions dev/create-release/create-release.sh
@@ -49,14 +49,14 @@ mvn -DskipTests \
   -Darguments="-DskipTests=true -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 -Dgpg.passphrase=${GPG_PASSPHRASE}" \
   -Dusername=$GIT_USERNAME -Dpassword=$GIT_PASSWORD \
   -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \
-  -Pyarn -Pspark-ganglia-lgpl \
+  -Pyarn -Phive -Pspark-ganglia-lgpl\
   -Dtag=$GIT_TAG -DautoVersionSubmodules=true \
   --batch-mode release:prepare
 
 mvn -DskipTests \
   -Darguments="-DskipTests=true -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 -Dgpg.passphrase=${GPG_PASSPHRASE}" \
   -Dhadoop.version=2.2.0 -Dyarn.version=2.2.0 \
-  -Pyarn -Pspark-ganglia-lgpl\
+  -Pyarn -Phive -Pspark-ganglia-lgpl\
   release:perform
 
 rm -rf spark
1 change: 0 additions & 1 deletion pom.xml
@@ -373,7 +373,6 @@
         <groupId>org.apache.derby</groupId>
         <artifactId>derby</artifactId>
         <version>10.4.2.0</version>
-        <scope>test</scope>
       </dependency>
       <dependency>
         <groupId>net.liftweb</groupId>
23 changes: 15 additions & 8 deletions project/SparkBuild.scala
@@ -43,6 +43,8 @@ object SparkBuild extends Build {
 
   val DEFAULT_YARN = false
 
+  val DEFAULT_HIVE = false
+
   // HBase version; set as appropriate.
   val HBASE_VERSION = "0.94.6"
 
@@ -67,15 +69,17 @@
 
   lazy val sql = Project("sql", file("sql/core"), settings = sqlCoreSettings) dependsOn(core, catalyst)
 
-  // Since hive is its own assembly, it depends on all of the modules.
-  lazy val hive = Project("hive", file("sql/hive"), settings = hiveSettings) dependsOn(sql, graphx, bagel, mllib, streaming, repl)
+  lazy val hive = Project("hive", file("sql/hive"), settings = hiveSettings) dependsOn(sql)
+
+  lazy val maybeHive: Seq[ClasspathDependency] = if (isHiveEnabled) Seq(hive) else Seq()
+  lazy val maybeHiveRef: Seq[ProjectReference] = if (isHiveEnabled) Seq(hive) else Seq()
 
   lazy val streaming = Project("streaming", file("streaming"), settings = streamingSettings) dependsOn(core)
 
   lazy val mllib = Project("mllib", file("mllib"), settings = mllibSettings) dependsOn(core)
 
   lazy val assemblyProj = Project("assembly", file("assembly"), settings = assemblyProjSettings)
-    .dependsOn(core, graphx, bagel, mllib, streaming, repl, sql) dependsOn(maybeYarn: _*) dependsOn(maybeGanglia: _*)
+    .dependsOn(core, graphx, bagel, mllib, streaming, repl, sql) dependsOn(maybeYarn: _*) dependsOn(maybeHive: _*) dependsOn(maybeGanglia: _*)
 
   lazy val assembleDeps = TaskKey[Unit]("assemble-deps", "Build assembly of dependencies and packages Spark projects")
 
@@ -101,6 +105,11 @@
   lazy val hadoopClient = if (hadoopVersion.startsWith("0.20.") || hadoopVersion == "1.0.0") "hadoop-core" else "hadoop-client"
   val maybeAvro = if (hadoopVersion.startsWith("0.23.") && isYarnEnabled) Seq("org.apache.avro" % "avro" % "1.7.4") else Seq()
 
+  lazy val isHiveEnabled = Properties.envOrNone("SPARK_HIVE") match {
+    case None => DEFAULT_HIVE
+    case Some(v) => v.toBoolean
+  }
+
   // Include Ganglia integration if the user has enabled Ganglia
   // This is isolated from the normal build due to LGPL-licensed code in the library
   lazy val isGangliaEnabled = Properties.envOrNone("SPARK_GANGLIA_LGPL").isDefined
@@ -141,13 +150,13 @@
   lazy val allExternalRefs = Seq[ProjectReference](externalTwitter, externalKafka, externalFlume, externalZeromq, externalMqtt)
 
   lazy val examples = Project("examples", file("examples"), settings = examplesSettings)
-    .dependsOn(core, mllib, graphx, bagel, streaming, externalTwitter, hive) dependsOn(allExternal: _*)
+    .dependsOn(core, mllib, graphx, bagel, streaming, hive) dependsOn(allExternal: _*)
 
   // Everything except assembly, hive, tools, java8Tests and examples belong to packageProjects
-  lazy val packageProjects = Seq[ProjectReference](core, repl, bagel, streaming, mllib, graphx, catalyst, sql) ++ maybeYarnRef ++ maybeGangliaRef
+  lazy val packageProjects = Seq[ProjectReference](core, repl, bagel, streaming, mllib, graphx, catalyst, sql) ++ maybeYarnRef ++ maybeHiveRef ++ maybeGangliaRef
 
   lazy val allProjects = packageProjects ++ allExternalRefs ++
-    Seq[ProjectReference](examples, tools, assemblyProj, hive) ++ maybeJava8Tests
+    Seq[ProjectReference](examples, tools, assemblyProj) ++ maybeJava8Tests
 
   def sharedSettings = Defaults.defaultSettings ++ MimaBuild.mimaSettings(file(sparkHome)) ++ Seq(
     organization := "org.apache.spark",
@@ -412,8 +421,6 @@
   // assembly jar.
   def hiveSettings = sharedSettings ++ assemblyProjSettings ++ Seq(
     name := "spark-hive",
-    jarName in assembly <<= version map { v => "spark-hive-assembly-" + v + "-hadoop" + hadoopVersion + ".jar" },
-    jarName in packageDependency <<= version map { v => "spark-hive-assembly-" + v + "-hadoop" + hadoopVersion + "-deps.jar" },
     javaOptions += "-XX:MaxPermSize=1g",
     libraryDependencies ++= Seq(
       "org.apache.hive" % "hive-metastore" % "0.12.0",
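
For reference, the new isHiveEnabled flag is a plain environment-variable
lookup with a strict boolean parse. A standalone sketch of the same pattern
(the object name is hypothetical; SparkBuild itself already uses
scala.util.Properties):

    import scala.util.Properties

    object HiveFlagSketch {
      val DEFAULT_HIVE = false

      // "true"/"false" (case-insensitive) parse via toBoolean; any other
      // value throws IllegalArgumentException, and an unset variable falls
      // back to the default, so Hive stays out of the assembly unless
      // explicitly requested.
      lazy val isHiveEnabled: Boolean = Properties.envOrNone("SPARK_HIVE") match {
        case None => DEFAULT_HIVE
        case Some(v) => v.toBoolean
      }
    }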
6 changes: 6 additions & 0 deletions sql/hive/pom.xml
@@ -63,6 +63,12 @@
       <artifactId>hive-exec</artifactId>
       <version>${hive.version}</version>
     </dependency>
+    <dependency>
+      <!-- Matches the version of jackson-core-asl pulled in by avro -->
+      <groupId>org.codehaus.jackson</groupId>
+      <artifactId>jackson-mapper-asl</artifactId>
+      <version>1.8.8</version>
+    </dependency>
     <dependency>
       <groupId>org.apache.hive</groupId>
       <artifactId>hive-serde</artifactId>
