Skip to content

Commit

Permalink
Eliminate need for SPARK_HIVE at runtime by d/ling datanucleus from M…
Browse files Browse the repository at this point in the history
…aven
  • Loading branch information
aarondav committed Apr 6, 2014
1 parent a9269b5 commit dd1a365
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 17 deletions.
32 changes: 18 additions & 14 deletions bin/compute-classpath.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,20 +30,7 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
# Build up classpath
CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"

# Support for interacting with Hive. Since hive pulls in a lot of dependencies that might break
# existing Spark applications, it is not included in the standard spark assembly. Instead, we only
# include it in the classpath if the user has explicitly requested it by running "sbt hive/assembly"
# Hopefully we will find a way to avoid uber-jars entirely and deploy only the needed packages in
# the future.
if [ "$SPARK_HIVE" = "true" ]; then
echo 1>&2 "SPARK_HIVE is set, including Hive support."

# Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
DATANUCLEUSJARS=$(JARS=("$FWDIR/lib_managed/jars"/datanucleus-*.jar); IFS=:; echo "${JARS[*]}")
CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
fi

ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION/"
ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION"

# First check if we have a dependencies jar. If so, include binary classes with the deps jar
if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
Expand All @@ -70,6 +57,23 @@ else
CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
fi

# When Hive support is needed, Datanucleus jars must be included on the classpath.
# Datanucleus jars do not work if only included in the uberjar as plugin.xml metadata is lost.
# Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
# built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
# assembly is built for Hive, before actually populating the CLASSPATH with the jars.
# Note that this check order is faster (by up to half a second) in the case where Hive is not used.
num_datanucleus_jars=$(ls "$FWDIR"/lib_managed/jars/ | grep "datanucleus-.*\\.jar" | wc -l)
if [ $num_datanucleus_jars -gt 0 ]; then
AN_ASSEMBLY_JAR=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
num_hive_files=$(jar tvf "$AN_ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null | wc -l)
if [ $num_hive_files -gt 0 ]; then
echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2
DATANUCLEUSJARS=$(echo "$FWDIR/lib_managed/jars"/datanucleus-*.jar | tr " " :)
CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
fi
fi

# Add test classes if we're running from SBT or Maven with SPARK_TESTING set to 1
if [[ $SPARK_TESTING == 1 ]]; then
CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/test-classes"
Expand Down
6 changes: 6 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -575,6 +575,12 @@
</exclusion>
</exclusions>
</dependency>
<dependency>
<!-- Matches the version of jackson-core-asl pulled in by avro -->
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-mapper-asl</artifactId>
<version>1.8.8</version>
</dependency>
</dependencies>
</dependencyManagement>

Expand Down
2 changes: 1 addition & 1 deletion project/SparkBuild.scala
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,7 @@ object SparkBuild extends Build {

// Since we don't include hive in the main assembly this project also acts as an alternative
// assembly jar.
def hiveSettings = sharedSettings ++ assemblyProjSettings ++ Seq(
def hiveSettings = sharedSettings ++ Seq(
name := "spark-hive",
javaOptions += "-XX:MaxPermSize=1g",
libraryDependencies ++= Seq(
Expand Down
26 changes: 24 additions & 2 deletions sql/hive/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,8 @@
<version>${hive.version}</version>
</dependency>
<dependency>
<!-- Matches the version of jackson-core-asl pulled in by avro -->
<groupId>org.codehaus.jackson</groupId>
<artifactId>jackson-mapper-asl</artifactId>
<version>1.8.8</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
Expand All @@ -93,6 +91,30 @@
<groupId>org.scalatest</groupId>
<artifactId>scalatest-maven-plugin</artifactId>
</plugin>

<!-- Deploy datanucleus jars to the spark/lib_managed/jars directory -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<version>2.4</version>
<executions>
<execution>
<id>copy-dependencies</id>
<phase>package</phase>
<goals>
<goal>copy-dependencies</goal>
</goals>
<configuration>
<!-- basedir is spark/sql/hive/ -->
<outputDirectory>${basedir}/../../lib_managed/jars</outputDirectory>
<overWriteReleases>false</overWriteReleases>
<overWriteSnapshots>false</overWriteSnapshots>
<overWriteIfNewer>true</overWriteIfNewer>
<includeGroupIds>org.datanucleus</includeGroupIds>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>
</project>

0 comments on commit dd1a365

Please sign in to comment.