Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/master' into pyspark-inputformats
Browse files Browse the repository at this point in the history
Conflicts:
	core/src/main/scala/org/apache/spark/api/python/PythonRDD.scala
	project/SparkBuild.scala
	python/pyspark/context.py
	python/pyspark/serializers.py
  • Loading branch information
MLnick committed Apr 10, 2014
2 parents e67212a + 9689b66 commit dd57922
Show file tree
Hide file tree
Showing 11,947 changed files with 271,831 additions and 5,121 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
sbt/*.jar
.settings
.cache
.mima-excludes
/build/
work/
out/
Expand Down Expand Up @@ -45,3 +46,5 @@ dist/
spark-*-bin.tar.gz
unit-tests.log
/lib/
rat-results.txt
scalastyle.txt
42 changes: 42 additions & 0 deletions .rat-excludes
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
target
.gitignore
.project
.classpath
.mima-excludes
.rat-excludes
.*md
derby.log
TAGS
RELEASE
control
docs
fairscheduler.xml.template
log4j.properties
log4j.properties.template
metrics.properties.template
slaves
spark-env.sh
spark-env.sh.template
log4j-defaults.properties
sorttable.js
.*txt
.*data
.*log
cloudpickle.py
join.py
SparkExprTyper.scala
SparkILoop.scala
SparkILoopInit.scala
SparkIMain.scala
SparkImports.scala
SparkJLineCompletion.scala
SparkJLineReader.scala
SparkMemberHandlers.scala
sbt
sbt-launch-lib.bash
plugins.sbt
work
.*\.q
golden
test.out/*
.*iml
32 changes: 32 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

language: scala
scala:
- "2.10.3"
jdk:
- oraclejdk7
env:
matrix:
- TEST="scalastyle assembly/assembly"
- TEST="catalyst/test sql/test streaming/test mllib/test graphx/test bagel/test"
- TEST=hive/test
cache:
directories:
- $HOME/.m2
- $HOME/.ivy2
- $HOME/.sbt
script:
- "sbt ++$TRAVIS_SCALA_VERSION $TEST"
9 changes: 9 additions & 0 deletions NOTICE
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,12 @@ Copyright 2014 The Apache Software Foundation.

This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).

In addition, this product includes:

- JUnit (http://www.junit.org) is a testing framework for Java. We included it
under the terms of the Eclipse Public License v1.0.

- JTransforms (https://sites.google.com/site/piotrwendykier/software/jtransforms)
provides fast transforms in Java. It is tri-licensed, and we included it under
the terms of the Mozilla Public License v1.1.
17 changes: 16 additions & 1 deletion assembly/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,11 @@
<artifactId>spark-graphx_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
<dependency>
<groupId>net.sf.py4j</groupId>
<artifactId>py4j</artifactId>
Expand Down Expand Up @@ -158,6 +163,16 @@
</dependency>
</dependencies>
</profile>
<profile>
<id>hive</id>
<dependencies>
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_${scala.binary.version}</artifactId>
<version>${project.version}</version>
</dependency>
</dependencies>
</profile>
<profile>
<id>spark-ganglia-lgpl</id>
<dependencies>
Expand Down Expand Up @@ -203,7 +218,7 @@
<plugin>
<groupId>org.codehaus.mojo</groupId>
<artifactId>buildnumber-maven-plugin</artifactId>
<version>1.1</version>
<version>1.2</version>
<executions>
<execution>
<phase>validate</phase>
Expand Down
20 changes: 12 additions & 8 deletions bagel/src/main/scala/org/apache/spark/bagel/Bagel.scala
Original file line number Diff line number Diff line change
Expand Up @@ -220,27 +220,31 @@ object Bagel extends Logging {
*/
private def comp[K: Manifest, V <: Vertex, M <: Message[K], C](
sc: SparkContext,
grouped: RDD[(K, (Seq[C], Seq[V]))],
grouped: RDD[(K, (Iterable[C], Iterable[V]))],
compute: (V, Option[C]) => (V, Array[M]),
storageLevel: StorageLevel
): (RDD[(K, (V, Array[M]))], Int, Int) = {
var numMsgs = sc.accumulator(0)
var numActiveVerts = sc.accumulator(0)
val processed = grouped.flatMapValues {
case (_, vs) if vs.size == 0 => None
case (c, vs) =>
val processed = grouped.mapValues(x => (x._1.iterator, x._2.iterator))
.flatMapValues {
case (_, vs) if !vs.hasNext => None
case (c, vs) => {
val (newVert, newMsgs) =
compute(vs(0), c match {
case Seq(comb) => Some(comb)
case Seq() => None
})
compute(vs.next,
c.hasNext match {
case true => Some(c.next)
case false => None
}
)

numMsgs += newMsgs.size
if (newVert.active) {
numActiveVerts += 1
}

Some((newVert, newMsgs))
}
}.persist(storageLevel)

// Force evaluation of processed RDD for accurate performance measurements
Expand Down
39 changes: 31 additions & 8 deletions bin/compute-classpath.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,35 +25,55 @@ SCALA_VERSION=2.10
# Figure out where Spark is installed
FWDIR="$(cd `dirname $0`/..; pwd)"

# Load environment variables from conf/spark-env.sh, if it exists
if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
. $FWDIR/conf/spark-env.sh
fi
. $FWDIR/bin/load-spark-env.sh

# Build up classpath
CLASSPATH="$SPARK_CLASSPATH:$FWDIR/conf"

ASSEMBLY_DIR="$FWDIR/assembly/target/scala-$SCALA_VERSION"

# First check if we have a dependencies jar. If so, include binary classes with the deps jar
if [ -f "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*-deps.jar ]; then
if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/tools/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"

DEPS_ASSEMBLY_JAR=`ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*-deps.jar`
DEPS_ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar`
CLASSPATH="$CLASSPATH:$DEPS_ASSEMBLY_JAR"
else
# Else use spark-assembly jar from either RELEASE or assembly directory
if [ -f "$FWDIR/RELEASE" ]; then
ASSEMBLY_JAR=`ls "$FWDIR"/jars/spark-assembly*.jar`
ASSEMBLY_JAR=`ls "$FWDIR"/jars/spark*-assembly*.jar`
else
ASSEMBLY_JAR=`ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/spark-assembly*hadoop*.jar`
ASSEMBLY_JAR=`ls "$ASSEMBLY_DIR"/spark*-assembly*hadoop*.jar`
fi
CLASSPATH="$CLASSPATH:$ASSEMBLY_JAR"
fi

# When Hive support is needed, Datanucleus jars must be included on the classpath.
# Datanucleus jars do not work if only included in the uber jar as plugin.xml metadata is lost.
# Both sbt and maven will populate "lib_managed/jars/" with the datanucleus jars when Spark is
# built with Hive, so first check if the datanucleus jars exist, and then ensure the current Spark
# assembly is built for Hive, before actually populating the CLASSPATH with the jars.
# Note that this check order is faster (by up to half a second) in the case where Hive is not used.
num_datanucleus_jars=$(ls "$FWDIR"/lib_managed/jars/ 2>/dev/null | grep "datanucleus-.*\\.jar" | wc -l)
if [ $num_datanucleus_jars -gt 0 ]; then
AN_ASSEMBLY_JAR=${ASSEMBLY_JAR:-$DEPS_ASSEMBLY_JAR}
num_hive_files=$(jar tvf "$AN_ASSEMBLY_JAR" org/apache/hadoop/hive/ql/exec 2>/dev/null | wc -l)
if [ $num_hive_files -gt 0 ]; then
echo "Spark assembly has been built with Hive, including Datanucleus jars on classpath" 1>&2
DATANUCLEUSJARS=$(echo "$FWDIR/lib_managed/jars"/datanucleus-*.jar | tr " " :)
CLASSPATH=$CLASSPATH:$DATANUCLEUSJARS
fi
fi

# Add test classes if we're running from SBT or Maven with SPARK_TESTING set to 1
if [[ $SPARK_TESTING == 1 ]]; then
CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/test-classes"
Expand All @@ -62,6 +82,9 @@ if [[ $SPARK_TESTING == 1 ]]; then
CLASSPATH="$CLASSPATH:$FWDIR/bagel/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/graphx/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/streaming/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/catalyst/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/test-classes"
CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/test-classes"
fi

# Add hadoop conf dir if given -- otherwise FileSystem.*, etc fail !
Expand Down
38 changes: 38 additions & 0 deletions bin/load-spark-env.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# This script loads spark-env.sh if it exists, and ensures it is only loaded once.
# spark-env.sh is loaded from SPARK_CONF_DIR if set, or within the current directory's
# conf/ subdirectory.

if [ -z "$SPARK_ENV_LOADED" ]; then
export SPARK_ENV_LOADED=1

# Returns the parent of the directory this script lives in.
parent_dir="$(cd `dirname $0`/..; pwd)"

use_conf_dir=${SPARK_CONF_DIR:-"$parent_dir/conf"}

if [ -f "${use_conf_dir}/spark-env.sh" ]; then
# Promote all variable declarations to environment (exported) variables
set -a
. "${use_conf_dir}/spark-env.sh"
set +a
fi
fi
8 changes: 3 additions & 5 deletions bin/pyspark
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,7 @@ if [ ! -f "$FWDIR/RELEASE" ]; then
fi
fi

# Load environment variables from conf/spark-env.sh, if it exists
if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
. $FWDIR/conf/spark-env.sh
fi
. $FWDIR/bin/load-spark-env.sh

# Figure out which Python executable to use
if [ -z "$PYSPARK_PYTHON" ] ; then
Expand All @@ -58,7 +55,8 @@ if [ -n "$IPYTHON_OPTS" ]; then
IPYTHON=1
fi

if [[ "$IPYTHON" = "1" ]] ; then
# Only use ipython if no command line arguments were provided [SPARK-1134]
if [[ "$IPYTHON" = "1" && $# = 0 ]] ; then
exec ipython $IPYTHON_OPTS
else
exec "$PYSPARK_PYTHON" "$@"
Expand Down
5 changes: 1 addition & 4 deletions bin/run-example
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,7 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
# Export this as SPARK_HOME
export SPARK_HOME="$FWDIR"

# Load environment variables from conf/spark-env.sh, if it exists
if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
. $FWDIR/conf/spark-env.sh
fi
. $FWDIR/bin/load-spark-env.sh

if [ -z "$1" ]; then
echo "Usage: run-example <example-class> [<args>]" >&2
Expand Down
10 changes: 2 additions & 8 deletions bin/spark-class
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,7 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
# Export this as SPARK_HOME
export SPARK_HOME="$FWDIR"

# Load environment variables from conf/spark-env.sh, if it exists
if [ -e "$FWDIR/conf/spark-env.sh" ] ; then
. $FWDIR/conf/spark-env.sh
fi
. $FWDIR/bin/load-spark-env.sh

if [ -z "$1" ]; then
echo "Usage: spark-class <class> [<args>]" >&2
Expand Down Expand Up @@ -137,8 +134,7 @@ fi

# Compute classpath using external script
CLASSPATH=`$FWDIR/bin/compute-classpath.sh`

if [ "$1" == "org.apache.spark.tools.JavaAPICompletenessChecker" ]; then
if [[ "$1" =~ org.apache.spark.tools.* ]]; then
CLASSPATH="$CLASSPATH:$SPARK_TOOLS_JAR"
fi

Expand All @@ -158,5 +154,3 @@ if [ "$SPARK_PRINT_LAUNCH_COMMAND" == "1" ]; then
fi

exec "$RUNNER" -cp "$CLASSPATH" $JAVA_OPTS "$@"


Loading

0 comments on commit dd57922

Please sign in to comment.