-
Notifications
You must be signed in to change notification settings - Fork 51
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
SPARK-25299: Add rest of shuffle writer benchmarks #507
Changes from 79 commits
c7abec6
22ef648
4084e27
fb8266d
89104e2
b90b381
5e13dd8
845e645
a68f459
757f6fe
0bcd5d9
26c01ec
0d7a036
3fc5331
8c33701
9546397
d72ba73
1859805
c20f0be
444d46a
2322933
da0d91c
e590917
cbfdb99
cbe38c6
acdda71
fd7a7c5
d82618b
610ea1d
295d7f3
0c696dc
323a296
85836c2
b67d1f3
252963d
f72afb2
3bcd35e
d8b5d79
d9fb78a
b142951
d0466b8
f91dfad
5839b1d
0b8c7ed
d11f87f
6f2779f
bbe9edc
567d372
47c1938
e79ac28
c3858df
68d6f62
7c8d52e
9d46fae
9f51758
f169acd
bcb09c5
d4a1b52
25da723
8559264
13703fa
e3751cd
33a1b72
fa1b96c
b38abb0
96c66c9
37cef1f
459e1b5
4cabdbd
9225bb7
a3b0ee5
47d2dcf
c78e491
f28b75c
a85acf4
f26ab40
e5481b4
6151cab
da1c2d0
350eb6e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,93 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package org.apache.spark.shuffle.sort | ||
|
||
import org.apache.spark.SparkConf | ||
import org.apache.spark.benchmark.Benchmark | ||
import org.apache.spark.util.Utils | ||
|
||
/**
 * Benchmark to measure the write performance of [[BypassMergeSortShuffleWriter]].
 * {{{
 * To run this benchmark:
 * 1. without sbt: bin/spark-submit --class <this class> <spark sql test jar>
 * 2. build/sbt "sql/test:runMain <this class>"
 * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
 * Results will be written to "benchmarks/<this class>-results.txt".
 * }}}
 */
object BypassMergeSortShuffleWriterBenchmark extends ShuffleWriterBenchmarkBase {

  // Handle shared by every writer built in this benchmark; a single map task
  // (numMaps = 1) over shuffle id 0 with the base class's dependency.
  private val shuffleHandle: BypassMergeSortShuffleHandle[String, String] =
    new BypassMergeSortShuffleHandle[String, String](
      shuffleId = 0,
      numMaps = 1,
      dependency)

  private val MIN_NUM_ITERS = 10
  private val DATA_SIZE_SMALL = 1000
  // Sized so the written data exceeds a quarter of the maximum page size,
  // measured in records of DEFAULT_DATA_STRING_SIZE bytes each.
  private val DATA_SIZE_LARGE =
    PackedRecordPointer.MAXIMUM_PAGE_SIZE_BYTES / 4 / DEFAULT_DATA_STRING_SIZE

  /**
   * Builds a fresh writer for one benchmark case.
   *
   * @param transferTo whether the writer should use NIO transferTo when merging files
   */
  def getWriter(transferTo: Boolean): BypassMergeSortShuffleWriter[String, String] = {
    val writerConf = new SparkConf(loadDefaults = false)
    writerConf.set("spark.file.transferTo", String.valueOf(transferTo))
    writerConf.set("spark.shuffle.file.buffer", "32k")

    new BypassMergeSortShuffleWriter[String, String](
      blockManager,
      blockResolver,
      shuffleHandle,
      0,
      writerConf,
      taskContext.taskMetrics().shuffleWriteMetrics
    )
  }

  /** Runs the large-dataset (spilling) cases, with and without transferTo. */
  def writeBenchmarkWithLargeDataset(): Unit = {
    val recordCount = DATA_SIZE_LARGE
    val benchmark = new Benchmark(
      "BypassMergeSortShuffleWrite with spill",
      recordCount,
      minNumIters = MIN_NUM_ITERS,
      output = output)

    addBenchmarkCase(benchmark, "without transferTo", recordCount, () => getWriter(false))
    addBenchmarkCase(benchmark, "with transferTo", recordCount, () => getWriter(true))
    benchmark.run()
  }

  /** Runs the small-dataset case, which stays entirely in memory (no spill). */
  def writeBenchmarkWithSmallDataset(): Unit = {
    val recordCount = DATA_SIZE_SMALL
    val benchmark = new Benchmark(
      "BypassMergeSortShuffleWrite without spill",
      recordCount,
      minNumIters = MIN_NUM_ITERS,
      output = output)
    addBenchmarkCase(benchmark, "small dataset without disk spill", recordCount,
      () => getWriter(false))
    benchmark.run()
  }

  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
    runBenchmark("BypassMergeSortShuffleWriter write") {
      writeBenchmarkWithSmallDataset()
      writeBenchmarkWithLargeDataset()
    }
  }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
/* | ||
* Licensed to the Apache Software Foundation (ASF) under one or more | ||
* contributor license agreements. See the NOTICE file distributed with | ||
* this work for additional information regarding copyright ownership. | ||
* The ASF licenses this file to You under the Apache License, Version 2.0 | ||
* (the "License"); you may not use this file except in compliance with | ||
* the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package org.apache.spark.shuffle.sort | ||
|
||
import org.apache.spark.SparkConf | ||
import org.apache.spark.benchmark.Benchmark | ||
import org.apache.spark.util.Utils | ||
|
||
/**
 * Benchmark to measure the write performance of [[UnsafeShuffleWriter]].
 * {{{
 * To run this benchmark:
 * 1. without sbt: bin/spark-submit --class <this class> <spark sql test jar>
 * 2. build/sbt "sql/test:runMain <this class>"
 * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
 * Results will be written to "benchmarks/<this class>-results.txt".
 * }}}
 */
object UnsafeShuffleWriterBenchmark extends ShuffleWriterBenchmarkBase {

  // Single serialized-shuffle handle reused by every writer in this suite.
  private val shuffleHandle: SerializedShuffleHandle[String, String] =
    new SerializedShuffleHandle[String, String](0, 0, this.dependency)

  private val MIN_NUM_ITERS = 10
  private val DATA_SIZE_SMALL = 1000
  // Sized so the written data exceeds half the maximum page size, measured in
  // records of DEFAULT_DATA_STRING_SIZE bytes each, forcing spills.
  private val DATA_SIZE_LARGE =
    PackedRecordPointer.MAXIMUM_PAGE_SIZE_BYTES / 2 / DEFAULT_DATA_STRING_SIZE

  /**
   * Builds a fresh writer for one benchmark case.
   *
   * @param transferTo whether the writer should use NIO transferTo when merging spills
   */
  def getWriter(transferTo: Boolean): UnsafeShuffleWriter[String, String] = {
    val writerConf = new SparkConf(loadDefaults = false)
    writerConf.set("spark.file.transferTo", String.valueOf(transferTo))

    new UnsafeShuffleWriter[String, String](
      blockManager,
      blockResolver,
      taskMemoryManager,
      shuffleHandle,
      0,
      taskContext,
      writerConf,
      taskContext.taskMetrics().shuffleWriteMetrics
    )
  }

  /** Runs the small-dataset case, which completes without spilling to disk. */
  def writeBenchmarkWithSmallDataset(): Unit = {
    val recordCount = DATA_SIZE_SMALL
    val benchmark = new Benchmark(
      "UnsafeShuffleWriter without spills",
      recordCount,
      minNumIters = MIN_NUM_ITERS,
      output = output)
    addBenchmarkCase(
      benchmark,
      "small dataset without spills",
      recordCount,
      () => getWriter(false),
      Some(1)) // The single temp file is for the temp index file
    benchmark.run()
  }

  /** Runs the large-dataset (spilling) cases, with and without transferTo. */
  def writeBenchmarkWithSpill(): Unit = {
    val recordCount = DATA_SIZE_LARGE
    val benchmark = new Benchmark(
      "UnsafeShuffleWriter with spills",
      recordCount,
      minNumIters = MIN_NUM_ITERS,
      output = output,
      outputPerIteration = true)
    addBenchmarkCase(benchmark, "without transferTo", recordCount, () => getWriter(false), Some(7))
    addBenchmarkCase(benchmark, "with transferTo", recordCount, () => getWriter(true), Some(7))
    benchmark.run()
  }

  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
    runBenchmark("UnsafeShuffleWriter write") {
      writeBenchmarkWithSmallDataset()
      writeBenchmarkWithSpill()
    }
  }
}
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -50,12 +50,16 @@ done | |
|
||
echo "Running SPARK-25299 benchmarks" | ||
|
||
SPARK_GENERATE_BENCHMARK_FILES=1 ./build/sbt "sql/test:runMain org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriterBenchmark" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Not for right now, but if this list grows beyond size 5, let's make a text file with all the classes that we need to benchmark, and then for-loop over it. |
||
SPARK_GENERATE_BENCHMARK_FILES=1 ./build/sbt "sql/test:runMain org.apache.spark.shuffle.sort.SortShuffleWriterBenchmark" | ||
SPARK_GENERATE_BENCHMARK_FILES=1 ./build/sbt "sql/test:runMain org.apache.spark.shuffle.sort.UnsafeShuffleWriterBenchmark" | ||
|
||
SPARK_DIR=`pwd` | ||
|
||
mkdir -p /tmp/artifacts | ||
cp $SPARK_DIR/sql/core/benchmarks/BypassMergeSortShuffleWriterBenchmark-results.txt /tmp/artifacts/ | ||
cp $SPARK_DIR/sql/core/benchmarks/SortShuffleWriterBenchmark-results.txt /tmp/artifacts/ | ||
cp $SPARK_DIR/sql/core/benchmarks/UnsafeShuffleWriterBenchmark-results.txt /tmp/artifacts/ | ||
|
||
if [ "$UPLOAD" = false ]; then | ||
exit 0 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nit: Start args on this line, then 1 arg per line, with 4-space indentation from `def`.