From 74fcde92ed09a28aa8ae82a4b5e77928e2163cae Mon Sep 17 00:00:00 2001 From: minyk Date: Mon, 16 Mar 2015 20:17:36 +0900 Subject: [PATCH 1/2] Fix scale factor. - original value * 100 --- src/main/resources/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/resources/run.sh b/src/main/resources/run.sh index 962b2a6..6ef0461 100644 --- a/src/main/resources/run.sh +++ b/src/main/resources/run.sh @@ -251,7 +251,7 @@ if (($benchmark_result == 1)) then total_time=`expr ${end} - ${start}` total_time_in_hour=$(echo "scale=4;${total_time}/3600" | bc) -scale_factor=$(echo "scale=4;${hssize}/10000000000" | bc) +scale_factor=$(echo "scale=4;${hssize}/1000000000000" | bc) perf_metric=$(echo "scale=4;${scale_factor}/${total_time_in_hour}" | bc) echo -e "${green}$sep============${NC}" | tee -a ./result-"${prefix}".log From a020f3dc80c47c055902bdac836f4d3731c76219 Mon Sep 17 00:00:00 2001 From: minyk Date: Tue, 5 May 2015 15:48:34 +0900 Subject: [PATCH 2/2] Edits from Ewan Higgs' [Spark-Terasort](https://github.com/ehiggs/spark-terasort). - https://github.com/ehiggs/spark-terasort/commit/563059c4a1229edc371f3d225671f8424fafcb86 - https://github.com/ehiggs/spark-terasort/commit/62fe9dbf5e748e7e3213972d2fbf61e3b77fc4d6 --- src/main/scala/com/nexr/spark/terasort/TeraInputFormat.scala | 4 ---- src/main/scala/com/nexr/spark/terasort/TeraSort.scala | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/main/scala/com/nexr/spark/terasort/TeraInputFormat.scala b/src/main/scala/com/nexr/spark/terasort/TeraInputFormat.scala index e2e2a84..b0b63e5 100644 --- a/src/main/scala/com/nexr/spark/terasort/TeraInputFormat.scala +++ b/src/main/scala/com/nexr/spark/terasort/TeraInputFormat.scala @@ -34,10 +34,6 @@ object TeraInputFormat { val KEY_LEN = 10 val VALUE_LEN = 90 val RECORD_LEN = KEY_LEN + VALUE_LEN - val NUM_INPUT_FILES = "mapreduce.input.fileinputformat.numinputfiles" - val NUM_PARTITIONS = "mapreduce.terasort.num.partitions" - val SAMPLE_SIZE = "mapreduce.terasort.partitions.sample" - val SPLIT_SLOP = 1.1 var lastContext : JobContext = null var lastResult : List[InputSplit] = null implicit val caseInsensitiveOrdering = UnsignedBytes.lexicographicalComparator diff --git a/src/main/scala/com/nexr/spark/terasort/TeraSort.scala b/src/main/scala/com/nexr/spark/terasort/TeraSort.scala index f08ad42..eb99eba 100644 --- a/src/main/scala/com/nexr/spark/terasort/TeraSort.scala +++ b/src/main/scala/com/nexr/spark/terasort/TeraSort.scala @@ -38,7 +38,7 @@ object TeraSort { val sc = new SparkContext(conf) val dataset = sc.newAPIHadoopFile[Array[Byte], Array[Byte], TeraInputFormat](inputFile) - val sorted = dataset.sortByKey() + val sorted = dataset.partitionBy(new TeraSortPartitioner(dataset.partitions.length)).sortByKey() sorted.saveAsNewAPIHadoopFile[TeraOutputFormat](outputFile) } } \ No newline at end of file