[SPARK-35350][SQL] Add code-gen for left semi sort merge join #32528

c21 · 2021-05-12T23:52:21Z

What changes were proposed in this pull request?

As the title says, this PR adds code-gen support for LEFT SEMI sort merge join. The main change is to add a semiJoin code path in SortMergeJoinExec.doProduce() and to introduce onlyBufferFirstMatchedRow in SortMergeJoinExec.genScanner(). The latter is for left semi sort merge join without a join condition. For this kind of query, we don't need to buffer all matched rows, only the first one (this is the same as the non-code-gen code path).

Example query:

val df1 = spark.range(10).select($"id".as("k1"))
val df2 = spark.range(4).select($"id".as("k2"))
val oneJoinDF = df1.join(df2.hint("SHUFFLE_MERGE"), $"k1" === $"k2", "left_semi")

Example of generated code for the query:

== Subtree 5 / 5 (maxMethodCodeSize:302; maxConstantPoolSize:156(0.24% used); numInnerClasses:0) ==
*(5) Project [id#0L AS k1#2L]
+- *(5) SortMergeJoin [id#0L], [k2#6L], LeftSemi
   :- *(2) Sort [id#0L ASC NULLS FIRST], false, 0
   :  +- Exchange hashpartitioning(id#0L, 5), ENSURE_REQUIREMENTS, [id=#27]
   :     +- *(1) Range (0, 10, step=1, splits=2)
   +- *(4) Sort [k2#6L ASC NULLS FIRST], false, 0
      +- Exchange hashpartitioning(k2#6L, 5), ENSURE_REQUIREMENTS, [id=#33]
         +- *(3) Project [id#4L AS k2#6L]
            +- *(3) Range (0, 4, step=1, splits=2)

Generated code:
/* 001 */ public Object generate(Object[] references) {
/* 002 */   return new GeneratedIteratorForCodegenStage5(references);
/* 003 */ }
/* 004 */
/* 005 */ // codegenStageId=5
/* 006 */ final class GeneratedIteratorForCodegenStage5 extends org.apache.spark.sql.execution.BufferedRowIterator {
/* 007 */   private Object[] references;
/* 008 */   private scala.collection.Iterator[] inputs;
/* 009 */   private scala.collection.Iterator smj_streamedInput_0;
/* 010 */   private scala.collection.Iterator smj_bufferedInput_0;
/* 011 */   private InternalRow smj_streamedRow_0;
/* 012 */   private InternalRow smj_bufferedRow_0;
/* 013 */   private long smj_value_2;
/* 014 */   private org.apache.spark.sql.execution.ExternalAppendOnlyUnsafeRowArray smj_matches_0;
/* 015 */   private long smj_value_3;
/* 016 */   private org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[] smj_mutableStateArray_0 = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter[2];
/* 017 */
/* 018 */   public GeneratedIteratorForCodegenStage5(Object[] references) {
/* 019 */     this.references = references;
/* 020 */   }
/* 021 */
/* 022 */   public void init(int index, scala.collection.Iterator[] inputs) {
/* 023 */     partitionIndex = index;
/* 024 */     this.inputs = inputs;
/* 025 */     smj_streamedInput_0 = inputs[0];
/* 026 */     smj_bufferedInput_0 = inputs[1];
/* 027 */
/* 028 */     smj_matches_0 = new org.apache.spark.sql.execution.ExternalAppendOnlyUnsafeRowArray(1, 2147483647);
/* 029 */     smj_mutableStateArray_0[0] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0);
/* 030 */     smj_mutableStateArray_0[1] = new org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter(1, 0);
/* 031 */
/* 032 */   }
/* 033 */
/* 034 */   private boolean findNextJoinRows(
/* 035 */     scala.collection.Iterator streamedIter,
/* 036 */     scala.collection.Iterator bufferedIter) {
/* 037 */     smj_streamedRow_0 = null;
/* 038 */     int comp = 0;
/* 039 */     while (smj_streamedRow_0 == null) {
/* 040 */       if (!streamedIter.hasNext()) return false;
/* 041 */       smj_streamedRow_0 = (InternalRow) streamedIter.next();
/* 042 */       long smj_value_0 = smj_streamedRow_0.getLong(0);
/* 043 */       if (false) {
/* 044 */         smj_streamedRow_0 = null;
/* 045 */         continue;
/* 046 */
/* 047 */       }
/* 048 */       if (!smj_matches_0.isEmpty()) {
/* 049 */         comp = 0;
/* 050 */         if (comp == 0) {
/* 051 */           comp = (smj_value_0 > smj_value_3 ? 1 : smj_value_0 < smj_value_3 ? -1 : 0);
/* 052 */         }
/* 053 */
/* 054 */         if (comp == 0) {
/* 055 */           return true;
/* 056 */         }
/* 057 */         smj_matches_0.clear();
/* 058 */       }
/* 059 */
/* 060 */       do {
/* 061 */         if (smj_bufferedRow_0 == null) {
/* 062 */           if (!bufferedIter.hasNext()) {
/* 063 */             smj_value_3 = smj_value_0;
/* 064 */             return !smj_matches_0.isEmpty();
/* 065 */           }
/* 066 */           smj_bufferedRow_0 = (InternalRow) bufferedIter.next();
/* 067 */           long smj_value_1 = smj_bufferedRow_0.getLong(0);
/* 068 */           if (false) {
/* 069 */             smj_bufferedRow_0 = null;
/* 070 */             continue;
/* 071 */           }
/* 072 */           smj_value_2 = smj_value_1;
/* 073 */         }
/* 074 */
/* 075 */         comp = 0;
/* 076 */         if (comp == 0) {
/* 077 */           comp = (smj_value_0 > smj_value_2 ? 1 : smj_value_0 < smj_value_2 ? -1 : 0);
/* 078 */         }
/* 079 */
/* 080 */         if (comp > 0) {
/* 081 */           smj_bufferedRow_0 = null;
/* 082 */         } else if (comp < 0) {
/* 083 */           if (!smj_matches_0.isEmpty()) {
/* 084 */             smj_value_3 = smj_value_0;
/* 085 */             return true;
/* 086 */           } else {
/* 087 */             smj_streamedRow_0 = null;
/* 088 */           }
/* 089 */         } else {
/* 090 */           if (smj_matches_0.isEmpty()) {
/* 091 */             smj_matches_0.add((UnsafeRow) smj_bufferedRow_0);
/* 092 */           }
/* 093 */
/* 094 */           smj_bufferedRow_0 = null;
/* 095 */         }
/* 096 */       } while (smj_streamedRow_0 != null);
/* 097 */     }
/* 098 */     return false; // unreachable
/* 099 */   }
/* 100 */
/* 101 */   protected void processNext() throws java.io.IOException {
/* 102 */     while (findNextJoinRows(smj_streamedInput_0, smj_bufferedInput_0)) {
/* 103 */       long smj_value_4 = -1L;
/* 104 */       smj_value_4 = smj_streamedRow_0.getLong(0);
/* 105 */       scala.collection.Iterator<UnsafeRow> smj_iterator_0 = smj_matches_0.generateIterator();
/* 106 */       boolean smj_hasOutputRow_0 = false;
/* 107 */
/* 108 */       while (!smj_hasOutputRow_0 && smj_iterator_0.hasNext()) {
/* 109 */         InternalRow smj_bufferedRow_1 = (InternalRow) smj_iterator_0.next();
/* 110 */
/* 111 */         smj_hasOutputRow_0 = true;
/* 112 */         ((org.apache.spark.sql.execution.metric.SQLMetric) references[0] /* numOutputRows */).add(1);
/* 113 */
/* 114 */         // common sub-expressions
/* 115 */
/* 116 */         smj_mutableStateArray_0[1].reset();
/* 117 */
/* 118 */         smj_mutableStateArray_0[1].write(0, smj_value_4);
/* 119 */         append((smj_mutableStateArray_0[1].getRow()).copy());
/* 120 */
/* 121 */       }
/* 122 */       if (shouldStop()) return;
/* 123 */     }
/* 124 */     ((org.apache.spark.sql.execution.joins.SortMergeJoinExec) references[1] /* plan */).cleanupResources();
/* 125 */   }
/* 126 */
/* 127 */ }

Why are the changes needed?

Improve query CPU performance. Test with one query:

 def sortMergeJoin(): Unit = {
    val N = 2 << 20
    codegenBenchmark("left semi sort merge join", N) {
      val df1 = spark.range(N).selectExpr(s"id * 2 as k1")
      val df2 = spark.range(N).selectExpr(s"id * 3 as k2")
      val df = df1.join(df2, col("k1") === col("k2"), "left_semi")
      assert(df.queryExecution.sparkPlan.find(_.isInstanceOf[SortMergeJoinExec]).isDefined)
      df.noop()
    }
  }

Seeing a 30% run-time improvement:

Running benchmark: left semi sort merge join
  Running case: left semi sort merge join code-gen off
  Stopped after 2 iterations, 1369 ms
  Running case: left semi sort merge join code-gen on
  Stopped after 5 iterations, 2743 ms

Java HotSpot(TM) 64-Bit Server VM 1.8.0_181-b13 on Mac OS X 10.16
Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz
left semi sort merge join:                Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
------------------------------------------------------------------------------------------------------------------------
left semi sort merge join code-gen off              676            685          13          3.1         322.2       1.0X
left semi sort merge join code-gen on               524            549          32          4.0         249.7       1.3X

Does this PR introduce any user-facing change?

No.

How was this patch tested?

Added unit test in WholeStageCodegenSuite.scala and ExistenceJoinSuite.scala.

c21 · 2021-05-12T23:53:02Z

cc @cloud-fan and @maropu could you help take a look when you have time? Thanks.

SparkQA · 2021-05-13T01:11:19Z

Kubernetes integration test starting
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/42997/

SparkQA · 2021-05-13T01:11:20Z

Kubernetes integration test status failure
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/42997/

maropu

I left minor comments; otherwise it looks good. Thank you, @c21

maropu · 2021-05-13T02:44:15Z

sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala

@@ -724,9 +749,32 @@ case class SortMergeJoinExec(
       """.stripMargin
    }

+    lazy val semiJoin = {


How about extracting this block as a private method like codegenXXXX just like HashJoin?

@maropu - yes, I considered that in the first place but was worried that the number of parameters would be too many. I refined the code a bit and have updated it now.

maropu · 2021-05-13T02:46:35Z

sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala

+      case _ => false
+    }
+    val inMemoryThreshold =
+      if (onlyBufferFirstMatchedRow) {


How about moving this branch into the getInMemoryThreshold side?

// Flag to only buffer first matched row, to avoid buffering unnecessary rows.
private lazy val onlyBufferFirstMatchedRow = (joinType, condition) match {
  case (LeftSemi, None) => true
  case _ => false
}

private def getInMemoryThreshold: Int = {
  if (onlyBufferFirstMatchedRow) {
    1
  } else {
    sqlContext.conf.sortMergeJoinExecBufferInMemoryThreshold
  }
}

+1, lazy val can probably be def as the logic is super simple

Good call. Actually, the non-code-gen path can also depend on this, so I made it just a val now.

SparkQA · 2021-05-13T03:44:10Z

Test build #138476 has finished for PR 32528 at commit 8eb55c3.

This patch fails Spark unit tests.
This patch merges cleanly.
This patch adds no public classes.

c21 · 2021-05-13T07:04:44Z

To ease review: the changes to all plan files were generated by the following commands:

SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "sql/testOnly *PlanStabilitySuite" 
SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "sql/testOnly *PlanStabilityWithStatsSuite"

None of them are updated manually.

cloud-fan · 2021-05-13T07:22:59Z

sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala

-      case _: InnerLike => innerJoin
-      case LeftOuter | RightOuter => outerJoin
+      case _: InnerLike =>
+        codegenInner(findNextJoinRows, beforeLoop, iterator, bufferedRow, condCheck, outputRow,


shall we pass beforeLoop.trim so that we don't need to do it in all the 3 methods?

Actually after double checking, we do not need to do beforeLoop.trim as beforeLoop already has stripMargin, and has no trailing spaces. Also updated to avoid repeated conditionCheck.trim

cloud-fan · 2021-05-13T07:24:48Z

sql/core/src/main/scala/org/apache/spark/sql/execution/joins/SortMergeJoinExec.scala

+    s"""
+       |while ($findNextJoinRows) {
+       |  ${beforeLoop.trim}
+       |  boolean $hasOutputRow = false;


do we need this flag if we are sure matchIterator has at most one element?

@cloud-fan - matchIterator will have at most one element only if the join condition is empty. So yes, we don't need this flag if the join condition is empty. But considering that the extra code is just a while-loop check on hasOutputRow and setting the value of hasOutputRow, I don't see much value in specializing another code-gen path for left semi join without a join condition. WDYT?

I see, let's keep it

SparkQA · 2021-05-13T08:18:21Z

Kubernetes integration test starting
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/43020/

SparkQA · 2021-05-13T08:18:22Z

Kubernetes integration test status failure
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/43020/

SparkQA · 2021-05-13T10:28:52Z

Kubernetes integration test starting
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/43026/

SparkQA · 2021-05-13T10:28:54Z

Kubernetes integration test status failure
URL: https://amplab.cs.berkeley.edu/jenkins/job/SparkPullRequestBuilder-K8s/43026/

SparkQA · 2021-05-13T11:41:11Z

Test build #138500 has finished for PR 32528 at commit 979c759.

This patch passes all tests.
This patch merges cleanly.
This patch adds no public classes.

cloud-fan · 2021-05-13T12:52:24Z

thanks, merging to master!

SparkQA · 2021-05-13T14:01:13Z

Test build #138506 has finished for PR 32528 at commit 6282a09.

This patch passes all tests.
This patch merges cleanly.
This patch adds no public classes.

c21 · 2021-05-13T17:22:07Z

Thank you @cloud-fan and @maropu for review!

Left semi sort merge join code-gen

8eb55c3

github-actions bot added the SQL label May 12, 2021

maropu approved these changes May 13, 2021

View reviewed changes

Address all comments and regenerate unit test plan files

979c759

cloud-fan reviewed May 13, 2021

View reviewed changes

Avoid call trim multiple times for beforeLoop and conditionCheck

6282a09

cloud-fan closed this in c1e995a May 13, 2021

c21 deleted the smj-left-semi branch May 13, 2021 17:22

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[SPARK-35350][SQL] Add code-gen for left semi sort merge join #32528

[SPARK-35350][SQL] Add code-gen for left semi sort merge join #32528

c21 commented May 12, 2021

c21 commented May 12, 2021

SparkQA commented May 13, 2021

SparkQA commented May 13, 2021

maropu left a comment

maropu May 13, 2021

c21 May 13, 2021

maropu May 13, 2021

cloud-fan May 13, 2021

c21 May 13, 2021

SparkQA commented May 13, 2021

c21 commented May 13, 2021

cloud-fan May 13, 2021

c21 May 13, 2021 •

edited

Loading

cloud-fan May 13, 2021

c21 May 13, 2021

cloud-fan May 13, 2021

SparkQA commented May 13, 2021

SparkQA commented May 13, 2021

SparkQA commented May 13, 2021

SparkQA commented May 13, 2021

SparkQA commented May 13, 2021

cloud-fan commented May 13, 2021

SparkQA commented May 13, 2021

c21 commented May 13, 2021

[SPARK-35350][SQL] Add code-gen for left semi sort merge join #32528

[SPARK-35350][SQL] Add code-gen for left semi sort merge join #32528

Conversation

c21 commented May 12, 2021

What changes were proposed in this pull request?

Why are the changes needed?

Does this PR introduce any user-facing change?

How was this patch tested?

c21 commented May 12, 2021

SparkQA commented May 13, 2021

SparkQA commented May 13, 2021

maropu left a comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

SparkQA commented May 13, 2021

c21 commented May 13, 2021

Choose a reason for hiding this comment

c21 May 13, 2021 • edited Loading

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

Choose a reason for hiding this comment

SparkQA commented May 13, 2021

SparkQA commented May 13, 2021

SparkQA commented May 13, 2021

SparkQA commented May 13, 2021

SparkQA commented May 13, 2021

cloud-fan commented May 13, 2021

SparkQA commented May 13, 2021

c21 commented May 13, 2021

c21 May 13, 2021 •

edited

Loading