From a0b373c1be362ffcc7f131f29c608e02a6e47dbc Mon Sep 17 00:00:00 2001
From: Alexey Bataev <a.bataev@outlook.com>
Date: Tue, 24 Oct 2023 08:58:50 -0700
Subject: [PATCH] [SLP]Fix PR70004: Do not change insert point for reduction
 gather nodes.

No need to change the insert point for reduction gather node, we can use
the ReductionRoot as insert point instead to avoid possible crashes.

(cherry picked from commit d79051f894129428ec36dedc6bbfdfdcc1bd0c17)
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    |  3 +-
 ...reduction-gather-non-scheduled-extracts.ll | 44 +++++++++++++++++++
 .../SLPVectorizer/X86/reduction-transpose.ll  | 40 ++++++++---------
 3 files changed, 66 insertions(+), 21 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 821a3fa22a85..9870ffbb586c 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10118,7 +10118,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   }
 
   if (E->State == TreeEntry::NeedToGather) {
-    if (E->getMainOp() && E->Idx == 0)
+    // Set insert point for non-reduction initial nodes.
+    if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList)
       setInsertPointAfterBundle(E);
     Value *Vec = createBuildVector(E);
     E->VectorizedValue = Vec;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll
new file mode 100644
index 000000000000..c760dfedbdae
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll
@@ -0,0 +1,44 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-sie-ps5 < %s | FileCheck %s
+
+define void @tes() {
+; CHECK-LABEL: define void @tes() {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = fcmp ole <2 x double> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    br label [[TMP1:%.*]]
+; CHECK:       1:
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 false, i1 false, i1 false
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <2 x i1> zeroinitializer, <2 x i1> [[TMP0]], <4 x i32> <i32 0, i32 0, i32 0, i32 2>
+; CHECK-NEXT:    [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP4]])
+; CHECK-NEXT:    [[OP_RDX:%.*]] = select i1 [[TMP5]], i1 false, i1 false
+; CHECK-NEXT:    [[OP_RDX1:%.*]] = select i1 [[TMP2]], i1 [[OP_RDX]], i1 false
+; CHECK-NEXT:    br i1 [[OP_RDX1]], label [[TMP6:%.*]], label [[TMP7:%.*]]
+; CHECK:       6:
+; CHECK-NEXT:    ret void
+; CHECK:       7:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = extractelement <2 x i1> zeroinitializer, i64 0
+  %1 = extractelement <2 x i1> zeroinitializer, i64 0
+  %2 = fcmp ole <2 x double> zeroinitializer, zeroinitializer
+  %3 = extractelement <2 x i1> %2, i64 0
+  %4 = extractelement <2 x i1> zeroinitializer, i64 0
+  br label %5
+
+5:
+  %6 = select i1 false, i1 false, i1 false
+  %7 = select i1 %6, i1 %0, i1 false
+  %8 = select i1 %7, i1 %1, i1 false
+  %9 = select i1 %8, i1 false, i1 false
+  %10 = select i1 %9, i1 %3, i1 false
+  %11 = select i1 %10, i1 %4, i1 false
+  br i1 %11, label %12, label %13
+
+12:
+  ret void
+
+13:
+  ret void
+}
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
index 6c7d5e6324ca..4cecfb3fba98 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll
@@ -18,11 +18,11 @@
 define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) {
 ; SSE2-LABEL: @reduce_and4(
 ; SSE2-NEXT:  entry:
-; SSE2-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
-; SSE2-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
-; SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
-; SSE2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]]
+; SSE2-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
+; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
+; SSE2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]]
 ; SSE2-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; SSE2-NEXT:    ret i32 [[OP_RDX1]]
 ;
@@ -40,11 +40,11 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <
 ;
 ; AVX-LABEL: @reduce_and4(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
-; AVX-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
-; AVX-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
-; AVX-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]]
+; AVX-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; AVX-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]])
+; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 5, i32 4, i32 6, i32 7>
+; AVX-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
+; AVX-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]]
 ; AVX-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; AVX-NEXT:    ret i32 [[OP_RDX1]]
 ;
@@ -94,11 +94,11 @@ entry:
 
 define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) {
 ; SSE2-LABEL: @reduce_and4_transpose(
-; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; SSE2-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; SSE2-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
-; SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
-; SSE2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]]
+; SSE2-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; SSE2-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; SSE2-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; SSE2-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
+; SSE2-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]]
 ; SSE2-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; SSE2-NEXT:    ret i32 [[OP_RDX1]]
 ;
@@ -114,11 +114,11 @@ define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i
 ; SSE42-NEXT:    ret i32 [[OP_RDX3]]
 ;
 ; AVX-LABEL: @reduce_and4_transpose(
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; AVX-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
-; AVX-NEXT:    [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]])
-; AVX-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
-; AVX-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]]
+; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; AVX-NEXT:    [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]])
+; AVX-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4>
+; AVX-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
+; AVX-NEXT:    [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]]
 ; AVX-NEXT:    [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]]
 ; AVX-NEXT:    ret i32 [[OP_RDX1]]
 ;