From a0b373c1be362ffcc7f131f29c608e02a6e47dbc Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 24 Oct 2023 08:58:50 -0700 Subject: [PATCH] [SLP]Fix PR70004: Do not change insert point for reduction gather nodes. No need to change the insert point for reduction gather node, we can use the ReductionRoot as insert point instead to avoid possible crashes. (cherry picked from commit d79051f894129428ec36dedc6bbfdfdcc1bd0c17) --- .../Transforms/Vectorize/SLPVectorizer.cpp | 3 +- ...reduction-gather-non-scheduled-extracts.ll | 44 +++++++++++++++++++ .../SLPVectorizer/X86/reduction-transpose.ll | 40 ++++++++--------- 3 files changed, 66 insertions(+), 21 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 821a3fa22a85..9870ffbb586c 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -10118,7 +10118,8 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } if (E->State == TreeEntry::NeedToGather) { - if (E->getMainOp() && E->Idx == 0) + // Set insert point for non-reduction initial nodes. + if (E->getMainOp() && E->Idx == 0 && !UserIgnoreList) setInsertPointAfterBundle(E); Value *Vec = createBuildVector(E); E->VectorizedValue = Vec; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll new file mode 100644 index 000000000000..c760dfedbdae --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-sie-ps5 < %s | FileCheck %s + +define void @tes() { +; CHECK-LABEL: define void @tes() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = fcmp ole <2 x double> zeroinitializer, zeroinitializer +; CHECK-NEXT: br label [[TMP1:%.*]] +; CHECK: 1: +; CHECK-NEXT: [[TMP2:%.*]] = select i1 false, i1 false, i1 false +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i1> zeroinitializer, <2 x i1> [[TMP0]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = freeze <4 x i1> [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.and.v4i1(<4 x i1> [[TMP4]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = select i1 [[TMP5]], i1 false, i1 false +; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP2]], i1 [[OP_RDX]], i1 false +; CHECK-NEXT: br i1 [[OP_RDX1]], label [[TMP6:%.*]], label [[TMP7:%.*]] +; CHECK: 6: +; CHECK-NEXT: ret void +; CHECK: 7: +; CHECK-NEXT: ret void +; +entry: + %0 = extractelement <2 x i1> zeroinitializer, i64 0 + %1 = extractelement <2 x i1> zeroinitializer, i64 0 + %2 = fcmp ole <2 x double> zeroinitializer, zeroinitializer + %3 = extractelement <2 x i1> %2, i64 0 + %4 = extractelement <2 x i1> zeroinitializer, i64 0 + br label %5 + +5: + %6 = select i1 false, i1 false, i1 false + %7 = select i1 %6, i1 %0, i1 false + %8 = select i1 %7, i1 %1, i1 false + %9 = select i1 %8, i1 false, i1 false + %10 = select i1 %9, i1 %3, i1 false + %11 = select i1 %10, i1 %4, i1 false + br i1 %11, label %12, label %13 + +12: + ret void + +13: + ret void +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll index 6c7d5e6324ca..4cecfb3fba98 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-transpose.ll @@ -18,11 +18,11 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) { ; SSE2-LABEL: @reduce_and4( ; SSE2-NEXT: entry: -; SSE2-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> -; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) -; SSE2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]]) -; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]] +; SSE2-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]]) +; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]]) +; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]] ; SSE2-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] ; SSE2-NEXT: ret i32 [[OP_RDX1]] ; @@ -40,11 +40,11 @@ define i32 @reduce_and4(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, < ; ; AVX-LABEL: @reduce_and4( ; AVX-NEXT: entry: -; AVX-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; AVX-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) -; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]]) -; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP3]] +; AVX-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; AVX-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP0]]) +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]]) +; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP1]], [[TMP3]] ; AVX-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] ; AVX-NEXT: ret i32 [[OP_RDX1]] ; @@ -94,11 +94,11 @@ entry: define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i32> %v3, <4 x i32> %v4) { ; SSE2-LABEL: @reduce_and4_transpose( -; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> -; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; SSE2-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]]) -; SSE2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) -; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]] +; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) +; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; SSE2-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) +; SSE2-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]] ; SSE2-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] ; SSE2-NEXT: ret i32 [[OP_RDX1]] ; @@ -114,11 +114,11 @@ define i32 @reduce_and4_transpose(i32 %acc, <4 x i32> %v1, <4 x i32> %v2, <4 x i ; SSE42-NEXT: ret i32 [[OP_RDX3]] ; ; AVX-LABEL: @reduce_and4_transpose( -; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> -; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> -; AVX-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP2]]) -; AVX-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) -; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP3]], [[TMP4]] +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[V4:%.*]], <4 x i32> [[V3:%.*]], <8 x i32> +; AVX-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP1]]) +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[V2:%.*]], <4 x i32> [[V1:%.*]], <8 x i32> +; AVX-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) +; AVX-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP2]], [[TMP4]] ; AVX-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[ACC:%.*]] ; AVX-NEXT: ret i32 [[OP_RDX1]] ;