[LLVM] Optimize G_SHUFFLE_VECTOR into more efficient generic opcodes #41

Open · wants to merge 9 commits into base: aie-public
29 changes: 22 additions & 7 deletions llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -27,6 +27,7 @@
#include "llvm/CodeGenTypes/LowLevelType.h"
#include "llvm/IR/InstrTypes.h"
#include <functional>
#include <optional>

namespace llvm {

@@ -245,19 +246,33 @@ class CombinerHelper {
/// or an implicit_def if \p Ops is empty.
void applyCombineShuffleConcat(MachineInstr &MI, SmallVector<Register> &Ops);

/// Try to combine G_SHUFFLE_VECTOR into G_CONCAT_VECTORS.
/// A function type that returns either the next value in a
/// shuffle mask or an empty optional once the sequence is exhausted.
/// Each call yields one value, like a Python iterator or a Lisp stream.
using GeneratorType = std::function<std::optional<int32_t>()>;
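
For illustration only, a minimal standalone sketch of this generator contract: an optional-returning callable that is pulled one value at a time until it reports exhaustion with std::nullopt. The makeCounter helper and the driver below are assumptions for demonstration and are not part of this patch.

#include <cstdint>
#include <functional>
#include <iostream>
#include <optional>

using GeneratorType = std::function<std::optional<int32_t>()>;

// Illustrative helper: yields First, First + 1, ..., Last, then std::nullopt
// on every later call.
static GeneratorType makeCounter(int32_t First, int32_t Last) {
  int32_t Next = First;
  return [Next, Last]() mutable -> std::optional<int32_t> {
    if (Next > Last)
      return std::nullopt;
    return Next++;
  };
}

int main() {
  GeneratorType Gen = makeCounter(0, 3);
  // Drain the generator the same way the combiner walks a shuffle mask:
  // one value per call until the stream is exhausted.
  while (std::optional<int32_t> Value = Gen())
    std::cout << *Value << ' '; // prints: 0 1 2 3
  std::cout << '\n';
  return 0;
}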

/// Try to combine G_SHUFFLE_VECTOR into more efficient opcodes.
/// Returns true if MI changed.
///
/// \pre MI.getOpcode() == G_SHUFFLE_VECTOR.
bool tryCombineShuffleVector(MachineInstr &MI);
/// Check if the G_SHUFFLE_VECTOR \p MI can be replaced by a
/// concat_vectors.
/// \p Ops will contain the operands needed to produce the flattened
/// concat_vectors.
/// Check if the G_SHUFFLE_VECTOR \p MI can be replaced, by checking
/// whether its shuffle mask matches the indices produced by the given
/// generator.
///
/// \pre MI.getOpcode() == G_SHUFFLE_VECTOR.
bool matchCombineShuffleVector(MachineInstr &MI,
SmallVectorImpl<Register> &Ops);
bool matchCombineShuffleVector(MachineInstr &MI, GeneratorType Generator,
const size_t TargetDstSize);

/// Create G_UNMERGE_VALUES instructions until the source has been narrowed
/// down to the target vector size.
///
/// Requires that the destination type divides the source register evenly.
/// The \p DestinationIndex parameter selects which of the destination-sized
/// slices of the source to extract.
Register createUnmergeValue(MachineInstr &MI, const Register SrcReg,
const Register DstReg, uint8_t DestinationIndex,
const uint32_t Start, const uint32_t End);

Comment on lines +272 to +275 (Collaborator Author):
G_UNMERGE_VALUE isn't really meant to be used for vectors like this, it is actually defined as a scalar opcode. There is a nice drop-in replacement opcode called G_EXTRACT_SUBVECTOR which has landed in our tree in early August, but there is no support in other backends for this yet. So this implementation is a necessary evil until the other opcode is able to replace it.

/// Replace \p MI with a concat_vectors with \p Ops.
void applyCombineShuffleVector(MachineInstr &MI,
const ArrayRef<Register> Ops);
248 changes: 213 additions & 35 deletions llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -42,6 +42,8 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#include <cmath>
#include <cstdint>
#include <functional>
#include <optional>
#include <tuple>

@@ -384,17 +386,220 @@ void CombinerHelper::applyCombineShuffleConcat(MachineInstr &MI,
MI.eraseFromParent();
}

// Create a stream of values from From to To (inclusive), advancing by
// StepSize on each call
CombinerHelper::GeneratorType
adderGenerator(const int32_t From, const int32_t To, const int32_t StepSize) {
int32_t Counter = From;
return [Counter, To, StepSize]() mutable {
std::optional<int32_t> OldCount = std::optional<int32_t>(Counter);
Counter += StepSize;
if (OldCount == (To + StepSize))
OldCount = {};
return OldCount;
};
}

// Chain generators: once the current generator is exhausted, move on to the
// next one
CombinerHelper::GeneratorType
concatGenerators(SmallVector<CombinerHelper::GeneratorType> &Generators) {
auto *GeneratorIterator = Generators.begin();

return [GeneratorIterator, Generators]() mutable {
std::optional<int32_t> GenValue = (*GeneratorIterator)();
if (!GenValue.has_value() && GeneratorIterator != Generators.end()) {
GeneratorIterator++;
GenValue = (*GeneratorIterator)();
}
return GenValue;
};
}
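
To make the two helpers above concrete, here is a standalone sketch that mirrors adderGenerator and concatGenerators with std::vector in place of SmallVector; the adder and concat names and the driver are illustrative assumptions rather than code from this patch.

#include <cstdint>
#include <functional>
#include <iostream>
#include <optional>
#include <vector>

using GeneratorType = std::function<std::optional<int32_t>()>;

// Mirror of adderGenerator: counts From, From + StepSize, ..., To, then
// reports exhaustion.
static GeneratorType adder(int32_t From, int32_t To, int32_t StepSize) {
  int32_t Counter = From;
  return [Counter, To, StepSize]() mutable -> std::optional<int32_t> {
    std::optional<int32_t> OldCount = Counter;
    Counter += StepSize;
    if (*OldCount == To + StepSize)
      OldCount = std::nullopt;
    return OldCount;
  };
}

// Mirror of concatGenerators: hand out values from the current generator and
// switch to the next one once it is exhausted.
static GeneratorType concat(std::vector<GeneratorType> Generators) {
  size_t Index = 0;
  return [Index, Generators]() mutable -> std::optional<int32_t> {
    std::optional<int32_t> Value = Generators[Index]();
    if (!Value && Index + 1 < Generators.size())
      Value = Generators[++Index]();
    return Value;
  };
}

int main() {
  // adder(0, 3, 1) yields 0 1 2 3 and adder(8, 11, 1) yields 8 9 10 11.
  GeneratorType Chained = concat({adder(0, 3, 1), adder(8, 11, 1)});
  while (std::optional<int32_t> V = Chained())
    std::cout << *V << ' '; // prints: 0 1 2 3 8 9 10 11
  std::cout << '\n';
  return 0;
}

Chaining two adders like this is exactly how tryCombineShuffleVector below builds its FirstAndThird matcher out of FirstEightA and FirstEightB.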

Register CombinerHelper::createUnmergeValue(
Comment (Collaborator):
Curious: Maybe IRTranslator already has a similar piece of code that we can re-use?

Reply (Collaborator Author):
Not from what I can see, G_UNMERGE_VALUES isn't meant for vectors to begin with. It only supports scalars, according to the documentation. Most backends just support this as an extension since there is no other opcode to do it. In theory, this will be ripped out in favour of a simple G_SUBREGISTER_EXTRACT in the near future if that opcode gets proper support. Utils.cpp does have something that is a bit similar, called llvm::extractParts, but that splits into two parts while this code goes through it like a binary tree.

In any case, I would keep it like this since this code should be replaced before upstreaming and that is easier if we are the only ones depending on it.

MachineInstr &MI, const Register SrcReg, const Register DstReg,
const uint8_t DestinationIndex, const uint32_t Start, const uint32_t End) {
Builder.setInsertPt(*MI.getParent(), MI);
const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(SrcReg);
assert((DstTy.isScalar() ||
(SrcTy.getNumElements() % DstTy.getNumElements()) == 0) &&
"destination vector must divide source cleanly");

const unsigned HalfElements = SrcTy.getNumElements() / 2;
const LLT ScalarTy = SrcTy.getScalarType();
const LLT HalfSizeTy = (HalfElements == 1)
? ScalarTy
: LLT::fixed_vector(HalfElements, ScalarTy);
const Register TmpReg = MRI.createGenericVirtualRegister(HalfSizeTy);
Register TargetReg = DstReg;
if (DstTy != HalfSizeTy) {
TargetReg = MRI.createGenericVirtualRegister(HalfSizeTy);
}

// Each destination fits n times into the source and each iteration halves the
// source exactly. Therefore we need to pick which half we want to recurse
// into.
const uint32_t DstNumElements = DstTy.isVector() ? DstTy.getNumElements() : 1;
const uint32_t HalfWay = Start + ((End - Start) / 2);
const uint32_t Position = DestinationIndex * DstNumElements;

uint32_t NextStart, NextEnd;
if (Position < HalfWay) {
Builder.buildInstr(TargetOpcode::G_UNMERGE_VALUES, {TargetReg, TmpReg},
{SrcReg});
NextStart = Start;
NextEnd = HalfWay;
} else {
Builder.buildInstr(TargetOpcode::G_UNMERGE_VALUES, {TmpReg, TargetReg},
{SrcReg});
NextStart = HalfWay;
NextEnd = End;
}

if (HalfSizeTy.isVector() && DstTy != HalfSizeTy)
return createUnmergeValue(MI, TargetReg, DstReg, DestinationIndex,
NextStart, NextEnd);

return DstReg;
}
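
The recursion above walks the source like a binary tree: every step emits one G_UNMERGE_VALUES, keeps the half that contains the requested destination slice, and stops once that half has the destination size. Below is a standalone sketch of just the index arithmetic under the same even-divisibility assumption; traceUnmerge and its printed trace are illustrative and not part of the patch.

#include <cstdint>
#include <iostream>

// Follow the half of [Start, End) that contains the destination slice until
// the remaining range is exactly one destination wide.
static void traceUnmerge(uint32_t DstNumElements, uint8_t DestinationIndex,
                         uint32_t Start, uint32_t End) {
  if (End - Start == DstNumElements)
    return; // The remaining half already has the destination size.
  const uint32_t HalfWay = Start + (End - Start) / 2;
  const uint32_t Position = DestinationIndex * DstNumElements;
  if (Position < HalfWay) {
    std::cout << "G_UNMERGE_VALUES keeps elements [" << Start << ", "
              << HalfWay << ")\n";
    traceUnmerge(DstNumElements, DestinationIndex, Start, HalfWay);
  } else {
    std::cout << "G_UNMERGE_VALUES keeps elements [" << HalfWay << ", " << End
              << ")\n";
    traceUnmerge(DstNumElements, DestinationIndex, HalfWay, End);
  }
}

int main() {
  // Extract the third <4 x s8> slice (index 2) out of a <16 x s8> source.
  traceUnmerge(/*DstNumElements=*/4, /*DestinationIndex=*/2, /*Start=*/0,
               /*End=*/16);
  // Prints:
  //   G_UNMERGE_VALUES keeps elements [8, 16)
  //   G_UNMERGE_VALUES keeps elements [8, 12)
  return 0;
}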

bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
const Register DstReg = MI.getOperand(0).getReg();
const Register SrcReg1 = MI.getOperand(1).getReg();
const Register SrcReg2 = MI.getOperand(2).getReg();

const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());

const unsigned DstNumElts = DstTy.isVector() ? DstTy.getNumElements() : 1;
const unsigned SrcNumElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;

// This test is a bit silly, but it is required because some tests rely on
// the legalizer changing the type of the shufflevector.
if (DstTy.getScalarSizeInBits() == 1)
return false;

Comment on lines +478 to +480 (Collaborator Author):
Yeah, this is a bit weird. There is an ARM64 test which creates an array of 8 1-bit numbers that the legalizer turns into 8-bit numbers. At the moment, their legalizer doesn't do this for some of the replacement opcodes.

// {1, 2, ..., n} -> G_CONCAT_VECTORS
// Turns a shuffle vector whose mask simply counts upwards into a
// G_CONCAT_VECTORS instruction
GeneratorType CountUp = adderGenerator(0, DstNumElts - 1, 1);
SmallVector<Register, 4> Ops;
if (matchCombineShuffleVector(MI, Ops)) {

if (matchCombineShuffleVector(MI, CountUp, 2 * SrcNumElts)) {
// The shuffle is concatenating multiple vectors together.
// Collect the different operands for that.
Register UndefReg;
const Register Src1 = MI.getOperand(1).getReg();
const Register Src2 = MI.getOperand(2).getReg();
const ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();

// The destination can be longer than the source, so we separate them into
// equal blocks and check them separately to see if one of the blocks can be
// copied whole.
unsigned NumConcat = DstNumElts / SrcNumElts;
unsigned Index = 0;
for (unsigned Concat = 0; Concat < NumConcat; Concat++) {
unsigned Target = (Concat + 1) * SrcNumElts;
while (Index < Target) {
int MaskElt = Mask[Index];
if (MaskElt >= 0) {
Ops.push_back((MaskElt < (int)SrcNumElts) ? Src1 : Src2);
break;
}
Index++;
}

if (Index == Target) {
if (!UndefReg) {
Builder.setInsertPt(*MI.getParent(), MI);
UndefReg = Builder.buildUndef(SrcTy).getReg(0);
}
Ops.push_back(UndefReg);
}

Index = Target;
}

applyCombineShuffleVector(MI, Ops);
return true;
}

// {1, 2, ..., |DstVector|} -> G_UNMERGE_VALUES
// Extracts the first chunk of the same size as the destination vector from
// the source
GeneratorType FirstQuarter = adderGenerator(0, DstNumElts - 1, 1);
if (matchCombineShuffleVector(MI, FirstQuarter, DstNumElts - 1)) {
// This optimization does not work if the target type is not a multiple of
// two; this can happen in some backends that support uneven vector types.
// We also need to make sure that the vector can be split into two.
if (SrcTy == DstTy || ((SrcNumElts / 2) % 2) != 0 ||
SrcNumElts % DstNumElts != 0)
return false;
ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
const Register TargetReg = Mask[0] < (int)SrcNumElts ? SrcReg1 : SrcReg2;
createUnmergeValue(MI, TargetReg, DstReg, 0, 0, SrcNumElts);
MI.eraseFromParent();
return true;
}

// {|DstVector|, |DstVector|+1, ..., 2 * |DstVector|} -> G_UNMERGE_VALUES
// Extracts the second chunk of the same size as the destination vector from
// the source
GeneratorType SecondQuarter =
adderGenerator(DstNumElts, (DstNumElts * 2) - 1, 1);
if (matchCombineShuffleVector(MI, SecondQuarter, DstNumElts - 1)) {
if (((SrcNumElts / 2) % 2) != 0 || SrcNumElts % DstNumElts != 0)
return false;
ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
const Register TargetReg = Mask[0] < (int)SrcNumElts ? SrcReg1 : SrcReg2;
createUnmergeValue(MI, TargetReg, DstReg, 1, 0, SrcNumElts);
MI.eraseFromParent();
return true;
}

// After this point, it is assumed that the shuffle vectors work on vectors
// that can be split into two
if ((DstNumElts % 2) != 0)
return false;

// {1, 2, ..., n/4, n/2, n/2+1, ..., 3n/4} -> G_UNMERGE_VALUES
// Take the first halves of the two vectors and concatenate them into one
// vector.
GeneratorType FirstEightA = adderGenerator(0, (DstNumElts / 2) - 1, 1);
GeneratorType FirstEightB =
adderGenerator(DstNumElts, DstNumElts + (DstNumElts / 2) - 1, 1);

auto UnmergeMatcher = SmallVector<GeneratorType>{FirstEightA, FirstEightB};
GeneratorType FirstAndThird = concatGenerators(UnmergeMatcher);
if (matchCombineShuffleVector(MI, FirstAndThird, (DstNumElts / 2) - 1)) {
if (DstNumElts <= 2)
return false;
Comment on lines +574 to +575 (Collaborator Author):
Maybe do these tests in a callback? Now, if one fails, it will exit the combiner completely rather than continuing on. For now this is fine, but with more patterns that are close, this might become a problem.

const Register DstReg = MI.getOperand(0).getReg();
const LLT HalfSrcTy =
LLT::fixed_vector(SrcNumElts / 2, SrcTy.getScalarType());
const Register HalfOfA = createUnmergeValue(
MI, MI.getOperand(1).getReg(),
MRI.createGenericVirtualRegister(HalfSrcTy), 0, 0, SrcNumElts);
const Register HalfOfB = createUnmergeValue(
MI, MI.getOperand(2).getReg(),
MRI.createGenericVirtualRegister(HalfSrcTy), 0, 0, SrcNumElts);

const ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
if (Mask[0] <= 0) {
Builder.buildMergeLikeInstr(DstReg, {HalfOfA, HalfOfB});
} else {
Builder.buildMergeLikeInstr(DstReg, {HalfOfB, HalfOfA});
}

MI.eraseFromParent();
return true;
}

return false;
}
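
To summarise the dispatch above with concrete masks, the following annotated sketch assumes two <4 x s32> source vectors, so mask indices 0-3 address the first source and 4-7 the second; the shapes and masks are illustrative assumptions, not cases lifted from the test suite.

#include <array>
#include <cstdint>

// <8 x s32> destination, mask counts straight through both sources: matched
// by the CountUp generator and rewritten to G_CONCAT_VECTORS %src1, %src2.
constexpr std::array<int32_t, 8> ConcatMask = {0, 1, 2, 3, 4, 5, 6, 7};

// <2 x s32> destination, first destination-sized chunk of %src1: matched by
// FirstQuarter and rewritten to a G_UNMERGE_VALUES keeping the low half.
constexpr std::array<int32_t, 2> FirstChunkMask = {0, 1};

// <2 x s32> destination, second chunk of %src1: matched by SecondQuarter and
// rewritten to a G_UNMERGE_VALUES keeping the high half.
constexpr std::array<int32_t, 2> SecondChunkMask = {2, 3};

// <4 x s32> destination, low half of each source: matched by FirstAndThird
// and rewritten to two G_UNMERGE_VALUES plus a G_CONCAT_VECTORS of the halves.
constexpr std::array<int32_t, 4> LowHalvesMask = {0, 1, 4, 5};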

bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI,
SmallVectorImpl<Register> &Ops) {
GeneratorType Generator,
const size_t TargetDstSize) {
assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
"Invalid instruction kind");
LLT DstType = MRI.getType(MI.getOperand(0).getReg());
@@ -421,51 +626,24 @@ bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI,
//
// TODO: If the size between the source and destination don't match
// we could still emit an extract vector element in that case.
if (DstNumElts < 2 * SrcNumElts && DstNumElts != 1)
return false;

// Check that the shuffle mask can be broken evenly between the
// different sources.
if (DstNumElts % SrcNumElts != 0)
if ((DstNumElts < TargetDstSize) && DstNumElts != 1)
return false;

// Mask length is a multiple of the source vector length.
// Check if the shuffle is some kind of concatenation of the input
// vectors.
unsigned NumConcat = DstNumElts / SrcNumElts;
SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
for (unsigned i = 0; i != DstNumElts; ++i) {
int Idx = Mask[i];
const int32_t ShiftIndex = Generator().value_or(-1);

// Undef value.
if (Idx < 0)
if (Idx < 0 || ShiftIndex < 0)
continue;

// Ensure the indices in each SrcType sized piece are sequential and that
// the same source is used for the whole piece.
if ((Idx % SrcNumElts != (i % SrcNumElts)) ||
(ConcatSrcs[i / SrcNumElts] >= 0 &&
ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts)))
if ((Idx % SrcNumElts != (ShiftIndex % SrcNumElts)))
return false;
// Remember which source this index came from.
ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
}

// The shuffle is concatenating multiple vectors together.
// Collect the different operands for that.
Register UndefReg;
Register Src2 = MI.getOperand(2).getReg();
for (auto Src : ConcatSrcs) {
if (Src < 0) {
if (!UndefReg) {
Builder.setInsertPt(*MI.getParent(), MI);
UndefReg = Builder.buildUndef(SrcType).getReg(0);
}
Ops.push_back(UndefReg);
} else if (Src == 0)
Ops.push_back(Src1);
else
Ops.push_back(Src2);
}
return true;
}
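
The rewritten loop above no longer collects concatenation operands itself; it only verifies that every defined mask element agrees, modulo the source width, with the index produced by the generator. A standalone sketch of that check, where maskMatchesGenerator is an illustrative name rather than an API introduced by this patch:

#include <cstdint>
#include <functional>
#include <optional>
#include <vector>

using GeneratorType = std::function<std::optional<int32_t>()>;

// Walk the shuffle mask in lock-step with the generator. Undef mask elements
// (negative values) and an exhausted generator are skipped, mirroring the
// continue statement in the combiner; any defined element that lands on a
// different lane than the generated index rejects the match.
static bool maskMatchesGenerator(const std::vector<int32_t> &Mask,
                                 GeneratorType Generator,
                                 int32_t SrcNumElts) {
  for (const int32_t Idx : Mask) {
    const int32_t ShiftIndex = Generator().value_or(-1);
    if (Idx < 0 || ShiftIndex < 0)
      continue;
    if (Idx % SrcNumElts != ShiftIndex % SrcNumElts)
      return false;
  }
  return true;
}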

4 changes: 4 additions & 0 deletions llvm/lib/Target/AIE/AIE2PreLegalizerCombiner.cpp
@@ -22,6 +22,7 @@
#include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/IR/IntrinsicsAIE2.h"
#include "llvm/InitializePasses.h"

@@ -167,6 +168,9 @@ bool AIE2PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const {
case TargetOpcode::G_INTRINSIC: {
return tryToCombineIntrinsic(MI);
}
case TargetOpcode::G_SHUFFLE_VECTOR: {
return Helper.tryCombineShuffleVector(MI);
}
default:
break;
}
@@ -1,5 +1,6 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
# RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s
# Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates

---
name: shuffle_concat_1
@@ -101,7 +102,9 @@ body: |
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %a(<4 x s8>), %b(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>)
; CHECK-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %c(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>)
; CHECK-NEXT: %z:_(<16 x s8>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<16 x s8>), [[CONCAT_VECTORS1]], shufflemask(0, undef, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, undef, undef, undef, undef)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s8>), [[UV1:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s8>)
; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s8>), [[UV3:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<16 x s8>)
; CHECK-NEXT: %z:_(<16 x s8>) = G_CONCAT_VECTORS [[UV]](<8 x s8>), [[UV2]](<8 x s8>)
; CHECK-NEXT: $q0 = COPY %z(<16 x s8>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%p1:_(p0) = COPY $x0
@@ -179,7 +182,9 @@ body: |
; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF
; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %a(<4 x s8>), %b(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>)
; CHECK-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %c(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>)
; CHECK-NEXT: %z:_(<16 x s8>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<16 x s8>), [[CONCAT_VECTORS1]], shufflemask(undef, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, undef, undef, undef, undef)
; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s8>), [[UV1:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s8>)
; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s8>), [[UV3:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<16 x s8>)
; CHECK-NEXT: %z:_(<16 x s8>) = G_CONCAT_VECTORS [[UV]](<8 x s8>), [[UV2]](<8 x s8>)
; CHECK-NEXT: $q0 = COPY %z(<16 x s8>)
; CHECK-NEXT: RET_ReallyLR implicit $q0
%p1:_(p0) = COPY $x0