[VectorCombine] Try to reduce shuffle cost for commutative reduction operands

Given a shuffle feeding a commutative reduction, the lane ordering of the shuffle will not alter the result. This is also true if there are a number of operations between the reduction and the shuffle, providing they only operate lane-wise. This patch searches for cases like that in Vector Combine, allowing us to check the cost of the shuffle vs an in-order identity shuffle and replace the order if possible. This only handles a single shuffle at the moment to keep things simple, and is able to ignore splats that produce results where every result is the same. This is a more powerful version of a combine that already happens in instrcombine, capable of optimizing more cases by looking through more instructions and being able to cost the shuffle. Differential Revision: https://reviews.llvm.org/D123494
2022-04-28 19:46:12 +01:00 · 2022-04-28 19:46:12 +01:00 · ded8187e35
parent 9a4472c56c
commit ded8187e35
2 changed files with 135 additions and 20 deletions
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//

 #include "llvm/Transforms/Vectorize/VectorCombine.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
@ -103,11 +104,12 @@ private:
  bool foldSingleElementStore(Instruction &I);
  bool scalarizeLoadExtract(Instruction &I);
  bool foldShuffleOfBinops(Instruction &I);
+  bool foldShuffleFromReductions(Instruction &I);

  void replaceValue(Value &Old, Value &New) {
    Old.replaceAllUsesWith(&New);
-    New.takeName(&Old);
    if (auto *NewI = dyn_cast<Instruction>(&New)) {
+      New.takeName(&Old);
      Worklist.pushUsersToWorkList(*NewI);
      Worklist.pushValue(NewI);
    }
@ -1113,6 +1115,118 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
  return true;
 }

+/// Given a commutative reduction, the order of the input lanes does not alter
+/// the results. We can use this to remove certain shuffles feeding the
+/// reduction, removing the need to shuffle at all.
+bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
+  auto *II = dyn_cast<IntrinsicInst>(&I);
+  if (!II)
+    return false;
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::vector_reduce_add:
+  case Intrinsic::vector_reduce_mul:
+  case Intrinsic::vector_reduce_and:
+  case Intrinsic::vector_reduce_or:
+  case Intrinsic::vector_reduce_xor:
+  case Intrinsic::vector_reduce_smin:
+  case Intrinsic::vector_reduce_smax:
+  case Intrinsic::vector_reduce_umin:
+  case Intrinsic::vector_reduce_umax:
+    break;
+  default:
+    return false;
+  }
+
+  // Find all the inputs when looking through operations that do not alter the
+  // lane order (binops, for example). Currently we look for a single shuffle,
+  // and can ignore splat values.
+  std::queue<Value *> Worklist;
+  SmallPtrSet<Value *, 4> Visited;
+  ShuffleVectorInst *Shuffle = nullptr;
+  if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
+    Worklist.push(Op);
+
+  while (!Worklist.empty()) {
+    Value *CV = Worklist.front();
+    Worklist.pop();
+    if (Visited.contains(CV))
+      continue;
+
+    // Splats don't change the order, so can be safely ignored.
+    if (isSplatValue(CV))
+      continue;
+
+    Visited.insert(CV);
+
+    if (auto *CI = dyn_cast<Instruction>(CV)) {
+      if (CI->isBinaryOp()) {
+        for (auto *Op : CI->operand_values())
+          Worklist.push(Op);
+        continue;
+      } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
+        if (Shuffle && Shuffle != SV)
+          return false;
+        Shuffle = SV;
+        continue;
+      }
+    }
+
+    // Anything else is currently an unknown node.
+    return false;
+  }
+
+  if (!Shuffle)
+    return false;
+
+  // Check all uses of the binary ops and shuffles are also included in the
+  // lane-invariant operations (Visited should be the list of lanewise
+  // instructions, including the shuffle that we found).
+  for (auto *V : Visited)
+    for (auto *U : V->users())
+      if (!Visited.contains(U) && U != &I)
+        return false;
+
+  FixedVectorType *VecType =
+      dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
+  if (!VecType)
+    return false;
+  FixedVectorType *ShuffleInputType =
+      dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
+  if (!ShuffleInputType)
+    return false;
+  int NumInputElts = ShuffleInputType->getNumElements();
+
+  // Find the mask from sorting the lanes into order. This is most likely to
+  // become a identity or concat mask. Undef elements are pushed to the end.
+  SmallVector<int> ConcatMask;
+  Shuffle->getShuffleMask(ConcatMask);
+  sort(ConcatMask, [](int X, int Y) {
+    return Y == UndefMaskElem ? true : (X == UndefMaskElem ? false : X < Y);
+  });
+  bool UsesSecondVec =
+      any_of(ConcatMask, [&](int M) { return M >= NumInputElts; });
+  InstructionCost OldCost = TTI.getShuffleCost(
+      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
+      Shuffle->getShuffleMask());
+  InstructionCost NewCost = TTI.getShuffleCost(
+      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
+      ConcatMask);
+
+  LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
+                    << "\n");
+  LLVM_DEBUG(dbgs() << "  OldCost: " << OldCost << " vs NewCost: " << NewCost
+                    << "\n");
+  if (NewCost < OldCost) {
+    Builder.SetInsertPoint(Shuffle);
+    Value *NewShuffle = Builder.CreateShuffleVector(
+        Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
+    LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
+    replaceValue(*Shuffle, *NewShuffle);
+  }
+
+  return false;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 bool VectorCombine::run() {
@ -1132,6 +1246,7 @@ bool VectorCombine::run() {
      MadeChange |= foldBitcastShuf(I);
      MadeChange |= foldExtractedCmps(I);
      MadeChange |= foldShuffleOfBinops(I);
+      MadeChange |= foldShuffleFromReductions(I);
    }
    MadeChange |= scalarizeBinopOrCmp(I);
    MadeChange |= scalarizeLoadExtract(I);
--- a/llvm/test/Transforms/VectorCombine/AArch64/vecreduce-shuffle.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/vecreduce-shuffle.ll
@ -16,7 +16,7 @@ define i32 @reducebase_v4i32(<4 x i32> %a, <4 x i32> %b) {

 define i32 @reduceshuffle_onein_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: @reduceshuffle_onein_v4i32(
-; CHECK-NEXT:    [[X:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    [[X:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[X]])
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
@ -27,7 +27,7 @@ define i32 @reduceshuffle_onein_v4i32(<4 x i32> %a) {

 define i32 @reduceshuffle_onein_const_v4i32(<4 x i32> %a) {
 ; CHECK-LABEL: @reduceshuffle_onein_const_v4i32(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[X:%.*]] = xor <4 x i32> [[S]], <i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[X]])
 ; CHECK-NEXT:    ret i32 [[R]]
@ -77,7 +77,7 @@ define i32 @reduceshuffle_twoin_OK_v4i32(<4 x i32> %a, <4 x i32> %b) {

 define i32 @reduceshuffle_twoin_concat_v4i32(<2 x i32> %a, <2 x i32> %b) {
 ; CHECK-LABEL: @reduceshuffle_twoin_concat_v4i32(
-; CHECK-NEXT:    [[X:%.*]] = shufflevector <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 1, i32 3>
+; CHECK-NEXT:    [[X:%.*]] = shufflevector <2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
 ; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[X]])
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
@ -99,7 +99,7 @@ define i32 @reduceshuffle_twoin_lowelts_v4i32(<4 x i32> %a, <4 x i32> %b) {

 define i32 @reduceshuffle_twoin_notlowelts_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @reduceshuffle_twoin_notlowelts_v4i32(
-; CHECK-NEXT:    [[X:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 6, i32 1, i32 4>
+; CHECK-NEXT:    [[X:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 4, i32 6>
 ; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[X]])
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
@ -121,7 +121,7 @@ define i32 @reduceshuffle_twoin_repeat_v4i32(<4 x i32> %a, <4 x i32> %b) {

 define i32 @reduceshuffle_twoin_uneven_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @reduceshuffle_twoin_uneven_v4i32(
-; CHECK-NEXT:    [[X:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 1, i32 4>
+; CHECK-NEXT:    [[X:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 4>
 ; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[X]])
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
@ -143,7 +143,7 @@ define i32 @reduceshuffle_twoin_undef_v4i32(<4 x i32> %a, <4 x i32> %b) {

 define i32 @reduceshuffle_twoin_undef2_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @reduceshuffle_twoin_undef2_v4i32(
-; CHECK-NEXT:    [[X:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 4, i32 undef, i32 1, i32 0>
+; CHECK-NEXT:    [[X:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 4, i32 undef>
 ; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[X]])
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
@ -154,7 +154,7 @@ define i32 @reduceshuffle_twoin_undef2_v4i32(<4 x i32> %a, <4 x i32> %b) {

 define i32 @reduceshuffle_twoin_multiundef_v4i32(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: @reduceshuffle_twoin_multiundef_v4i32(
-; CHECK-NEXT:    [[X:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 undef, i32 undef, i32 1>
+; CHECK-NEXT:    [[X:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[X]])
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
@ -222,7 +222,7 @@ define i32 @reducebase_v16i32(<16 x i32> %a, <16 x i32> %b) {

 define i32 @reduceshuffle_onein_v16i32(<16 x i32> %a) {
 ; CHECK-LABEL: @reduceshuffle_onein_v16i32(
-; CHECK-NEXT:    [[X:%.*]] = shufflevector <16 x i32> [[A:%.*]], <16 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[X:%.*]] = shufflevector <16 x i32> [[A:%.*]], <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[X]])
 ; CHECK-NEXT:    ret i32 [[R]]
 ;
@ -233,7 +233,7 @@ define i32 @reduceshuffle_onein_v16i32(<16 x i32> %a) {

 define i32 @reduceshuffle_onein_ext_v16i32(<16 x i32> %a) {
 ; CHECK-LABEL: @reduceshuffle_onein_ext_v16i32(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i32> [[A:%.*]], <16 x i32> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i32> [[A:%.*]], <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[X:%.*]] = xor <16 x i32> [[S]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[X]])
 ; CHECK-NEXT:    ret i32 [[R]]
@ -246,7 +246,7 @@ define i32 @reduceshuffle_onein_ext_v16i32(<16 x i32> %a) {

 define i32 @reduceshuffle_twoin_concat_v16i32(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-LABEL: @reduceshuffle_twoin_concat_v16i32(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[X:%.*]] = xor <16 x i32> [[S]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[X]])
 ; CHECK-NEXT:    ret i32 [[R]]
@ -259,7 +259,7 @@ define i32 @reduceshuffle_twoin_concat_v16i32(<8 x i32> %a, <8 x i32> %b) {

 define i32 @reduceshuffle_twoin_lowelt_v16i32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: @reduceshuffle_twoin_lowelt_v16i32(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[X:%.*]] = xor <16 x i32> [[S]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[X]])
 ; CHECK-NEXT:    ret i32 [[R]]
@ -285,7 +285,7 @@ define i32 @reduceshuffle_twoin_notlowelt_v16i32(<16 x i32> %a, <16 x i32> %b) {

 define i32 @reduceshuffle_twoin_uneven_v16i32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: @reduceshuffle_twoin_uneven_v16i32(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 8>
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22>
 ; CHECK-NEXT:    [[X:%.*]] = xor <16 x i32> [[S]], <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
 ; CHECK-NEXT:    [[R:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[X]])
 ; CHECK-NEXT:    ret i32 [[R]]
@ -298,7 +298,7 @@ define i32 @reduceshuffle_twoin_uneven_v16i32(<16 x i32> %a, <16 x i32> %b) {

 define i32 @reduceshuffle_twoin_shr1_v16i32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: @reduceshuffle_twoin_shr1_v16i32(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 16, i32 17, i32 5, i32 18, i32 19, i32 6, i32 20, i32 21, i32 7, i32 22, i32 23>
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[A1:%.*]] = lshr <16 x i32> [[S]], <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>
 ; CHECK-NEXT:    [[A2:%.*]] = and <16 x i32> [[A1]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
 ; CHECK-NEXT:    [[A3:%.*]] = mul nuw <16 x i32> [[A2]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
@ -319,7 +319,7 @@ define i32 @reduceshuffle_twoin_shr1_v16i32(<16 x i32> %a, <16 x i32> %b) {

 define i32 @reduceshuffle_twoin_shr2_v16i32(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-LABEL: @reduceshuffle_twoin_shr2_v16i32(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 16, i32 17, i32 5, i32 18, i32 19, i32 6, i32 20, i32 21, i32 7, i32 22, i32 23>
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[A1:%.*]] = lshr <16 x i32> <i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15, i32 15>, [[S]]
 ; CHECK-NEXT:    [[A2:%.*]] = and <16 x i32> [[A1]], <i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537, i32 65537>
 ; CHECK-NEXT:    [[A3:%.*]] = mul nuw <16 x i32> [[A2]], <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>
@ -353,7 +353,7 @@ define i16 @reducebase_v16i16(<16 x i16> %a, <16 x i16> %b) {

 define i16 @reduceshuffle_onein_v16i16(<16 x i16> %a) {
 ; CHECK-LABEL: @reduceshuffle_onein_v16i16(
-; CHECK-NEXT:    [[X:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[X:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[R:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[X]])
 ; CHECK-NEXT:    ret i16 [[R]]
 ;
@ -364,7 +364,7 @@ define i16 @reduceshuffle_onein_v16i16(<16 x i16> %a) {

 define i16 @reduceshuffle_onein_ext_v16i16(<16 x i16> %a) {
 ; CHECK-LABEL: @reduceshuffle_onein_ext_v16i16(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[X:%.*]] = xor <16 x i16> [[S]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
 ; CHECK-NEXT:    [[R:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[X]])
 ; CHECK-NEXT:    ret i16 [[R]]
@ -377,7 +377,7 @@ define i16 @reduceshuffle_onein_ext_v16i16(<16 x i16> %a) {

 define i16 @reduceshuffle_twoin_concat_v16i16(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: @reduceshuffle_twoin_concat_v16i16(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    [[X:%.*]] = xor <16 x i16> [[S]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
 ; CHECK-NEXT:    [[R:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[X]])
 ; CHECK-NEXT:    ret i16 [[R]]
@ -390,7 +390,7 @@ define i16 @reduceshuffle_twoin_concat_v16i16(<8 x i16> %a, <8 x i16> %b) {

 define i16 @reduceshuffle_twoin_lowelt_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK-LABEL: @reduceshuffle_twoin_lowelt_v16i16(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 1, i32 3, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23>
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[X:%.*]] = xor <16 x i16> [[S]], <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
 ; CHECK-NEXT:    [[R:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[X]])
 ; CHECK-NEXT:    ret i16 [[R]]
@ -429,7 +429,7 @@ define i16 @reduceshuffle_twoin_uneven_v16i16(<16 x i16> %a, <16 x i16> %b) {

 define i16 @reduceshuffle_twoin_ext_v16i16(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK-LABEL: @reduceshuffle_twoin_ext_v16i16(
-; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 16, i32 17, i32 5, i32 18, i32 19, i32 6, i32 20, i32 21, i32 7, i32 22, i32 23>
+; CHECK-NEXT:    [[S:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
 ; CHECK-NEXT:    [[A1:%.*]] = lshr <16 x i16> [[S]], <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; CHECK-NEXT:    [[A2:%.*]] = and <16 x i16> [[A1]], <i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257, i16 257>
 ; CHECK-NEXT:    [[A3:%.*]] = mul nuw <16 x i16> [[A2]], <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>