Detect vector reduction operations just before instruction selection.

This patch detects vector reductions before instruction selection. Vector reductions are vectorized reduction operations, and for such operations we have the freedom to reorganize the elements of the result as long as their reduction stays unchanged. This will enable some reduction pattern recognition during instruction combine, such as SAD/dot-product on X86. A flag is added to SDNodeFlags to mark those vector reduction nodes so that they can be checked during instruction combine. To detect those vector reductions, we search def-use chains starting from the given instruction and check whether all uses fall into two categories: 1. Reduction with another vector. 2. Reduction on all elements. Category 2 is detected by recognizing the pattern that the loop vectorizer generates to reduce all elements in the vector outside of the loop, which includes several ShuffleVector instructions and one ExtractElement instruction. Differential revision: http://reviews.llvm.org/D15250 llvm-svn: 261070
2016-02-17 06:37:04 +00:00 · 2016-02-17 06:37:04 +00:00 · bbd4e3b400
parent a2b1f45ded
commit bbd4e3b400
3 changed files with 215 additions and 0 deletions
--- a/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGNodes.h
@ -328,6 +328,7 @@ private:
  bool NoInfs : 1;
  bool NoSignedZeros : 1;
  bool AllowReciprocal : 1;
+  bool VectorReduction : 1;

 public:
  /// Default constructor turns off all optimization flags.
@ -340,6 +341,7 @@ public:
    NoInfs = false;
    NoSignedZeros = false;
    AllowReciprocal = false;
+    VectorReduction = false;
  }

  // These are mutators for each flag.
@ -351,6 +353,7 @@ public:
  void setNoInfs(bool b) { NoInfs = b; }
  void setNoSignedZeros(bool b) { NoSignedZeros = b; }
  void setAllowReciprocal(bool b) { AllowReciprocal = b; }
+  void setVectorReduction(bool b) { VectorReduction = b; }

  // These are accessors for each flag.
  bool hasNoUnsignedWrap() const { return NoUnsignedWrap; }
@ -361,6 +364,7 @@ public:
  bool hasNoInfs() const { return NoInfs; }
  bool hasNoSignedZeros() const { return NoSignedZeros; }
  bool hasAllowReciprocal() const { return AllowReciprocal; }
+  bool hasVectorReduction() const { return VectorReduction; }

  /// Return a raw encoding of the flags.
  /// This function should only be used to add data to the NodeID value.
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@ -2308,6 +2308,125 @@ void SelectionDAGBuilder::visitFSub(const User &I) {
  visitBinary(I, ISD::FSUB);
 }

+/// Checks if the given instruction performs a vector reduction, in which case
+/// we have the freedom to alter the elements in the result as long as the
+/// reduction of them stays unchanged.
+///
+/// \param I the candidate. Anything that is not a vector-typed Instruction
+///          whose opcode is Add, Mul, And, Or, Xor, or an FAdd/FMul carrying
+///          the unsafe-algebra fast-math flag is rejected immediately.
+/// \returns true only if every transitive user of \p I is one of the
+///          operations accepted by the search below and the search reaches an
+///          ExtractElement of element 0 after all elements have been reduced.
+static bool isVectorReductionOp(const User *I) {
+  const Instruction *Inst = dyn_cast<Instruction>(I);
+  if (!Inst || !Inst->getType()->isVectorTy())
+    return false;
+
+  auto OpCode = Inst->getOpcode();
+  switch (OpCode) {
+  case Instruction::Add:
+  case Instruction::Mul:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    break;
+  case Instruction::FAdd:
+  case Instruction::FMul:
+    // Floating-point operations may only be reordered under unsafe algebra.
+    if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
+      if (FPOp->getFastMathFlags().unsafeAlgebra())
+        break;
+    // Fall through.
+  default:
+    return false;
+  }
+
+  unsigned ElemNum = Inst->getType()->getVectorNumElements();
+  unsigned ElemNumToReduce = ElemNum;
+
+  // Do DFS search on the def-use chain from the given instruction. We only
+  // allow four kinds of operations during the search until we reach the
+  // instruction that extracts the first element from the vector:
+  //
+  //   1. The reduction operation of the same opcode as the given instruction.
+  //
+  //   2. PHI node.
+  //
+  //   3. ShuffleVector instruction together with a reduction operation that
+  //      does a partial reduction.
+  //
+  //   4. ExtractElement that extracts the first element from the vector, and we
+  //      stop searching the def-use chain here.
+  //
+  // 3 & 4 above perform a reduction on all elements of the vector. We push defs
+  // from 1-3 to the stack to continue the DFS. The given instruction is not
+  // a reduction operation if we meet any other instructions other than those
+  // listed above.
+
+  SmallVector<const User *, 16> UsersToVisit{Inst};
+  SmallPtrSet<const User *, 16> Visited;
+  bool ReduxExtracted = false;
+
+  while (!UsersToVisit.empty()) {
+    const User *Cur = UsersToVisit.back();
+    UsersToVisit.pop_back();
+    if (!Visited.insert(Cur).second)
+      continue;
+
+    for (const User *U : Cur->users()) {
+      const Instruction *UserInst = dyn_cast<Instruction>(U);
+      if (!UserInst)
+        return false;
+
+      if (UserInst->getOpcode() == OpCode || isa<PHINode>(U)) {
+        // A floating-point PHI carries no reassociation semantics of its own;
+        // only the arithmetic users must allow unsafe algebra.
+        if (const FPMathOperator *FPOp =
+                dyn_cast<const FPMathOperator>(UserInst))
+          if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().unsafeAlgebra())
+            return false;
+        UsersToVisit.push_back(U);
+      } else if (const ShuffleVectorInst *ShufInst =
+                     dyn_cast<ShuffleVectorInst>(U)) {
+        // Detect the following pattern: A ShuffleVector instruction together
+        // with a reduction that do partial reduction on the first and second
+        // ElemNumToReduce / 2 elements, and store the result in
+        // ElemNumToReduce / 2 elements in another vector.
+
+        // The result of a shufflevector takes its element count from the mask,
+        // not from its inputs. Reject narrower results up front so that the
+        // mask lookups below, which run up to ElemNum, stay in range.
+        unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
+        if (ResultElements < ElemNum)
+          return false;
+
+        if (ElemNumToReduce == 1)
+          return false;
+        if (!isa<UndefValue>(U->getOperand(1)))
+          return false;
+        // The low half of the mask must select the high half of the elements
+        // that are still to be reduced ...
+        for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
+          if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
+            return false;
+        // ... and every remaining mask element must be undef (-1).
+        for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
+          if (ShufInst->getMaskValue(i) != -1)
+            return false;
+
+        // There is only one user of this ShuffleVector instruction, which must
+        // be a reduction operation.
+        if (!U->hasOneUse())
+          return false;
+
+        auto U2 = dyn_cast<Instruction>(*U->user_begin());
+        if (!U2 || U2->getOpcode() != OpCode)
+          return false;
+
+        // Check operands of the reduction operation: it must combine the
+        // shuffle with the shuffle's own input, in either operand order.
+        if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
+            (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
+          UsersToVisit.push_back(U2);
+          ElemNumToReduce /= 2;
+        } else
+          return false;
+      } else if (isa<ExtractElementInst>(U)) {
+        // At this moment we should have reduced all elements in the vector.
+        if (ElemNumToReduce != 1)
+          return false;
+
+        // Use isZero() rather than getZExtValue(): the index may be wider than
+        // 64 bits, in which case getZExtValue() would assert.
+        const ConstantInt *Val = dyn_cast<ConstantInt>(U->getOperand(1));
+        if (!Val || !Val->isZero())
+          return false;
+
+        ReduxExtracted = true;
+      } else
+        return false;
+    }
+  }
+  return ReduxExtracted;
+}
+
 void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
  SDValue Op1 = getValue(I.getOperand(0));
  SDValue Op2 = getValue(I.getOperand(1));
@ -2315,6 +2434,7 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
  bool nuw = false;
  bool nsw = false;
  bool exact = false;
+  bool vec_redux = false;
  FastMathFlags FMF;

  if (const OverflowingBinaryOperator *OFBinOp =
@ -2328,10 +2448,16 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&I))
    FMF = FPOp->getFastMathFlags();

+  if (isVectorReductionOp(&I)) {
+    vec_redux = true;
+    DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
+  }
+
  SDNodeFlags Flags;
  Flags.setExact(exact);
  Flags.setNoSignedWrap(nsw);
  Flags.setNoUnsignedWrap(nuw);
+  Flags.setVectorReduction(vec_redux);
  if (EnableFMFInDAG) {
    Flags.setAllowReciprocal(FMF.allowReciprocal());
    Flags.setNoInfs(FMF.noInfs());
--- a/llvm/test/CodeGen/Generic/vector-redux.ll
+++ b/llvm/test/CodeGen/Generic/vector-redux.ll
@ -0,0 +1,85 @@
+; RUN: llc < %s -debug-only=isel -o /dev/null 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+@a = global [1024 x i32] zeroinitializer, align 16
+
+; Unrolled fast-math float sum with two vector accumulators, mirroring the
+; pattern described in the commit: the in-loop fadds feed the accumulator PHIs,
+; and %middle.block folds the lanes with shufflevector + fadd pairs before
+; lane 0 is extracted.
+;
+; Eleven detections are expected: the ten fadds in %vector.body plus %bin.rdx.
+; %bin.rdx11 and %bin.rdx13 are not flagged when they are the starting point,
+; because the search then begins with all four lanes still counted as
+; unreduced and rejects the <1, undef, ...> shuffle or the extractelement.
+define float @reduce_add_float(float* nocapture readonly %a) {
+; CHECK-LABEL: reduce_add_float
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+; CHECK:       Detected a reduction operation: {{.*}} fadd fast
+;
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next.4, %vector.body ]
+  %vec.phi = phi <4 x float> [ zeroinitializer, %entry ], [ %28, %vector.body ]
+  %vec.phi9 = phi <4 x float> [ zeroinitializer, %entry ], [ %29, %vector.body ]
+  %0 = getelementptr inbounds float, float* %a, i64 %index
+  %1 = bitcast float* %0 to <4 x float>*
+  %wide.load = load <4 x float>, <4 x float>* %1, align 4
+  %2 = getelementptr float, float* %0, i64 4
+  %3 = bitcast float* %2 to <4 x float>*
+  %wide.load10 = load <4 x float>, <4 x float>* %3, align 4
+  %4 = fadd fast <4 x float> %wide.load, %vec.phi
+  %5 = fadd fast <4 x float> %wide.load10, %vec.phi9
+  %index.next = add nuw nsw i64 %index, 8
+  %6 = getelementptr inbounds float, float* %a, i64 %index.next
+  %7 = bitcast float* %6 to <4 x float>*
+  %wide.load.1 = load <4 x float>, <4 x float>* %7, align 4
+  %8 = getelementptr float, float* %6, i64 4
+  %9 = bitcast float* %8 to <4 x float>*
+  %wide.load10.1 = load <4 x float>, <4 x float>* %9, align 4
+  %10 = fadd fast <4 x float> %wide.load.1, %4
+  %11 = fadd fast <4 x float> %wide.load10.1, %5
+  %index.next.1 = add nsw i64 %index, 16
+  %12 = getelementptr inbounds float, float* %a, i64 %index.next.1
+  %13 = bitcast float* %12 to <4 x float>*
+  %wide.load.2 = load <4 x float>, <4 x float>* %13, align 4
+  %14 = getelementptr float, float* %12, i64 4
+  %15 = bitcast float* %14 to <4 x float>*
+  %wide.load10.2 = load <4 x float>, <4 x float>* %15, align 4
+  %16 = fadd fast <4 x float> %wide.load.2, %10
+  %17 = fadd fast <4 x float> %wide.load10.2, %11
+  %index.next.2 = add nsw i64 %index, 24
+  %18 = getelementptr inbounds float, float* %a, i64 %index.next.2
+  %19 = bitcast float* %18 to <4 x float>*
+  %wide.load.3 = load <4 x float>, <4 x float>* %19, align 4
+  %20 = getelementptr float, float* %18, i64 4
+  %21 = bitcast float* %20 to <4 x float>*
+  %wide.load10.3 = load <4 x float>, <4 x float>* %21, align 4
+  %22 = fadd fast <4 x float> %wide.load.3, %16
+  %23 = fadd fast <4 x float> %wide.load10.3, %17
+  %index.next.3 = add nsw i64 %index, 32
+  %24 = getelementptr inbounds float, float* %a, i64 %index.next.3
+  %25 = bitcast float* %24 to <4 x float>*
+  %wide.load.4 = load <4 x float>, <4 x float>* %25, align 4
+  %26 = getelementptr float, float* %24, i64 4
+  %27 = bitcast float* %26 to <4 x float>*
+  %wide.load10.4 = load <4 x float>, <4 x float>* %27, align 4
+  %28 = fadd fast <4 x float> %wide.load.4, %22
+  %29 = fadd fast <4 x float> %wide.load10.4, %23
+  %index.next.4 = add nsw i64 %index, 40
+  %30 = icmp eq i64 %index.next.4, 1000
+  br i1 %30, label %middle.block, label %vector.body
+
+middle.block:
+  %.lcssa15 = phi <4 x float> [ %29, %vector.body ]
+  %.lcssa = phi <4 x float> [ %28, %vector.body ]
+  %bin.rdx = fadd fast <4 x float> %.lcssa15, %.lcssa
+  %rdx.shuf = shufflevector <4 x float> %bin.rdx, <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+  %bin.rdx11 = fadd fast <4 x float> %bin.rdx, %rdx.shuf
+  %rdx.shuf12 = shufflevector <4 x float> %bin.rdx11, <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+  %bin.rdx13 = fadd fast <4 x float> %bin.rdx11, %rdx.shuf12
+  %31 = extractelement <4 x float> %bin.rdx13, i32 0
+  ret float %31
+}