[AggressiveInstCombine] Generalize foldGuardedRotateToFunnelShift to generic funnel shifts

The fold currently only handles rotation patterns, but with the maturation of backend funnel shift handling we can now realistically handle all funnel shift patterns. This should allow us to begin resolving PR46896 et al. Differential Revision: https://reviews.llvm.org/D90625
2020-11-03 10:49:33 +00:00 · 2020-11-03 10:49:33 +00:00 · 59b22e495c
parent 9925168576
commit 59b22e495c
3 changed files with 52 additions and 77 deletions
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@ -39,6 +39,8 @@ using namespace PatternMatch;
 STATISTIC(NumAnyOrAllBitsSet, "Number of any/all-bits-set patterns folded");
 STATISTIC(NumGuardedRotates,
          "Number of guarded rotates transformed into funnel shifts");
 STATISTIC(NumGuardedFunnelShifts,
          "Number of guarded funnel shifts transformed into funnel shifts");
 STATISTIC(NumPopCountRecognized, "Number of popcount idioms recognized");
 namespace {
@ -67,17 +69,17 @@ public:
 };
 } // namespace
-/// Match a pattern for a bitwise rotate operation that partially guards
+/// Match a pattern for a bitwise funnel/rotate operation that partially guards
-/// against undefined behavior by branching around the rotation when the shift
+/// against undefined behavior by branching around the funnel-shift/rotation
-/// amount is 0.
+/// when the shift amount is 0.
-static bool foldGuardedRotateToFunnelShift(Instruction &I) {
+static bool foldGuardedFunnelShift(Instruction &I) {
  if (I.getOpcode() != Instruction::PHI || I.getNumOperands() != 2)
    return false;
  // As with the one-use checks below, this is not strictly necessary, but we
  // are being cautious to avoid potential perf regressions on targets that
-  // do not actually have a rotate instruction (where the funnel shift would be
+  // do not actually have a funnel/rotate instruction (where the funnel shift
-  // expanded back into math/shift/logic ops).
+  // would be expanded back into math/shift/logic ops).
  if (!isPowerOf2_32(I.getType()->getScalarSizeInBits()))
    return false;
@ -111,27 +113,33 @@ static bool foldGuardedRotateToFunnelShift(Instruction &I) {
    return Intrinsic::not_intrinsic;
  };
-  // One phi operand must be a rotate operation, and the other phi operand must
+  // One phi operand must be a funnel/rotate operation, and the other phi
-  // be the source value of that rotate operation:
+  // operand must be the source value of that funnel/rotate operation:
  // phi [ rotate(RotSrc, ShAmt), FunnelBB ], [ RotSrc, GuardBB ]
  // phi [ fshl(ShlVal0, ShlVal1, ShAmt), FunnelBB ], [ ShlVal0, GuardBB ]
  // phi [ fshr(ShlVal0, ShlVal1, ShAmt), FunnelBB ], [ ShlVal1, GuardBB ]
  PHINode &Phi = cast<PHINode>(I);
  unsigned FunnelOp = 0, GuardOp = 1;
  Value *P0 = Phi.getOperand(0), *P1 = Phi.getOperand(1);
  Value *ShVal0, *ShVal1, *ShAmt;
  Intrinsic::ID IID = matchFunnelShift(P0, ShVal0, ShVal1, ShAmt);
-  if (IID == Intrinsic::not_intrinsic || ShVal0 != ShVal1 || ShVal0 != P1) {
+  if (IID == Intrinsic::not_intrinsic ||
      (IID == Intrinsic::fshl && ShVal0 != P1) ||
      (IID == Intrinsic::fshr && ShVal1 != P1)) {
    IID = matchFunnelShift(P1, ShVal0, ShVal1, ShAmt);
-    if (IID == Intrinsic::not_intrinsic || ShVal0 != ShVal1 || ShVal0 != P0)
+    if (IID == Intrinsic::not_intrinsic ||
        (IID == Intrinsic::fshl && ShVal0 != P0) ||
        (IID == Intrinsic::fshr && ShVal1 != P0))
      return false;
    assert((IID == Intrinsic::fshl || IID == Intrinsic::fshr) &&
           "Pattern must match funnel shift left or right");
    std::swap(FunnelOp, GuardOp);
  }
  assert(ShVal0 == ShVal1 && "Rotation funnel shift pattern expected");
  // The incoming block with our source operand must be the "guard" block.
-  // That must contain a cmp+branch to avoid the rotate when the shift amount
+  // That must contain a cmp+branch to avoid the funnel/rotate when the shift
-  // is equal to 0. The other incoming block is the block with the rotate.
+  // amount is equal to 0. The other incoming block is the block with the
  // funnel/rotate.
  BasicBlock *GuardBB = Phi.getIncomingBlock(GuardOp);
  BasicBlock *FunnelBB = Phi.getIncomingBlock(FunnelOp);
  Instruction *TermI = GuardBB->getTerminator();
@ -150,18 +158,21 @@ static bool foldGuardedRotateToFunnelShift(Instruction &I) {
  //   br i1 %cmp, label %PhiBB, label %FunnelBB
  // FunnelBB:
  //   %sub = sub i32 32, %ShAmt
-  //   %shr = lshr i32 %RotSrc, %sub
+  //   %shr = lshr i32 %ShVal1, %sub
-  //   %shl = shl i32 %RotSrc, %ShAmt
+  //   %shl = shl i32 %ShVal0, %ShAmt
-  //   %rot = or i32 %shr, %shl
+  //   %fsh = or i32 %shr, %shl
  //   br label %PhiBB
  // PhiBB:
-  //   %cond = phi i32 [ %RotSrc, %FunnelBB ], [ %RotSrc, %GuardBB ]
+  //   %cond = phi i32 [ %fsh, %FunnelBB ], [ %ShVal0, %GuardBB ]
  // -->
-  // llvm.fshl.i32(i32 %RotSrc, i32 %RotSrc, i32 %ShAmt)
+  // llvm.fshl.i32(i32 %ShVal0, i32 %ShVal1, i32 %ShAmt)
  IRBuilder<> Builder(PhiBB, PhiBB->getFirstInsertionPt());
  Function *F = Intrinsic::getDeclaration(Phi.getModule(), IID, Phi.getType());
  Phi.replaceAllUsesWith(Builder.CreateCall(F, {ShVal0, ShVal1, ShAmt}));
-  ++NumGuardedRotates;
+  if (ShVal0 == ShVal1)
    ++NumGuardedRotates;
  else
    ++NumGuardedFunnelShifts;
  return true;
 }
@ -350,7 +361,7 @@ static bool foldUnusualPatterns(Function &F, DominatorTree &DT) {
    // iteratively in this loop rather than waiting until the end.
    for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
      MadeChange |= foldAnyOrAllBitsSet(I);
-      MadeChange |= foldGuardedRotateToFunnelShift(I);
+      MadeChange |= foldGuardedFunnelShift(I);
      MadeChange |= tryToRecognizePopCount(I); 
    }
  }
--- a/llvm/test/Transforms/AggressiveInstCombine/funnel.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/funnel.ll
@ -7,14 +7,10 @@ define i32 @fshl(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[FSHBB:%.*]]
 ; CHECK:       fshbb:
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[C]]
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[B:%.*]], [[SUB]]
 ; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[C]]
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[SHL]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[OR]], [[FSHBB]] ], [ [[A]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C]])
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
  %cmp = icmp eq i32 %c, 0
@ -38,14 +34,10 @@ define i32 @fshl_commute_phi(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[FSHBB:%.*]]
 ; CHECK:       fshbb:
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[C]]
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[B:%.*]], [[SUB]]
 ; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[C]]
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[SHL]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[A]], [[ENTRY:%.*]] ], [ [[OR]], [[FSHBB]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C]])
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
  %cmp = icmp eq i32 %c, 0
@ -69,14 +61,10 @@ define i32 @fshl_commute_or(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[FSHBB:%.*]]
 ; CHECK:       fshbb:
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[C]]
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[B:%.*]], [[SUB]]
 ; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[C]]
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL]], [[SHR]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[A]], [[ENTRY:%.*]] ], [ [[OR]], [[FSHBB]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C]])
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
  %cmp = icmp eq i32 %c, 0
@ -102,15 +90,11 @@ define i32 @fshl_insert_valid_location(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[FSHBB:%.*]]
 ; CHECK:       fshbb:
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[C]]
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[B:%.*]], [[SUB]]
 ; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[C]]
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[SHL]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[OR]], [[FSHBB]] ], [ [[A]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[OTHER:%.*]] = phi i32 [ 1, [[FSHBB]] ], [ 2, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    [[OTHER:%.*]] = phi i32 [ 1, [[FSHBB]] ], [ 2, [[ENTRY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshl.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C]])
-; CHECK-NEXT:    [[RES:%.*]] = or i32 [[COND]], [[OTHER]]
+; CHECK-NEXT:    [[RES:%.*]] = or i32 [[TMP0]], [[OTHER]]
 ; CHECK-NEXT:    ret i32 [[RES]]
 ;
 entry:
@ -137,14 +121,10 @@ define i32 @fshr(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[FSHBB:%.*]]
 ; CHECK:       fshbb:
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[C]]
 ; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[SUB]]
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[B:%.*]], [[C]]
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[SHL]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[OR]], [[FSHBB]] ], [ [[B]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C]])
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
  %cmp = icmp eq i32 %c, 0
@ -168,14 +148,10 @@ define i32 @fshr_commute_phi(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[FSHBB:%.*]]
 ; CHECK:       fshbb:
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[C]]
 ; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[SUB]]
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[B:%.*]], [[C]]
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHR]], [[SHL]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[B]], [[ENTRY:%.*]] ], [ [[OR]], [[FSHBB]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C]])
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
  %cmp = icmp eq i32 %c, 0
@ -199,14 +175,10 @@ define i32 @fshr_commute_or(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[FSHBB:%.*]]
 ; CHECK:       fshbb:
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[C]]
 ; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[A:%.*]], [[SUB]]
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[B:%.*]], [[C]]
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL]], [[SHR]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[B]], [[ENTRY:%.*]] ], [ [[OR]], [[FSHBB]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshr.i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 [[C]])
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
  %cmp = icmp eq i32 %c, 0
@ -396,7 +368,7 @@ end:
  ret i32 %cond
 }
-; Negative test - wrong shift.
+; Negative test - wrong shift for rotate (but can be folded to a generic funnel shift).
 define i32 @not_fshr_5(i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: @not_fshr_5(
@ -404,14 +376,10 @@ define i32 @not_fshr_5(i32 %a, i32 %b, i32 %c) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[C:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[FSHBB:%.*]]
 ; CHECK:       fshbb:
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[C]]
 ; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[C]], [[SUB]]
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[B:%.*]], [[C]]
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL]], [[SHR]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[B]], [[ENTRY:%.*]] ], [ [[OR]], [[FSHBB]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshr.i32(i32 [[C]], i32 [[B:%.*]], i32 [[C]])
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
  %cmp = icmp eq i32 %c, 0
--- a/llvm/test/Transforms/AggressiveInstCombine/rotate.ll
+++ b/llvm/test/Transforms/AggressiveInstCombine/rotate.ll
@ -370,7 +370,7 @@ end:
  ret i32 %cond
 }
-; Negative test - wrong shift.
+; Negative test - wrong shift for rotate (but can be folded to a generic funnel shift).
 define i32 @not_rotr_5(i32 %a, i32 %b) {
 ; CHECK-LABEL: @not_rotr_5(
@ -378,14 +378,10 @@ define i32 @not_rotr_5(i32 %a, i32 %b) {
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[B:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP]], label [[END:%.*]], label [[ROTBB:%.*]]
 ; CHECK:       rotbb:
 ; CHECK-NEXT:    [[SUB:%.*]] = sub i32 32, [[B]]
 ; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[B]], [[SUB]]
 ; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 [[A:%.*]], [[B]]
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SHL]], [[SHR]]
 ; CHECK-NEXT:    br label [[END]]
 ; CHECK:       end:
-; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[A]], [[ENTRY:%.*]] ], [ [[OR]], [[ROTBB]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = call i32 @llvm.fshr.i32(i32 [[B]], i32 [[A:%.*]], i32 [[B]])
-; CHECK-NEXT:    ret i32 [[COND]]
+; CHECK-NEXT:    ret i32 [[TMP0]]
 ;
 entry:
  %cmp = icmp eq i32 %b, 0