[IndVarSimplify] Extend previous special case for load use instruction to any narrow type loop variant to avoid extra trunc instruction

Summary:
widenIVUse avoids generating a trunc by evaluating the use as an AddRec. This will not work when:
1) SCEV traces back to an instruction inside the loop that SCEV cannot expand, e.g. add %indvar, (load %addr)
2) SCEV finds a loop variant, e.g. add %indvar, %loopvariant

While SCEV fails to avoid the trunc, we can still try an instruction-combining approach to prove that the trunc is not required. This can be further extended with other instruction-combining checks, but for now we handle the following case ("sub" can also be "add" or "mul", and "nsw + sext" can be "nuw + zext"):

```
Src:
  %c = sub nsw %b, %indvar
  %d = sext %c to i64
Dst:
  %indvar.ext1 = sext %indvar to i64
  %m = sext %b to i64
  %d = sub nsw i64 %m, %indvar.ext1
```

Therefore, as long as the result of the add/sub/mul is extended to the wide type with the right combination of extension and overflow-wrap flag, no trunc is required regardless of how %b is generated. This pattern is common when calculating addresses on 64-bit architectures.

Note that this patch reuses almost all of the code from D49151 by @az (https://reviews.llvm.org/D49151). It extends that work by providing a proof of why the trunc is unnecessary in the more general case; it should also resolve some of the concerns from the following discussion with @reames:
http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20180910/585945.html

Reviewers: sanjoy, efriedma, sebpop, reames, az, javed.absar, amehsan

Reviewed By: az, amehsan

Subscribers: hiraditya, llvm-commits, amehsan, reames, az

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D73059
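For readers unfamiliar with the addressing-mode motivation, here is a minimal source-level illustration of the kind of loop the Src/Dst pattern above comes from. This is my own example, not part of the patch; the function name and parameters are hypothetical, and it assumes an LP64 target where pointer indexing is done in 64 bits.

```cpp
// Hypothetical example (not from the patch): on an LP64 target, `off - i` is
// computed in i32 and then sign-extended to i64 to form the address, which is
// exactly the Src pattern above. After the rewrite, the widened induction
// variable feeds the 64-bit subtraction directly and no trunc is needed.
// (Assumes off >= n so the indices stay in bounds.)
int sum_from_offset(const int *base, int off, int n) {
  int s = 0;
  for (int i = 0; i < n; ++i)   // i is the narrow (i32) induction variable
    s += base[off - i];         // sext(sub nsw i32 %off, %i) used as an i64 index
  return s;
}
```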
parent 0d924700a6
commit eae228a292
@@ -738,8 +738,8 @@ protected:
   Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter);
 
   bool widenLoopCompare(NarrowIVDefUse DU);
-  bool widenWithVariantLoadUse(NarrowIVDefUse DU);
-  void widenWithVariantLoadUseCodegen(NarrowIVDefUse DU);
+  bool widenWithVariantUse(NarrowIVDefUse DU);
+  void widenWithVariantUseCodegen(NarrowIVDefUse DU);
 
   void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
 };
@@ -1077,20 +1077,27 @@ bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) {
   return true;
 }
 
-/// If the narrow use is an instruction whose two operands are the defining
-/// instruction of DU and a load instruction, then we have the following:
-/// if the load is hoisted outside the loop, then we do not reach this function
-/// as scalar evolution analysis works fine in widenIVUse with variables
-/// hoisted outside the loop and efficient code is subsequently generated by
-/// not emitting truncate instructions. But when the load is not hoisted
-/// (whether due to limitation in alias analysis or due to a true legality),
-/// then scalar evolution can not proceed with loop variant values and
-/// inefficient code is generated. This function handles the non-hoisted load
-/// special case by making the optimization generate the same type of code for
-/// hoisted and non-hoisted load (widen use and eliminate sign extend
-/// instruction). This special case is important especially when the induction
-/// variables are affecting addressing mode in code generation.
-bool WidenIV::widenWithVariantLoadUse(NarrowIVDefUse DU) {
+// The widenIVUse avoids generating trunc by evaluating the use as AddRec, this
+// will not work when:
+// 1) SCEV traces back to an instruction inside the loop that SCEV can not
+// expand, eg. add %indvar, (load %addr)
+// 2) SCEV finds a loop variant, eg. add %indvar, %loopvariant
+// While SCEV fails to avoid trunc, we can still try to use instruction
+// combining approach to prove trunc is not required. This can be further
+// extended with other instruction combining checks, but for now we handle the
+// following case (sub can be "add" and "mul", "nsw + sext" can be "nuw + zext")
+//
+// Src:
+//   %c = sub nsw %b, %indvar
+//   %d = sext %c to i64
+// Dst:
+//   %indvar.ext1 = sext %indvar to i64
+//   %m = sext %b to i64
+//   %d = sub nsw i64 %m, %indvar.ext1
+// Therefore, as long as the result of add/sub/mul is extended to wide type, no
+// trunc is required regardless of how %b is generated. This pattern is common
+// when calculating address in 64 bit architecture
+bool WidenIV::widenWithVariantUse(NarrowIVDefUse DU) {
   Instruction *NarrowUse = DU.NarrowUse;
   Instruction *NarrowDef = DU.NarrowDef;
   Instruction *WideDef = DU.WideDef;
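The new comment's claim that the extend can be pushed through the narrow arithmetic when the no-wrap flag matches the extension kind is easy to spot-check at a small width. The following standalone sketch is my own check, not part of the patch; it uses i8/i16 in place of the i32/i64 case and verifies the nsw + sext flavor for sub:

```cpp
#include <cassert>
#include <cstdint>

// Exhaustive check at i8/i16 width of the identity the comment relies on:
// when `sub nsw i8 %b, %i` does not overflow, then
//   sext i16 (sub nsw i8 %b, %i) == sub i16 (sext i8 %b), (sext i8 %i)
// so the use can be widened without truncating the widened IV.
int main() {
  for (int b = -128; b <= 127; ++b) {
    for (int i = -128; i <= 127; ++i) {
      int16_t wide = static_cast<int16_t>(b) - static_cast<int16_t>(i);
      if (wide < -128 || wide > 127)
        continue; // the narrow sub would overflow, i.e. it could not be nsw
      int8_t narrow = static_cast<int8_t>(b - i); // the non-overflowing narrow sub
      assert(static_cast<int16_t>(narrow) == wide);
    }
  }
  return 0;
}
```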
@@ -1121,12 +1128,6 @@ bool WidenIV::widenWithVariantLoadUse(NarrowIVDefUse DU) {
   else
     return false;
 
-  // We are interested in the other operand being a load instruction.
-  // But, we should look into relaxing this restriction later on.
-  auto *I = dyn_cast<Instruction>(NarrowUse->getOperand(ExtendOperIdx));
-  if (I && I->getOpcode() != Instruction::Load)
-    return false;
-
   // Verifying that Defining operand is an AddRec
   const SCEV *Op1 = SE->getSCEV(WideDef);
   const SCEVAddRecExpr *AddRecOp1 = dyn_cast<SCEVAddRecExpr>(Op1);
@@ -1158,9 +1159,9 @@ bool WidenIV::widenWithVariantLoadUse(NarrowIVDefUse DU) {
   return true;
 }
 
-/// Special Case for widening with variant Loads (see
-/// WidenIV::widenWithVariantLoadUse). This is the code generation part.
-void WidenIV::widenWithVariantLoadUseCodegen(NarrowIVDefUse DU) {
+/// Special Case for widening with loop variant (see
+/// WidenIV::widenWithVariantUse). This is the code generation part.
+void WidenIV::widenWithVariantUseCodegen(NarrowIVDefUse DU) {
   Instruction *NarrowUse = DU.NarrowUse;
   Instruction *NarrowDef = DU.NarrowDef;
   Instruction *WideDef = DU.WideDef;
@@ -1308,8 +1309,8 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
     // in WideAddRec.first does not indicate a polynomial induction expression.
     // In that case, look at the operands of the use instruction to determine
     // if we can still widen the use instead of truncating its operand.
-    if (widenWithVariantLoadUse(DU)) {
-      widenWithVariantLoadUseCodegen(DU);
+    if (widenWithVariantUse(DU)) {
+      widenWithVariantUseCodegen(DU);
       return nullptr;
     }
 
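The new test below exercises the unsigned side of the same argument: mul/sub with nuw whose result feeds a zext used for 64-bit indexing. As a companion to the signed check above, here is a quick exhaustive sketch (again my own, not part of the patch) of the nuw + zext case for mul at i8/i16 width:

```cpp
#include <cassert>
#include <cstdint>

// When `mul nuw i8 %a, %b` does not wrap, zext of the narrow product equals
// the product of the zero-extended operands, so the widened IV can feed the
// multiply directly and no trunc is required.
int main() {
  for (unsigned a = 0; a <= 255; ++a) {
    for (unsigned b = 0; b <= 255; ++b) {
      uint16_t wide = static_cast<uint16_t>(a) * static_cast<uint16_t>(b);
      if (wide > 255)
        continue; // the narrow mul would wrap, i.e. it could not be nuw
      uint8_t narrow = static_cast<uint8_t>(a * b); // the non-wrapping narrow mul
      assert(static_cast<uint16_t>(narrow) == wide);
    }
  }
  return 0;
}
```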
@@ -419,3 +419,52 @@ for.body: ; preds = %for.body.lr.ph, %fo
   %cmp = icmp slt i32 %add, %length
   br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
 }
+
+define i32 @foo6(%struct.image* %input, i32 %length, i32* %in) {
+entry:
+  %stride = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 1
+  %0 = load i32, i32* %stride, align 4
+  %cmp17 = icmp sgt i32 %length, 1
+  br i1 %cmp17, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph: ; preds = %entry
+  %channel = getelementptr inbounds %struct.image, %struct.image* %input, i64 0, i32 0
+  br label %for.body
+
+for.cond.cleanup.loopexit: ; preds = %for.body
+  %1 = phi i32 [ %6, %for.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
+  %2 = phi i32 [ 0, %entry ], [ %1, %for.cond.cleanup.loopexit ]
+  ret i32 %2
+
+; Extend foo4 so that any loop variants (%3 and %or) with mul/sub/add then extend will not
+; need a trunc instruction
+; CHECK: for.body:
+; CHECK-NOT: trunc
+; CHECK: [[TMP0:%.*]] = and i32 %length, %0
+; CHECK-NEXT: zext i32 [[TMP0]] to i64
+; CHECK: [[TMP1:%.*]] = or i32 %length, [[TMP2:%.*]]
+; CHECK-NEXT: zext i32 [[TMP1]] to i64
+for.body: ; preds = %for.body.lr.ph, %for.body
+  %x.018 = phi i32 [ 1, %for.body.lr.ph ], [ %add, %for.body ]
+  %add = add nuw nsw i32 %x.018, 1
+  %3 = and i32 %length, %0
+  %mul = mul nuw i32 %3, %add
+  %idx.ext = zext i32 %mul to i64
+  %add.ptr = getelementptr inbounds i32, i32* %in, i64 %idx.ext
+  %4 = load i32, i32* %add.ptr, align 4
+  %mul1 = mul nuw i32 %0, %add
+  %idx.ext1 = zext i32 %mul1 to i64
+  %add.ptr1 = getelementptr inbounds i32, i32* %in, i64 %idx.ext1
+  %5 = load i32, i32* %add.ptr1, align 4
+  %or = or i32 %length, %5
+  %sub.or = sub nuw i32 %or, %add
+  %or.ext = zext i32 %sub.or to i64
+  %ptr.or = getelementptr inbounds i32, i32* %in, i64 %or.ext
+  %val.or = load i32, i32* %ptr.or
+  %6 = add i32 %4, %val.or
+  %cmp = icmp ult i32 %add, %length
+  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
+}