From 305816ff1e737d9589f34dab69a3f5abd884f7d5 Mon Sep 17 00:00:00 2001
From: "Sander.DeSmalen@arm.com" <Sander.DeSmalen@arm.com>
Date: Tue, 16 Nov 2021 11:14:04 +0000
Subject: [PATCH] [IndVarSimplify] Reduce nondeterministic behaviour in
 visitIVCast.

rGf39978b84f1d3a1da6c32db48f64c8daae64b3ad led to and/or exposed
an issue with IndVarSimplification for a loop where a i32 phi node is
no longer replaced by a widened (i64) phi node, because the SCEVs of a
sign-extend no longer folded the same way. I'm unsure how to properly
explain this because it's all rather complicated, but in short: SCEVs
don't fold as nicely as they used to and this caused a difference.

While investigating this, I found that IndVarSimplify can actually
optimise the case in the way we want to if it chooses the widened IV to
be 'signed' (the i32 IV is both sign and zero-extended). Oddly enough,
there is some level of indeterminism in the way the algorithm works,
it just picks the sign of the 'first' zext/sext user, where the order of
the users-iterator is not guaranteed to be the same on each invocation
of the pass (e.g. shown by first running loop-rotate, which puts the
users in a different order).

While I think the fix is valid in the sense that consistently picking
_any_ order is better than having an nondeterministic order, I can
use a bit of advice from people more familiar in this area of the
code-base.

For example, I'm not sure if this fix is hiding another issue where the
IndVarSimplify pass could actually draw the same conclusions (i.e. that
it only needs an i64 phi node) if it does a bit more work, regardless
of whether it chooses the induction variable to be signed or unsigned.

I'm also not sure if choosing signed is better than unsigned, or whether
that just happens to be beneficial only in this individual case.

Any feedback would be much appreciated!

Reviewed By: reames

Differential Revision: https://reviews.llvm.org/D112573
---
 llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 14 ++--
 .../IndVarSimplify/deterministic-sign.ll      | 65 +++++++++++++++++++
 2 files changed, 72 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/Transforms/IndVarSimplify/deterministic-sign.ll

diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index dfdb6b20f62e..ae2fe2767074 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -547,18 +547,18 @@ static void visitIVCast(CastInst *Cast, WideIVInfo &WI,
     return;
   }
 
-  if (!WI.WidestNativeType) {
+  if (!WI.WidestNativeType ||
+      Width > SE->getTypeSizeInBits(WI.WidestNativeType)) {
     WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
     WI.IsSigned = IsSigned;
     return;
   }
 
-  // We extend the IV to satisfy the sign of its first user, arbitrarily.
-  if (WI.IsSigned != IsSigned)
-    return;
-
-  if (Width > SE->getTypeSizeInBits(WI.WidestNativeType))
-    WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+  // We extend the IV to satisfy the sign of its user(s), or 'signed'
+  // if there are multiple users with both sign- and zero extensions,
+  // in order not to introduce nondeterministic behaviour based on the
+  // unspecified order of a PHI nodes' users-iterator.
+  WI.IsSigned |= IsSigned;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/Transforms/IndVarSimplify/deterministic-sign.ll b/llvm/test/Transforms/IndVarSimplify/deterministic-sign.ll
new file mode 100644
index 000000000000..62afded30b21
--- /dev/null
+++ b/llvm/test/Transforms/IndVarSimplify/deterministic-sign.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt "-passes=loop-rotate,indvars" -S < %s | FileCheck %s
+; RUN: opt "-passes=loop-rotate" < %s | opt "-passes=indvars" -S - | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define dso_local float @foo(float* noalias %dst, float* %src, i32 %offset, i32 %N) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 1, [[N:%.*]]
+; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[OFFSET:%.*]] to i64
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.for.cond.cleanup_crit_edge:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret float undef
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 1, [[FOR_BODY_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i64 [[INDVARS_IV]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX3]], align 4
+; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast float [[TMP2]], [[TMP4]]
+; CHECK-NEXT:    [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store float [[ADD4]], float* [[ARRAYIDX6]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], !llvm.loop [[LOOP0:![0-9]+]]
+;
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 1, %entry ], [ %inc, %for.body ]
+  %cmp = icmp slt i32 %i.0, %N
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  ret float undef
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %i.0, %offset
+  %idxprom = sext i32 %add to i64
+  %arrayidx = getelementptr inbounds float, float* %src, i64 %idxprom
+  %0 = load float, float* %arrayidx, align 4
+  %add1 = add nsw i32 %add, 1
+  %idxprom2 = sext i32 %add1 to i64
+  %arrayidx3 = getelementptr inbounds float, float* %src, i64 %idxprom2
+  %1 = load float, float* %arrayidx3, align 4
+  %add4 = fadd fast float %0, %1
+  %idxprom5 = zext i32 %i.0 to i64
+  %arrayidx6 = getelementptr inbounds float, float* %dst, i64 %idxprom5
+  store float %add4, float* %arrayidx6, align 4
+  %inc = add nuw nsw i32 %i.0, 1
+  br label %for.cond, !llvm.loop !1
+}
+
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}