From 305816ff1e737d9589f34dab69a3f5abd884f7d5 Mon Sep 17 00:00:00 2001 From: "Sander.DeSmalen@arm.com" Date: Tue, 16 Nov 2021 11:14:04 +0000 Subject: [PATCH] [IndVarSimplify] Reduce nondeterministic behaviour in visitIVCast. rGf39978b84f1d3a1da6c32db48f64c8daae64b3ad led to and/or exposed an issue with IndVarSimplification for a loop where a i32 phi node is no longer replaced by a widened (i64) phi node, because the SCEVs of a sign-extend no longer folded the same way. I'm unsure how to properly explain this because it's all rather complicated, but in short: SCEVs don't fold as nicely as they used to and this caused a difference. While investigating this, I found that IndVarSimplify can actually optimise the case in the way we want to if it chooses the widened IV to be 'signed' (the i32 IV is both sign and zero-extended). Oddly enough, there is some level of indeterminism in the way the algorithm works, it just picks the sign of the 'first' zext/sext user, where the order of the users-iterator is not guaranteed to be the same on each invocation of the pass (e.g. shown by first running loop-rotate, which puts the users in a different order). While I think the fix is valid in the sense that consistently picking _any_ order is better than having an nondeterministic order, I can use a bit of advice from people more familiar in this area of the code-base. For example, I'm not sure if this fix is hiding another issue where the IndVarSimplify pass could actually draw the same conclusions (i.e. that it only needs an i64 phi node) if it does a bit more work, regardless of whether it chooses the induction variable to be signed or unsigned. I'm also not sure if choosing signed is better than unsigned, or whether that just happens to be beneficial only in this individual case. Any feedback would be much appreciated! Reviewed By: reames Differential Revision: https://reviews.llvm.org/D112573 --- llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 14 ++-- .../IndVarSimplify/deterministic-sign.ll | 65 +++++++++++++++++++ 2 files changed, 72 insertions(+), 7 deletions(-) create mode 100644 llvm/test/Transforms/IndVarSimplify/deterministic-sign.ll diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index dfdb6b20f62e..ae2fe2767074 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -547,18 +547,18 @@ static void visitIVCast(CastInst *Cast, WideIVInfo &WI, return; } - if (!WI.WidestNativeType) { + if (!WI.WidestNativeType || + Width > SE->getTypeSizeInBits(WI.WidestNativeType)) { WI.WidestNativeType = SE->getEffectiveSCEVType(Ty); WI.IsSigned = IsSigned; return; } - // We extend the IV to satisfy the sign of its first user, arbitrarily. - if (WI.IsSigned != IsSigned) - return; - - if (Width > SE->getTypeSizeInBits(WI.WidestNativeType)) - WI.WidestNativeType = SE->getEffectiveSCEVType(Ty); + // We extend the IV to satisfy the sign of its user(s), or 'signed' + // if there are multiple users with both sign- and zero extensions, + // in order not to introduce nondeterministic behaviour based on the + // unspecified order of a PHI nodes' users-iterator. + WI.IsSigned |= IsSigned; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/Transforms/IndVarSimplify/deterministic-sign.ll b/llvm/test/Transforms/IndVarSimplify/deterministic-sign.ll new file mode 100644 index 000000000000..62afded30b21 --- /dev/null +++ b/llvm/test/Transforms/IndVarSimplify/deterministic-sign.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt "-passes=loop-rotate,indvars" -S < %s | FileCheck %s +; RUN: opt "-passes=loop-rotate" < %s | opt "-passes=indvars" -S - | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-gnu" + +define dso_local float @foo(float* noalias %dst, float* %src, i32 %offset, i32 %N) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 1, [[N:%.*]] +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.lr.ph: +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[OFFSET:%.*]] to i64 +; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.for.cond.cleanup_crit_edge: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret float undef +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 1, [[FOR_BODY_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[TMP1:%.*]] = add nsw i64 [[INDVARS_IV]], [[TMP0]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[SRC:%.*]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = load float, float* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[SRC]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = load float, float* [[ARRAYIDX3]], align 4 +; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, float* [[DST:%.*]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store float [[ADD4]], float* [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; +entry: + br label %for.cond + +for.cond: ; preds = %for.body, %entry + %i.0 = phi i32 [ 1, %entry ], [ %inc, %for.body ] + %cmp = icmp slt i32 %i.0, %N + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + ret float undef + +for.body: ; preds = %for.cond + %add = add nsw i32 %i.0, %offset + %idxprom = sext i32 %add to i64 + %arrayidx = getelementptr inbounds float, float* %src, i64 %idxprom + %0 = load float, float* %arrayidx, align 4 + %add1 = add nsw i32 %add, 1 + %idxprom2 = sext i32 %add1 to i64 + %arrayidx3 = getelementptr inbounds float, float* %src, i64 %idxprom2 + %1 = load float, float* %arrayidx3, align 4 + %add4 = fadd fast float %0, %1 + %idxprom5 = zext i32 %i.0 to i64 + %arrayidx6 = getelementptr inbounds float, float* %dst, i64 %idxprom5 + store float %add4, float* %arrayidx6, align 4 + %inc = add nuw nsw i32 %i.0, 1 + br label %for.cond, !llvm.loop !1 +} + +!1 = distinct !{!1, !2} +!2 = !{!"llvm.loop.mustprogress"}