[PowerPC] Add profitability check for conversion to mtctr loops

Add profitability checks for modifying counted loops to use the mtctr instruction.

The latency of mtctr is only justified if there are at least 4 comparisons that
will be removed as a result.  Usually counted loops are formed relatively early
and before unrolling, so low trip count loops often don't survive.  However,
we want to ensure that if they do, we do not mistakenly convert them to mtctr loops.

Use CodeMetrics to ensure we only skip the conversion for small loops with small constant trip counts.

Differential Revision: https://reviews.llvm.org/D38212

llvm-svn: 315592
This commit is contained in:
Lei Huang 2017-10-12 16:43:33 +00:00
parent c8ffffe462
commit 0724fea2da
3 changed files with 190 additions and 6 deletions

View File

@ -26,12 +26,17 @@
#include "PPC.h"
#include "PPCSubtarget.h"
#include "PPCTargetMachine.h"
#include "PPCTargetTransformInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CodeMetrics.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
@ -64,6 +69,13 @@ using namespace llvm;
static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
#endif
// Minimum constant trip count at which using the count register is considered
// worthwhile; the latency of mtctr is only justified when enough comparisons
// are removed as a result. The default is 4.
static cl::opt<unsigned>
SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
cl::desc("Loops with a constant trip count smaller than "
"this value will not use the count register."));
STATISTIC(NumCTRLoops, "Number of loops converted to CTR loops");
namespace llvm {
@ -95,6 +107,8 @@ namespace {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<ScalarEvolutionWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
}
private:
@ -107,10 +121,12 @@ namespace {
const PPCTargetLowering *TLI;
const DataLayout *DL;
const TargetLibraryInfo *LibInfo;
const TargetTransformInfo *TTI;
LoopInfo *LI;
ScalarEvolution *SE;
DominatorTree *DT;
bool PreserveLCSSA;
TargetSchedModel SchedModel;
};
char PPCCTRLoops::ID = 0;
@ -179,6 +195,7 @@ bool PPCCTRLoops::runOnFunction(Function &F) {
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
DL = &F.getParent()->getDataLayout();
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
LibInfo = TLIP ? &TLIP->getTLI() : nullptr;
@ -462,10 +479,24 @@ bool PPCCTRLoops::mightUseCTR(BasicBlock *BB) {
return false;
}
bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
bool MadeChange = false;
// Do not convert small short loops to CTR loop.
unsigned ConstTripCount = SE->getSmallConstantTripCount(L);
if (ConstTripCount && ConstTripCount < SmallCTRLoopThreshold) {
SmallPtrSet<const Value *, 32> EphValues;
auto AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
*L->getHeader()->getParent());
CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
CodeMetrics Metrics;
for (BasicBlock *BB : L->blocks())
Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
// 6 is an approximate latency for the mtctr instruction.
if (Metrics.NumInsts <= (6 * SchedModel.getIssueWidth()))
return false;
}
// Process nested loops first.
for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
MadeChange |= convertToCTRLoop(*I);

View File

@ -1,6 +1,5 @@
; RUN: llc -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s
; RUN: llc -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s --check-prefix=QPX
target triple = "powerpc64-unknown-linux-gnu"
; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr7 < %s | FileCheck %s
; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q < %s | FileCheck %s --check-prefix=QPX
declare float @fabsf(float)
@ -38,6 +37,9 @@ loop_exit:
; CHECK-LABEL: test1:
; CHECK-NOT: mtctr
; CHECK: bl fminf
; CHECK-NOT: bl fminf
; CHECK-NOT: mtctr
; CHECK: blr
define void @test1v(<4 x float> %f, <4 x float>* %fp) {
entry:
@ -48,7 +50,7 @@ loop_body:
%0 = call <4 x float> @llvm.minnum.v4f32(<4 x float> %f, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
store <4 x float> %0, <4 x float>* %fp, align 16
%1 = add i64 %invar_address.dim.0.01, 1
%2 = icmp eq i64 %1, 2
%2 = icmp eq i64 %1, 4
br i1 %2, label %loop_exit, label %loop_body
loop_exit:
@ -56,8 +58,15 @@ loop_exit:
}
; CHECK-LABEL: test1v:
; CHECK: bl fminf
; CHECK-NOT: mtctr
; CHECK: bl fminf
; CHECK-NOT: mtctr
; CHECK: bl fminf
; CHECK-NOT: mtctr
; CHECK: bl fminf
; CHECK-NOT: bl fminf
; CHECK: blr
; QPX-LABEL: test1v:
; QPX: mtctr
@ -83,6 +92,9 @@ loop_exit:
; CHECK-LABEL: test1a:
; CHECK-NOT: mtctr
; CHECK: bl fminf
; CHECK-NOT: bl fminf
; CHECK-NOT: mtctr
; CHECK: blr
define void @test2(float %f, float* %fp) {
entry:
@ -103,6 +115,9 @@ loop_exit:
; CHECK-LABEL: test2:
; CHECK-NOT: mtctr
; CHECK: bl fmaxf
; CHECK-NOT: bl fmaxf
; CHECK-NOT: mtctr
; CHECK: blr
define void @test2v(<4 x double> %f, <4 x double>* %fp) {
entry:
@ -113,7 +128,7 @@ loop_body:
%0 = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %f, <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>)
store <4 x double> %0, <4 x double>* %fp, align 16
%1 = add i64 %invar_address.dim.0.01, 1
%2 = icmp eq i64 %1, 2
%2 = icmp eq i64 %1, 4
br i1 %2, label %loop_exit, label %loop_body
loop_exit:
@ -121,8 +136,15 @@ loop_exit:
}
; CHECK-LABEL: test2v:
; CHECK: bl fmax
; CHECK-NOT: mtctr
; CHECK: bl fmax
; CHECK-NOT: mtctr
; CHECK: bl fmax
; CHECK-NOT: mtctr
; CHECK: bl fmax
; CHECK-NOT: bl fmax
; CHECK: blr
; QPX-LABEL: test2v:
; QPX: mtctr
@ -148,6 +170,9 @@ loop_exit:
; CHECK-LABEL: test2a:
; CHECK-NOT: mtctr
; CHECK: bl fmaxf
; CHECK-NOT: bl fmaxf
; CHECK-NOT: mtctr
; CHECK: blr
define void @test3(double %f, double* %fp) {
entry:
@ -168,6 +193,9 @@ loop_exit:
; CHECK-LABEL: test3:
; CHECK-NOT: mtctr
; CHECK: bl fmin
; CHECK-NOT: bl fmin
; CHECK-NOT: mtctr
; CHECK: blr
define void @test3a(double %f, double* %fp) {
entry:
@ -188,6 +216,9 @@ loop_exit:
; CHECK-LABEL: test3a:
; CHECK-NOT: mtctr
; CHECK: bl fmin
; CHECK-NOT: bl fmin
; CHECK-NOT: mtctr
; CHECK: blr
define void @test4(double %f, double* %fp) {
entry:
@ -208,6 +239,9 @@ loop_exit:
; CHECK-LABEL: test4:
; CHECK-NOT: mtctr
; CHECK: bl fmax
; CHECK-NOT: bl fmax
; CHECK-NOT: mtctr
; CHECK: blr
define void @test4a(double %f, double* %fp) {
entry:
@ -228,4 +262,7 @@ loop_exit:
; CHECK-LABEL: test4a:
; CHECK-NOT: mtctr
; CHECK: bl fmax
; CHECK-NOT: bl fmax
; CHECK-NOT: mtctr
; CHECK: blr

View File

@ -0,0 +1,116 @@
; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=pwr8 | FileCheck %s
; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -mcpu=a2q | FileCheck %s
; Verify that we do NOT generate the mtctr instruction for small loops with constant trip counts < 4.
; The latency of the mtctr is only justified if at least 4 comparisons are removed as a result.
@a = common local_unnamed_addr global i32 0, align 4
@arr = common local_unnamed_addr global [5 x i32] zeroinitializer, align 4
; Function Attrs: norecurse nounwind readonly
; Single-block loop that sums elements of @arr while %indvars.iv counts down
; from 1 to 0, so the body executes twice. The directives below require that
; no mtctr is emitted between the function label and the matched blr.
; The %a argument is not referenced by the body.
define signext i32 @testTripCount2(i32 signext %a) {
; CHECK-LABEL: testTripCount2:
; CHECK-NOT: mtctr
; CHECK: blr
entry:
br label %for.body
for.cond.cleanup: ; preds = %for.body
ret i32 %add
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.body ]
%Sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @arr, i64 0, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %0, %Sum.05
%indvars.iv.next = add nsw i64 %indvars.iv, -1
; The exit test reads the pre-decrement value, so the iteration with
; %indvars.iv == 0 still runs before the loop exits.
%tobool = icmp eq i64 %indvars.iv, 0
br i1 %tobool, label %for.cond.cleanup, label %for.body
}
; Function Attrs: norecurse nounwind readonly
; Same loop shape as the two-iteration case, but %indvars.iv counts down from
; 2 to 0, so the body executes three times. The directives below require that
; no mtctr is emitted between the function label and the matched blr.
define signext i32 @testTripCount3(i32 signext %a) {
; CHECK-LABEL: testTripCount3:
; CHECK-NOT: mtctr
; CHECK: blr
entry:
br label %for.body
for.cond.cleanup: ; preds = %for.body
ret i32 %add
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 2, %entry ], [ %indvars.iv.next, %for.body ]
%Sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @arr, i64 0, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %0, %Sum.05
%indvars.iv.next = add nsw i64 %indvars.iv, -1
; The exit test reads the pre-decrement value, so the iteration with
; %indvars.iv == 0 still runs before the loop exits.
%tobool = icmp eq i64 %indvars.iv, 0
br i1 %tobool, label %for.cond.cleanup, label %for.body
}
; Function Attrs: norecurse nounwind readonly
; Same loop shape again, with %indvars.iv counting down from 3 to 0, so the
; body executes four times. Unlike the shorter loops, the directives below
; expect the count-register form: an mtctr followed by a bdnz.
define signext i32 @testTripCount4(i32 signext %a) {
; CHECK-LABEL: testTripCount4:
; CHECK: mtctr
; CHECK: bdnz
entry:
br label %for.body
for.cond.cleanup: ; preds = %for.body
ret i32 %add
for.body: ; preds = %entry, %for.body
%indvars.iv = phi i64 [ 3, %entry ], [ %indvars.iv.next, %for.body ]
%Sum.05 = phi i32 [ 0, %entry ], [ %add, %for.body ]
%arrayidx = getelementptr inbounds [5 x i32], [5 x i32]* @arr, i64 0, i64 %indvars.iv
%0 = load i32, i32* %arrayidx, align 4
%add = add nsw i32 %0, %Sum.05
%indvars.iv.next = add nsw i64 %indvars.iv, -1
; The exit test reads the pre-decrement value, so the iteration with
; %indvars.iv == 0 still runs before the loop exits.
%tobool = icmp eq i64 %indvars.iv, 0
br i1 %tobool, label %for.cond.cleanup, label %for.body
}
; Function Attrs: norecurse nounwind
; Two-iteration loop (%dec4 takes the values 1 and 0) whose body spans three
; blocks and conditionally stores to @a. The directives below expect an mtctr
; to be emitted before the matched blr even though the trip count is 2.
; NOTE(review): presumably this loop is large enough to fall outside the
; pass's "small loop" instruction-count cutoff - confirm against the
; CodeMetrics check in PPCCTRLoops.
define signext i32 @testTripCount2NonSmallLoop() {
; CHECK-LABEL: testTripCount2NonSmallLoop:
; CHECK: mtctr
; CHECK: blr
entry:
%.pre = load i32, i32* @a, align 4
br label %for.body
for.body: ; preds = %entry, %if.end
; %0 carries the current value of @a across iterations without reloading it.
%0 = phi i32 [ %.pre, %entry ], [ %1, %if.end ]
%dec4 = phi i32 [ 1, %entry ], [ %dec, %if.end ]
%b.03 = phi i8 [ 0, %entry ], [ %b.1, %if.end ]
%tobool1 = icmp eq i32 %0, 0
br i1 %tobool1, label %if.end, label %if.then
if.then: ; preds = %for.body
store i32 2, i32* @a, align 4
br label %if.end
if.end: ; preds = %for.body, %if.then
%1 = phi i32 [ 2, %if.then ], [ 0, %for.body ]
%b.1 = phi i8 [ 2, %if.then ], [ %b.03, %for.body ]
%dec = add nsw i32 %dec4, -1
; The exit test reads the pre-decrement counter, so the iteration with
; %dec4 == 0 still runs before the loop exits.
%tobool = icmp eq i32 %dec4, 0
br i1 %tobool, label %for.end, label %for.body
for.end: ; preds = %if.end
%conv = zext i8 %b.1 to i32
ret i32 %conv
}