[ARM][MVE] Tail-predication: clean-up of unused code

After the rewrite of this pass (D79175) I missed one thing: the inserted VCTP
intrinsic can be cloned to exit blocks if there are instructions present in it
that perform the same operation, but this wasn't triggering anymore. However,
it turns out that for handling reductions, see D75533, it's actually easier not
not to have the VCTP in exit blocks, so this removes that code.

This was possible because it turned out that some other code that depended on
this, rematerialization of the trip count enabling more dead code removal
later, wasn't doing much anymore due to more aggressive dead code removal that
was added to the low-overhead loops pass.

Differential Revision: https://reviews.llvm.org/D82773
This commit is contained in:
Sjoerd Meijer 2020-06-29 15:40:03 +01:00
parent 63b3933d0c
commit af45907653
2 changed files with 10 additions and 69 deletions

View File

@ -79,14 +79,9 @@ namespace {
class MVETailPredication : public LoopPass {
SmallVector<IntrinsicInst*, 4> MaskedInsts;
Loop *L = nullptr;
LoopInfo *LI = nullptr;
const DataLayout *DL;
DominatorTree *DT = nullptr;
ScalarEvolution *SE = nullptr;
TargetTransformInfo *TTI = nullptr;
const ARMSubtarget *ST = nullptr;
TargetLibraryInfo *TLI = nullptr;
bool ClonedVCTPInExitBlock = false;
public:
static char ID;
@ -98,8 +93,6 @@ public:
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<TargetPassConfig>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.setPreservesCFG();
}
@ -123,8 +116,7 @@ private:
/// Insert the intrinsic to represent the effect of tail predication.
void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
FixedVectorType *VecTy,
DenseMap<Instruction *, Instruction *> &NewPredicates);
FixedVectorType *VecTy);
/// Rematerialize the iteration count in exit blocks, which enables
/// ARMLowOverheadLoops to better optimise away loop update statements inside
@ -153,16 +145,6 @@ static bool IsMasked(Instruction *I) {
return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load;
}
void MVETailPredication::RematerializeIterCount() {
SmallVector<WeakTrackingVH, 16> DeadInsts;
SCEVExpander Rewriter(*SE, *DL, "mvetp");
ReplaceExitVal ReplaceExitValue = AlwaysRepl;
formLCSSARecursively(*L, *DT, LI, SE);
rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT, ReplaceExitValue,
DeadInsts);
}
bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
if (skipLoop(L) || DisableTailPredication)
return false;
@ -172,13 +154,8 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
auto &TPC = getAnalysis<TargetPassConfig>();
auto &TM = TPC.getTM<TargetMachine>();
ST = &TM.getSubtarget<ARMSubtarget>(F);
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
DL = &L->getHeader()->getModule()->getDataLayout();
this->L = L;
// The MVE and LOB extensions are combined to enable tail-predication, but
@ -232,7 +209,6 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
if (!Decrement)
return false;
ClonedVCTPInExitBlock = false;
LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
<< *Decrement << "\n");
@ -241,8 +217,6 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
return false;
}
if (ClonedVCTPInExitBlock)
RematerializeIterCount();
return true;
}
@ -319,32 +293,11 @@ bool MVETailPredication::IsPredicatedVectorLoop() {
// in the block. This means that the VPR doesn't have to be live into the
// exit block which should make it easier to convert this loop into a proper
// tail predicated loop.
static bool Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
SetVector<Instruction*> &MaybeDead, Loop *L) {
static void Cleanup(SetVector<Instruction*> &MaybeDead, Loop *L) {
BasicBlock *Exit = L->getUniqueExitBlock();
if (!Exit) {
LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
return false;
}
bool ClonedVCTPInExitBlock = false;
for (auto &Pair : NewPredicates) {
Instruction *OldPred = Pair.first;
Instruction *NewPred = Pair.second;
for (auto &I : *Exit) {
if (I.isSameOperationAs(OldPred)) {
Instruction *PredClone = NewPred->clone();
PredClone->insertBefore(&I);
I.replaceAllUsesWith(PredClone);
MaybeDead.insert(&I);
ClonedVCTPInExitBlock = true;
LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump();
dbgs() << "ARM TP: with: "; PredClone->dump());
break;
}
}
return;
}
// Drop references and add operands to check for dead.
@ -369,8 +322,6 @@ static bool Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
for (auto I : L->blocks())
DeleteDeadPHIs(I);
return ClonedVCTPInExitBlock;
}
// The active lane intrinsic has this form:
@ -549,8 +500,7 @@ static Value *getNumElements(BasicBlock *Preheader, Value *BTC) {
}
void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
Value *TripCount, FixedVectorType *VecTy,
DenseMap<Instruction*, Instruction*> &NewPredicates) {
Value *TripCount, FixedVectorType *VecTy) {
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
Module *M = L->getHeader()->getModule();
Type *Ty = IntegerType::get(M->getContext(), 32);
@ -591,7 +541,6 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
Value *VCTPCall = Builder.CreateCall(VCTP, Processed);
ActiveLaneMask->replaceAllUsesWith(VCTPCall);
NewPredicates[ActiveLaneMask] = cast<Instruction>(VCTPCall);
// Add the incoming value to the new phi.
// TODO: This add likely already exists in the loop.
@ -609,9 +558,7 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
}
LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n");
SetVector<Instruction*> Predicates;
DenseMap<Instruction*, Instruction*> NewPredicates;
// Walk through the masked intrinsics and try to find whether the predicate
// operand is generated by intrinsic @llvm.get.active.lane.mask().
@ -636,11 +583,10 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
return false;
}
LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n");
InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy, NewPredicates);
InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy);
}
// Now clean up.
ClonedVCTPInExitBlock = Cleanup(NewPredicates, Predicates, L);
Cleanup(Predicates, L);
return true;
}

View File

@ -15,8 +15,7 @@
; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]],
; CHECK: middle.block:
; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[REMAT_ITER:%.*]])
; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]],
; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP]],
; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
define i32 @vec_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
@ -64,16 +63,12 @@ vector.body: ; preds = %vector.body, %vecto
br i1 %11, label %vector.body, label %middle.block
middle.block: ; preds = %vector.body
; TODO: check that the intrinsic is also emitted here by the loop vectoriser
; %12 = icmp ule <4 x i32> %induction, %broadcast.splat12
%12 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
%13 = select <4 x i1> %12, <4 x i32> %9, <4 x i32> %vec.phi
%14 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %13)
%12 = select <4 x i1> %7, <4 x i32> %9, <4 x i32> %vec.phi
%13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12)
br label %for.cond.cleanup
for.cond.cleanup: ; preds = %middle.block, %entry
%res.0.lcssa = phi i32 [ 0, %entry ], [ %14, %middle.block ]
%res.0.lcssa = phi i32 [ 0, %entry ], [ %13, %middle.block ]
ret i32 %res.0.lcssa
}