forked from OSchip/llvm-project
[ARM][MVE] Tail-predication: clean-up of unused code
After the rewrite of this pass (D79175) I missed one thing: the inserted VCTP intrinsic can be cloned to exit blocks if there are instructions present in them that perform the same operation, but this wasn't triggering anymore. However, it turns out that for handling reductions, see D75533, it's actually easier not to have the VCTP in exit blocks, so this removes that code. This was possible because it turned out that some other code that depended on this, rematerialization of the trip count enabling more dead code removal later, wasn't doing much anymore due to more aggressive dead code removal that was added to the low-overhead loops pass. Differential Revision: https://reviews.llvm.org/D82773
This commit is contained in:
parent
63b3933d0c
commit
af45907653
|
@ -79,14 +79,9 @@ namespace {
|
|||
class MVETailPredication : public LoopPass {
|
||||
SmallVector<IntrinsicInst*, 4> MaskedInsts;
|
||||
Loop *L = nullptr;
|
||||
LoopInfo *LI = nullptr;
|
||||
const DataLayout *DL;
|
||||
DominatorTree *DT = nullptr;
|
||||
ScalarEvolution *SE = nullptr;
|
||||
TargetTransformInfo *TTI = nullptr;
|
||||
const ARMSubtarget *ST = nullptr;
|
||||
TargetLibraryInfo *TLI = nullptr;
|
||||
bool ClonedVCTPInExitBlock = false;
|
||||
|
||||
public:
|
||||
static char ID;
|
||||
|
@ -98,8 +93,6 @@ public:
|
|||
AU.addRequired<LoopInfoWrapperPass>();
|
||||
AU.addRequired<TargetPassConfig>();
|
||||
AU.addRequired<TargetTransformInfoWrapperPass>();
|
||||
AU.addRequired<DominatorTreeWrapperPass>();
|
||||
AU.addRequired<TargetLibraryInfoWrapperPass>();
|
||||
AU.addPreserved<LoopInfoWrapperPass>();
|
||||
AU.setPreservesCFG();
|
||||
}
|
||||
|
@ -123,8 +116,7 @@ private:
|
|||
|
||||
/// Insert the intrinsic to represent the effect of tail predication.
|
||||
void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
|
||||
FixedVectorType *VecTy,
|
||||
DenseMap<Instruction *, Instruction *> &NewPredicates);
|
||||
FixedVectorType *VecTy);
|
||||
|
||||
/// Rematerialize the iteration count in exit blocks, which enables
|
||||
/// ARMLowOverheadLoops to better optimise away loop update statements inside
|
||||
|
@ -153,16 +145,6 @@ static bool IsMasked(Instruction *I) {
|
|||
return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load;
|
||||
}
|
||||
|
||||
void MVETailPredication::RematerializeIterCount() {
|
||||
SmallVector<WeakTrackingVH, 16> DeadInsts;
|
||||
SCEVExpander Rewriter(*SE, *DL, "mvetp");
|
||||
ReplaceExitVal ReplaceExitValue = AlwaysRepl;
|
||||
|
||||
formLCSSARecursively(*L, *DT, LI, SE);
|
||||
rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT, ReplaceExitValue,
|
||||
DeadInsts);
|
||||
}
|
||||
|
||||
bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
|
||||
if (skipLoop(L) || DisableTailPredication)
|
||||
return false;
|
||||
|
@ -172,13 +154,8 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
|
|||
auto &TPC = getAnalysis<TargetPassConfig>();
|
||||
auto &TM = TPC.getTM<TargetMachine>();
|
||||
ST = &TM.getSubtarget<ARMSubtarget>(F);
|
||||
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
|
||||
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
|
||||
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
|
||||
auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
|
||||
TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
|
||||
DL = &L->getHeader()->getModule()->getDataLayout();
|
||||
this->L = L;
|
||||
|
||||
// The MVE and LOB extensions are combined to enable tail-predication, but
|
||||
|
@ -232,7 +209,6 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
|
|||
if (!Decrement)
|
||||
return false;
|
||||
|
||||
ClonedVCTPInExitBlock = false;
|
||||
LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
|
||||
<< *Decrement << "\n");
|
||||
|
||||
|
@ -241,8 +217,6 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
|
|||
return false;
|
||||
}
|
||||
|
||||
if (ClonedVCTPInExitBlock)
|
||||
RematerializeIterCount();
|
||||
return true;
|
||||
}
|
||||
|
||||
|
@ -319,32 +293,11 @@ bool MVETailPredication::IsPredicatedVectorLoop() {
|
|||
// in the block. This means that the VPR doesn't have to be live into the
|
||||
// exit block which should make it easier to convert this loop into a proper
|
||||
// tail predicated loop.
|
||||
static bool Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
|
||||
SetVector<Instruction*> &MaybeDead, Loop *L) {
|
||||
static void Cleanup(SetVector<Instruction*> &MaybeDead, Loop *L) {
|
||||
BasicBlock *Exit = L->getUniqueExitBlock();
|
||||
if (!Exit) {
|
||||
LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
|
||||
return false;
|
||||
}
|
||||
|
||||
bool ClonedVCTPInExitBlock = false;
|
||||
|
||||
for (auto &Pair : NewPredicates) {
|
||||
Instruction *OldPred = Pair.first;
|
||||
Instruction *NewPred = Pair.second;
|
||||
|
||||
for (auto &I : *Exit) {
|
||||
if (I.isSameOperationAs(OldPred)) {
|
||||
Instruction *PredClone = NewPred->clone();
|
||||
PredClone->insertBefore(&I);
|
||||
I.replaceAllUsesWith(PredClone);
|
||||
MaybeDead.insert(&I);
|
||||
ClonedVCTPInExitBlock = true;
|
||||
LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump();
|
||||
dbgs() << "ARM TP: with: "; PredClone->dump());
|
||||
break;
|
||||
}
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Drop references and add operands to check for dead.
|
||||
|
@ -369,8 +322,6 @@ static bool Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
|
|||
|
||||
for (auto I : L->blocks())
|
||||
DeleteDeadPHIs(I);
|
||||
|
||||
return ClonedVCTPInExitBlock;
|
||||
}
|
||||
|
||||
// The active lane intrinsic has this form:
|
||||
|
@ -549,8 +500,7 @@ static Value *getNumElements(BasicBlock *Preheader, Value *BTC) {
|
|||
}
|
||||
|
||||
void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
|
||||
Value *TripCount, FixedVectorType *VecTy,
|
||||
DenseMap<Instruction*, Instruction*> &NewPredicates) {
|
||||
Value *TripCount, FixedVectorType *VecTy) {
|
||||
IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
|
||||
Module *M = L->getHeader()->getModule();
|
||||
Type *Ty = IntegerType::get(M->getContext(), 32);
|
||||
|
@ -591,7 +541,6 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
|
|||
Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
|
||||
Value *VCTPCall = Builder.CreateCall(VCTP, Processed);
|
||||
ActiveLaneMask->replaceAllUsesWith(VCTPCall);
|
||||
NewPredicates[ActiveLaneMask] = cast<Instruction>(VCTPCall);
|
||||
|
||||
// Add the incoming value to the new phi.
|
||||
// TODO: This add likely already exists in the loop.
|
||||
|
@ -609,9 +558,7 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
|
|||
}
|
||||
|
||||
LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n");
|
||||
|
||||
SetVector<Instruction*> Predicates;
|
||||
DenseMap<Instruction*, Instruction*> NewPredicates;
|
||||
|
||||
// Walk through the masked intrinsics and try to find whether the predicate
|
||||
// operand is generated by intrinsic @llvm.get.active.lane.mask().
|
||||
|
@ -636,11 +583,10 @@ bool MVETailPredication::TryConvert(Value *TripCount) {
|
|||
return false;
|
||||
}
|
||||
LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n");
|
||||
InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy, NewPredicates);
|
||||
InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy);
|
||||
}
|
||||
|
||||
// Now clean up.
|
||||
ClonedVCTPInExitBlock = Cleanup(NewPredicates, Predicates, L);
|
||||
Cleanup(Predicates, L);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -15,8 +15,7 @@
|
|||
; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* {{.*}}, i32 4, <4 x i1> [[VCTP]],
|
||||
|
||||
; CHECK: middle.block:
|
||||
; CHECK: [[VCTP_CLONE:%[^ ]+]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[REMAT_ITER:%.*]])
|
||||
; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP_CLONE]],
|
||||
; CHECK: [[VPSEL:%[^ ]+]] = select <4 x i1> [[VCTP]],
|
||||
; CHECK: call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> [[VPSEL]])
|
||||
|
||||
define i32 @vec_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32 %N) {
|
||||
|
@ -64,16 +63,12 @@ vector.body: ; preds = %vector.body, %vecto
|
|||
br i1 %11, label %vector.body, label %middle.block
|
||||
|
||||
middle.block: ; preds = %vector.body
|
||||
; TODO: check that the intrinsic is also emitted here by the loop vectoriser
|
||||
; %12 = icmp ule <4 x i32> %induction, %broadcast.splat12
|
||||
%12 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
|
||||
|
||||
%13 = select <4 x i1> %12, <4 x i32> %9, <4 x i32> %vec.phi
|
||||
%14 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %13)
|
||||
%12 = select <4 x i1> %7, <4 x i32> %9, <4 x i32> %vec.phi
|
||||
%13 = call i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32> %12)
|
||||
br label %for.cond.cleanup
|
||||
|
||||
for.cond.cleanup: ; preds = %middle.block, %entry
|
||||
%res.0.lcssa = phi i32 [ 0, %entry ], [ %14, %middle.block ]
|
||||
%res.0.lcssa = phi i32 [ 0, %entry ], [ %13, %middle.block ]
|
||||
ret i32 %res.0.lcssa
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue