[CSSPGO] Unblock optimizations with pseudo probe instrumentation.

The IR/MIR pseudo probe intrinsics don't get materialized into real machine instructions and therefore don't incur runtime cost directly. However, they come with an indirect cost by blocking certain optimizations. Some of this blocking is intentional (such as blocking code merge) for better counts quality, while the rest is accidental. This change unblocks perf-critical optimizations that do not affect counts quality. They include:

1. IR InstCombine, sinking load operation to shorten lifetimes.
2. MIR LiveRangeShrink, similar to #1
3. MIR TwoAddressInstructionPass, i.e., the opeq transform
4. MIR function argument copy elision
5. IR stack protection (not perf-critical, but nice to have).

Reviewed By: wmi

Differential Revision: https://reviews.llvm.org/D95982
This commit is contained in:
Hongtao Yu 2021-02-07 22:49:20 -08:00
parent c81d52997a
commit 1cb47a063e
15 changed files with 209 additions and 13 deletions

View File

@ -1156,6 +1156,10 @@ public:
return getOpcode() == TargetOpcode::CFI_INSTRUCTION;
}
/// Return true if this instruction's opcode is TargetOpcode::PSEUDO_PROBE.
bool isPseudoProbe() const {
return getOpcode() == TargetOpcode::PSEUDO_PROBE;
}
// True if the instruction represents a position in the function, i.e. either
// isLabel() or isCFIInstruction() holds.
bool isPosition() const { return isLabel() || isCFIInstruction(); }
@ -1165,6 +1169,9 @@ public:
/// Return true if any of isDebugValue(), isDebugLabel(), or isDebugRef()
/// holds for this instruction.
bool isDebugInstr() const {
return isDebugValue() || isDebugLabel() || isDebugRef();
}
/// Return true if this instruction is a debug instruction (isDebugInstr()) or
/// a pseudo probe (isPseudoProbe()).
bool isDebugOrPseudoInstr() const {
return isDebugInstr() || isPseudoProbe();
}
/// Return true if the operand returned by getDebugOffset() is an immediate.
bool isDebugOffsetImm() const { return getDebugOffset().isImm(); }

View File

@ -650,6 +650,9 @@ public:
/// llvm.lifetime.end marker.
bool isLifetimeStartOrEnd() const;
/// Return true if the instruction is a DbgInfoIntrinsic or PseudoProbeInst.
bool isDebugOrPseudoInst() const;
/// Return a pointer to the next non-debug instruction in the same basic
/// block as 'this', or nullptr if no such instruction exists. Skip any pseudo
/// operations if \c SkipPseudoOp is true.

View File

@ -156,7 +156,8 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
// If MI has side effects, it should become a barrier for code motion.
// IOM is rebuild from the next instruction to prevent later
// instructions from being moved before this MI.
if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) {
if (MI.hasUnmodeledSideEffects() && !MI.isPseudoProbe() &&
Next != MBB.end()) {
BuildInstOrderMap(Next, IOM);
SawStore = false;
}

View File

@ -1462,7 +1462,8 @@ bool MachineInstr::hasUnmodeledSideEffects() const {
}
bool MachineInstr::isLoadFoldBarrier() const {
return mayStore() || isCall() || hasUnmodeledSideEffects();
return mayStore() || isCall() ||
(hasUnmodeledSideEffects() && !isPseudoProbe());
}
/// allDefsAreDead - Return true if all the defs of this instruction are dead.

View File

@ -9681,8 +9681,9 @@ findArgumentCopyElisionCandidates(const DataLayout &DL,
// We will look through cast uses, so ignore them completely.
if (I.isCast())
continue;
// Ignore debug info intrinsics, they don't escape or store to allocas.
if (isa<DbgInfoIntrinsic>(I))
// Ignore debug info and pseudo op intrinsics, they don't escape or store
// to allocas.
if (I.isDebugOrPseudoInst())
continue;
// This is an unknown instruction. Assume it escapes or writes to all
// static alloca operands.

View File

@ -193,7 +193,7 @@ bool StackProtector::HasAddressTaken(const Instruction *AI,
// Ignore intrinsics that do not become real instructions.
// TODO: Narrow this to intrinsics that have store-like effects.
const auto *CI = cast<CallInst>(I);
if (!isa<DbgInfoIntrinsic>(CI) && !CI->isLifetimeStartOrEnd())
if (!CI->isDebugOrPseudoInst() && !CI->isLifetimeStartOrEnd())
return true;
break;
}

View File

@ -801,8 +801,8 @@ bool TwoAddressInstructionPass::rescheduleMIBelowKill(
MachineBasicBlock::iterator KillPos = KillMI;
++KillPos;
for (MachineInstr &OtherMI : make_range(End, KillPos)) {
// Debug instructions cannot be counted against the limit.
if (OtherMI.isDebugInstr())
// Debug or pseudo instructions cannot be counted against the limit.
if (OtherMI.isDebugOrPseudoInstr())
continue;
if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost.
return false;
@ -974,8 +974,8 @@ bool TwoAddressInstructionPass::rescheduleKillAboveMI(
unsigned NumVisited = 0;
for (MachineInstr &OtherMI :
make_range(mi, MachineBasicBlock::iterator(KillMI))) {
// Debug instructions cannot be counted against the limit.
if (OtherMI.isDebugInstr())
// Debug or pseudo instructions cannot be counted against the limit.
if (OtherMI.isDebugOrPseudoInstr())
continue;
if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost.
return false;

View File

@ -641,6 +641,10 @@ bool Instruction::isLifetimeStartOrEnd() const {
return ID == Intrinsic::lifetime_start || ID == Intrinsic::lifetime_end;
}
/// Return true if this instruction is a DbgInfoIntrinsic or a
/// PseudoProbeInst.
bool Instruction::isDebugOrPseudoInst() const {
return isa<DbgInfoIntrinsic>(this) || isa<PseudoProbeInst>(this);
}
const Instruction *
Instruction::getNextNonDebugInstruction(bool SkipPseudoOp) const {
for (const Instruction *I = getNextNode(); I; I = I->getNextNode())

View File

@ -149,6 +149,13 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
if (isNoModRef(MRI))
continue;
// A pseudo probe call shouldn't change any function attribute since it
// doesn't translate to a real instruction. It is tagged as accessing memory
// only to keep optimizations from removing it; that tag should not block
// other instructions from being optimized.
if (isa<PseudoProbeInst>(I))
continue;
if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) {
// The call could access any memory. If that includes writes, note it.
if (isModSet(MRI))

View File

@ -592,8 +592,14 @@ static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end();
for (++BBI; BBI != E; ++BBI)
if (BBI->mayWriteToMemory())
if (BBI->mayWriteToMemory()) {
// Calls that only access inaccessible memory do not block sinking the
// load.
if (auto *CB = dyn_cast<CallBase>(BBI))
if (CB->onlyAccessesInaccessibleMemory())
continue;
return false;
}
// Check for non-address taken alloca. If not address-taken already, it isn't
// profitable to do this xform.

View File

@ -3877,9 +3877,10 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
}
}
// Skip processing debug intrinsics in InstCombine. Processing these call instructions
// consumes non-trivial amount of time and provides no value for the optimization.
if (!isa<DbgInfoIntrinsic>(Inst)) {
// Skip processing debug and pseudo intrinsics in InstCombine. Processing
// these call instructions consumes non-trivial amount of time and
// provides no value for the optimization.
if (!Inst->isDebugOrPseudoInst()) {
InstrsForInstCombineWorklist.push_back(Inst);
SeenAliasScopes.analyse(Inst);
}

View File

@ -0,0 +1,66 @@
; RUN: opt -passes=instcombine -S < %s | FileCheck %s
%struct.nonbonded = type { [2 x %struct.CompAtom*], [2 x %struct.CompAtomExt*], [2 x %struct.CompAtom*], [2 x %class.Vector*], [2 x %class.Vector*], [2 x i32], %class.Vector, double*, double*, %class.ComputeNonbondedWorkArrays*, %class.Pairlists*, i32, i32, double, double, i32, i32, i32, i32 }
%struct.CompAtomExt = type { i32 }
%struct.CompAtom = type { %class.Vector, float, i16, i8, i8 }
%class.Vector = type { double, double, double }
%class.ComputeNonbondedWorkArrays = type { %class.ResizeArray, %class.ResizeArray.0, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray.2, %class.ResizeArray.2 }
%class.ResizeArray.0 = type { i32 (...)**, %class.ResizeArrayRaw.1* }
%class.ResizeArrayRaw.1 = type <{ double*, i8*, i32, i32, i32, float, i32, [4 x i8] }>
%class.ResizeArray = type { i32 (...)**, %class.ResizeArrayRaw* }
%class.ResizeArrayRaw = type <{ i16*, i8*, i32, i32, i32, float, i32, [4 x i8] }>
%class.ResizeArray.2 = type { i32 (...)**, %class.ResizeArrayRaw.3* }
%class.ResizeArrayRaw.3 = type <{ %class.Vector*, i8*, i32, i32, i32, float, i32, [4 x i8] }>
%class.Pairlists = type { i16*, i32, i32 }
;; Check the minPart4 and minPart assignments are merged.
; CHECK-COUNT-1: getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
; CHECK-NOT: getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
define dso_local void @_ZN20ComputeNonbondedUtil9calc_pairEP9nonbonded(%struct.nonbonded* nocapture readonly %params) local_unnamed_addr align 2 {
entry:
%savePairlists3 = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 11
%0 = load i32, i32* %savePairlists3, align 8
%usePairlists4 = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 12
%1 = load i32, i32* %usePairlists4, align 4
%tobool54.not = icmp eq i32 %0, 0
br i1 %tobool54.not, label %lor.lhs.false55, label %if.end109
lor.lhs.false55: ; preds = %entry
%tobool56.not = icmp eq i32 %1, 0
br i1 %tobool56.not, label %if.end109, label %if.end109.thread
if.end109.thread: ; preds = %lor.lhs.false55
%minPart4 = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
%2 = load i32, i32* %minPart4, align 4
call void @llvm.pseudoprobe(i64 -6172701105289426098, i64 2, i32 0, i64 -1)
br label %if.then138
if.end109: ; preds = %lor.lhs.false55, %entry
%minPart = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
%3 = load i32, i32* %minPart, align 4
call void @llvm.pseudoprobe(i64 -6172701105289426098, i64 3, i32 0, i64 -1)
%tobool116.not = icmp eq i32 %1, 0
br i1 %tobool116.not, label %if.then117, label %if.then138
if.then117: ; preds = %if.end109
ret void
if.then138: ; preds = %if.end109.thread, %if.end109
%4 = phi i32 [ %2, %if.end109.thread ], [ %3, %if.end109 ]
%tobool139.not = icmp eq i32 %4, 0
br i1 %tobool139.not, label %if.else147, label %if.then140
if.then140: ; preds = %if.then138
ret void
if.else147: ; preds = %if.then138
ret void
}
declare dso_local void @_ZN9Pairlists8addIndexEv() align 2
; Function Attrs: inaccessiblememonly nounwind willreturn
declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0
attributes #0 = { inaccessiblememonly nounwind willreturn }

View File

@ -0,0 +1,33 @@
; PR1075
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-apple-darwin -pseudo-probe-for-profiling -O3 | FileCheck %s
define float @foo(float %x) #0 {
%tmp1 = fmul float %x, 3.000000e+00
%tmp3 = fmul float %x, 5.000000e+00
%tmp5 = fmul float %x, 7.000000e+00
%tmp7 = fmul float %x, 1.100000e+01
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 1, i32 0, i64 -1)
%tmp10 = fadd float %tmp1, %tmp3
%tmp12 = fadd float %tmp10, %tmp5
%tmp14 = fadd float %tmp12, %tmp7
ret float %tmp14
; CHECK: mulss
; CHECK: mulss
; CHECK: addss
; CHECK: mulss
; CHECK: addss
; CHECK: mulss
; CHECK: addss
; CHECK: ret
}
; Function Attrs: inaccessiblememonly nounwind willreturn
declare void @llvm.pseudoprobe(i64, i64, i32, i64) #1
attributes #0 = { nounwind }
attributes #1 = { inaccessiblememonly nounwind willreturn }
!llvm.pseudo_probe_desc = !{!0}
!0 = !{i64 6699318081062747564, i64 4294967295, !"foo", null}

View File

@ -0,0 +1,29 @@
; RUN: llc -mtriple=x86_64-- -stop-after=peephole-opt -o - %s | FileCheck %s
define internal i32 @arc_compare() {
entry:
%0 = load i64, i64* undef, align 8
br i1 undef, label %return, label %if.end
if.end: ; preds = %entry
; Check a register copy has been sunk into the compare instruction.
; CHECK: %[[#REG:]]:gr64 = IMPLICIT_DEF
; CHECK-NOT: %[[#]]:gr64 = MOV64rm %[[#REG]]
; CHECK: PSEUDO_PROBE 5116412291814990879, 3, 0, 0
; CHECK: CMP64mr %[[#REG]], 1
call void @llvm.pseudoprobe(i64 5116412291814990879, i64 3, i32 0, i64 -1)
%cmp4 = icmp slt i64 %0, undef
br i1 %cmp4, label %return, label %if.end6
if.end6: ; preds = %if.end
call void @llvm.pseudoprobe(i64 5116412291814990879, i64 5, i32 0, i64 -1)
br label %return
return: ; preds = %if.end6, %if.end, %entry
ret i32 undef
}
; Function Attrs: inaccessiblememonly nounwind willreturn
declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0
attributes #0 = { inaccessiblememonly nounwind willreturn }

View File

@ -0,0 +1,37 @@
; RUN: llc -stop-after=twoaddressinstruction -mtriple=x86_64-- -o - %s | FileCheck %s
define dso_local double @twoaddressinstruction() local_unnamed_addr {
for.end:
%0 = load i64, i64* undef, align 8
br label %for.body14.preheader
for.body14.preheader: ; preds = %for.end
br i1 undef, label %for.cond25.preheader.loopexit.unr-lcssa, label %for.body14.preheader.new
for.body14.preheader.new: ; preds = %for.body14.preheader
%unroll_iter136 = and i64 %0, -4
br label %for.body14
for.cond25.preheader.loopexit.unr-lcssa: ; preds = %for.body14, %for.body14.preheader
%indvars.iv127.unr = phi i64 [ 1, %for.body14.preheader ], [ %indvars.iv.next128.3, %for.body14 ]
ret double undef
for.body14: ; preds = %for.body14, %for.body14.preheader.new
%indvars.iv127 = phi i64 [ 1, %for.body14.preheader.new ], [ %indvars.iv.next128.3, %for.body14 ]
%niter137 = phi i64 [ %unroll_iter136, %for.body14.preheader.new ], [ %niter137.nsub.3, %for.body14 ]
%indvars.iv.next128.3 = add nuw nsw i64 %indvars.iv127, 4
; CHECK: PSEUDO_PROBE -6878943695821059507, 9, 0, 0
call void @llvm.pseudoprobe(i64 -6878943695821059507, i64 9, i32 0, i64 -1)
;; Check an opeq form of instruction is created.
; CHECK: %[[#REG:]]:gr64_nosp = COPY killed %[[#]]
; CHECK: %[[#REG]]:gr64_nosp = nuw ADD64ri8 %[[#REG]], 4, implicit-def dead $eflags
%niter137.nsub.3 = add i64 %niter137, -4
%niter137.ncmp.3 = icmp eq i64 %niter137.nsub.3, 0
br i1 %niter137.ncmp.3, label %for.cond25.preheader.loopexit.unr-lcssa, label %for.body14
}
; Function Attrs: inaccessiblememonly nounwind willreturn
declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0
attributes #0 = { inaccessiblememonly nounwind willreturn }