[memcpyopt] Restructure store(load src, dest) form of callslotopt for compile time

The search for the clobbering call is fairly expensive if uses are not optimized at construction.  Defer the clobber walk to the point in the implementation where we actually need it; there are a number of bailouts before that point (e.g., if the source pointer is not an alloca, we can't do callslotopt).

On a test case that involves a large number of copies from argument pointers, this drops memcpyopt from over half a second to under 10 ms.
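For illustration only, a minimal standalone sketch of the deferral pattern this change applies (hypothetical names throughout; this is not the LLVM code itself): the expensive query is wrapped in a callback, and the callback only runs once the cheap structural bailouts have passed.

#include <cstdio>
#include <functional>

// Hypothetical stand-in for a clobbering call site.
struct CallSite {};

static int ExpensiveWalks = 0;

static CallSite *findClobberingCall() {
  // Stand-in for the expensive clobber walk.
  ++ExpensiveWalks;
  return nullptr;
}

// Cheap bailouts run first; the expensive walk is only paid for when they pass.
static bool tryCallSlotOpt(bool srcIsAlloca, bool sizeKnown,
                           std::function<CallSite *()> getCall) {
  if (!srcIsAlloca || !sizeKnown)
    return false;          // bail before doing anything expensive
  CallSite *C = getCall(); // deferred: only reached past the cheap checks
  if (!C)
    return false;
  return true;
}

int main() {
  auto GetCall = []() { return findClobberingCall(); };
  tryCallSlotOpt(/*srcIsAlloca=*/false, /*sizeKnown=*/true, GetCall);
  std::printf("expensive walks performed: %d\n", ExpensiveWalks); // prints 0
  return 0;
}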
Philip Reames 2022-03-28 20:29:41 -07:00
parent c0f90c84b1
commit 7c51669c21
2 changed files with 32 additions and 28 deletions

llvm/include/llvm/Transforms/Scalar/MemCpyOptimizer.h

@@ -61,7 +61,7 @@ private:
   bool processMemMove(MemMoveInst *M);
   bool performCallSlotOptzn(Instruction *cpyLoad, Instruction *cpyStore,
                             Value *cpyDst, Value *cpySrc, TypeSize cpyLen,
-                            Align cpyAlign, CallInst *C);
+                            Align cpyAlign, std::function<CallInst *()> GetC);
   bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
   bool processMemSetMemCpyDependence(MemCpyInst *MemCpy, MemSetInst *MemSet);
   bool performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, MemSetInst *MemSet);

llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp

@@ -761,27 +761,25 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
         // Detect cases where we're performing call slot forwarding, but
         // happen to be using a load-store pair to implement it, rather than
         // a memcpy.
-        CallInst *C = nullptr;
-        if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>(
-                MSSA->getWalker()->getClobberingMemoryAccess(LI))) {
-          // The load most post-dom the call. Limit to the same block for now.
-          // TODO: Support non-local call-slot optimization?
-          if (LoadClobber->getBlock() == SI->getParent())
-            C = dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst());
-        }
-        if (C) {
-          bool changed = performCallSlotOptzn(
-              LI, SI, SI->getPointerOperand()->stripPointerCasts(),
-              LI->getPointerOperand()->stripPointerCasts(),
-              DL.getTypeStoreSize(SI->getOperand(0)->getType()),
-              commonAlignment(SI->getAlign(), LI->getAlign()), C);
-          if (changed) {
-            eraseInstruction(SI);
-            eraseInstruction(LI);
-            ++NumMemCpyInstr;
-            return true;
-          }
-        }
+        auto GetCall = [&]() -> CallInst * {
+          // We defer this expensive clobber walk until the cheap checks
+          // have been done on the source inside performCallSlotOptzn.
+          if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>(
+                  MSSA->getWalker()->getClobberingMemoryAccess(LI)))
+            return dyn_cast_or_null<CallInst>(LoadClobber->getMemoryInst());
+          return nullptr;
+        };
+        bool changed = performCallSlotOptzn(
+            LI, SI, SI->getPointerOperand()->stripPointerCasts(),
+            LI->getPointerOperand()->stripPointerCasts(),
+            DL.getTypeStoreSize(SI->getOperand(0)->getType()),
+            commonAlignment(SI->getAlign(), LI->getAlign()), GetCall);
+        if (changed) {
+          eraseInstruction(SI);
+          eraseInstruction(LI);
+          ++NumMemCpyInstr;
+          return true;
+        }
       }
     }
   }
@@ -856,7 +854,8 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
 bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
                                          Instruction *cpyStore, Value *cpyDest,
                                          Value *cpySrc, TypeSize cpySize,
-                                         Align cpyAlign, CallInst *C) {
+                                         Align cpyAlign,
+                                         std::function<CallInst *()> GetC) {
   // The general transformation to keep in mind is
   //
   //   call @func(..., src, ...)
@@ -875,11 +874,6 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
   if (cpySize.isScalable())
     return false;
 
-  // Lifetime marks shouldn't be operated on.
-  if (Function *F = C->getCalledFunction())
-    if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
-      return false;
-
   // Require that src be an alloca. This simplifies the reasoning considerably.
   auto *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
   if (!srcAlloca)
@@ -896,6 +890,16 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
   if (cpySize < srcSize)
     return false;
 
+  CallInst *C = GetC();
+  if (!C)
+    return false;
+
+  // Lifetime marks shouldn't be operated on.
+  if (Function *F = C->getCalledFunction())
+    if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
+      return false;
+
   if (C->getParent() != cpyStore->getParent()) {
     LLVM_DEBUG(dbgs() << "Call Slot: block local restriction\n");
     return false;
@@ -1459,7 +1463,7 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
       if (performCallSlotOptzn(
               M, M, M->getDest(), M->getSource(),
               TypeSize::getFixed(CopySize->getZExtValue()), Alignment,
-              C)) {
+              [C]() -> CallInst * { return C; })) {
         LLVM_DEBUG(dbgs() << "Performed call slot optimization:\n"
                           << "    call: " << *C << "\n"
                           << "    memcpy: " << *M << "\n");