forked from OSchip/llvm-project
[OpenMPIRBuilder] introduce createStaticWorkshareLoop
Introduce a function that creates a statically-scheduled workshare loop out of a canonical loop created earlier by the OpenMPIRBuilder. This basically amounts to injecting runtime calls to the preheader and the after block and updating the trip count. Static scheduling kind is currently hardcoded and needs to be extracted from the runtime library into common TableGen definitions. Differential Revision: https://reviews.llvm.org/D92476
This commit is contained in:
parent
6249bfeefe
commit
c102c783cd
|
@ -260,6 +260,32 @@ public:
|
|||
Value *Start, Value *Stop, Value *Step,
|
||||
bool IsSigned, bool InclusiveStop);
|
||||
|
||||
/// Modifies the canonical loop to be a statically-scheduled workshare loop.
|
||||
///
|
||||
/// This takes a \p LoopInfo representing a canonical loop, such as the one
|
||||
/// created by \p createCanonicalLoop and emits additional instructions to
|
||||
/// turn it into a workshare loop. In particular, it calls to an OpenMP
|
||||
/// runtime function in the preheader to obtain the loop bounds to be used in
|
||||
/// the current thread, updates the relevant instructions in the canonical
|
||||
/// loop and calls to an OpenMP runtime finalization function after the loop.
|
||||
///
|
||||
/// \param Loc The source location description, the insertion location
|
||||
/// is not used.
|
||||
/// \param CLI A descriptor of the canonical loop to workshare.
|
||||
/// \param AllocaIP An insertion point for Alloca instructions usable in the
|
||||
/// preheader of the loop.
|
||||
/// \param NeedsBarrier Indicates whether a barrier must be inserted after
|
||||
/// the loop.
|
||||
/// \param Chunk The size of loop chunk considered as a unit when
|
||||
/// scheduling. If \p nullptr, defaults to 1.
|
||||
///
|
||||
/// \returns Updated CanonicalLoopInfo.
|
||||
CanonicalLoopInfo *createStaticWorkshareLoop(const LocationDescription &Loc,
|
||||
CanonicalLoopInfo *CLI,
|
||||
InsertPointTy AllocaIP,
|
||||
bool NeedsBarrier,
|
||||
Value *Chunk = nullptr);
|
||||
|
||||
/// Generator for '#omp flush'
|
||||
///
|
||||
/// \param Loc The location where the flush directive was encountered
|
||||
|
@ -636,7 +662,9 @@ private:
|
|||
/// | Cond---\
|
||||
/// | | |
|
||||
/// | Body |
|
||||
/// | | |
|
||||
/// | | | |
|
||||
/// | <...> |
|
||||
/// | | | |
|
||||
/// \--Latch |
|
||||
/// |
|
||||
/// Exit
|
||||
|
@ -644,7 +672,9 @@ private:
|
|||
/// After
|
||||
///
|
||||
/// Code in the header, condition block, latch and exit block must not have any
|
||||
/// side-effect.
|
||||
/// side-effect. The body block is the single entry point into the loop body,
|
||||
/// which may contain arbitrary control flow as long as all control paths
|
||||
/// eventually branch to the latch block.
|
||||
///
|
||||
/// Defined outside OpenMPIRBuilder because one cannot forward-declare nested
|
||||
/// classes.
|
||||
|
@ -701,7 +731,7 @@ public:
|
|||
/// statements/cancellations).
|
||||
BasicBlock *getAfter() const { return After; }
|
||||
|
||||
/// Returns the llvm::Value containing the number of loop iterations. I must
|
||||
/// Returns the llvm::Value containing the number of loop iterations. It must
|
||||
/// be valid in the preheader and always interpreted as an unsigned integer of
|
||||
/// any bit-width.
|
||||
Value *getTripCount() const {
|
||||
|
|
|
@ -999,6 +999,118 @@ CanonicalLoopInfo *OpenMPIRBuilder::createCanonicalLoop(
|
|||
return createCanonicalLoop(Builder.saveIP(), BodyGen, TripCount);
|
||||
}
|
||||
|
||||
// Returns an LLVM function to call for initializing loop bounds using OpenMP
|
||||
// static scheduling depending on `type`. Only i32 and i64 are supported by the
|
||||
// runtime. Always interpret integers as unsigned similarly to
|
||||
// CanonicalLoopInfo.
|
||||
static FunctionCallee getKmpcForStaticInitForType(Type *Ty, Module &M,
|
||||
OpenMPIRBuilder &OMPBuilder) {
|
||||
unsigned Bitwidth = Ty->getIntegerBitWidth();
|
||||
if (Bitwidth == 32)
|
||||
return OMPBuilder.getOrCreateRuntimeFunction(
|
||||
M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_4u);
|
||||
if (Bitwidth == 64)
|
||||
return OMPBuilder.getOrCreateRuntimeFunction(
|
||||
M, omp::RuntimeFunction::OMPRTL___kmpc_for_static_init_8u);
|
||||
llvm_unreachable("unknown OpenMP loop iterator bitwidth");
|
||||
}
|
||||
|
||||
// Sets the number of loop iterations to the given value. This value must be
|
||||
// valid in the condition block (i.e., defined in the preheader) and is
|
||||
// interpreted as an unsigned integer.
|
||||
void setCanonicalLoopTripCount(CanonicalLoopInfo *CLI, Value *TripCount) {
|
||||
Instruction *CmpI = &CLI->getCond()->front();
|
||||
assert(isa<CmpInst>(CmpI) && "First inst must compare IV with TripCount");
|
||||
CmpI->setOperand(1, TripCount);
|
||||
CLI->assertOK();
|
||||
}
|
||||
|
||||
// Turns the canonical loop described by `CLI` into a statically-scheduled
// workshare loop:
//   * allocas for the bounds exchanged with the runtime are created at
//     `AllocaIP`;
//   * the runtime "init" function is called at the end of the preheader and
//     the loop trip count is replaced by the per-thread trip count it yields;
//   * uses of the induction variable are shifted by the per-thread lower
//     bound;
//   * the runtime "fini" function is called in the exit block, followed by a
//     barrier when `NeedsBarrier` is set.
// If `Chunk` is null, a chunk size of 1 is used. Returns `CLI`, or nullptr if
// `Loc` cannot be used as a location.
CanonicalLoopInfo *OpenMPIRBuilder::createStaticWorkshareLoop(
    const LocationDescription &Loc, CanonicalLoopInfo *CLI,
    InsertPointTy AllocaIP, bool NeedsBarrier, Value *Chunk) {
  // Set up the source location value for OpenMP runtime.
  if (!updateToLocation(Loc))
    return nullptr;

  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
  Value *SrcLoc = getOrCreateIdent(SrcLocStr);

  // Declare useful OpenMP runtime functions.
  Value *IV = CLI->getIndVar();
  Type *IVTy = IV->getType();
  FunctionCallee StaticInit = getKmpcForStaticInitForType(IVTy, M, *this);
  FunctionCallee StaticFini =
      getOrCreateRuntimeFunction(M, omp::OMPRTL___kmpc_for_static_fini);

  // Allocate space for computed loop bounds as expected by the "init" function.
  Builder.restoreIP(AllocaIP);
  Type *I32Type = Type::getInt32Ty(M.getContext());
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");

  // At the end of the preheader, prepare for calling the "init" function by
  // storing the current loop bounds into the allocated space. A canonical loop
  // always iterates from 0 to trip-count with step 1. Note that "init" expects
  // and produces an inclusive upper bound.
  Builder.SetInsertPoint(CLI->getPreheader()->getTerminator());
  Constant *Zero = ConstantInt::get(IVTy, 0);
  Constant *One = ConstantInt::get(IVTy, 1);
  Builder.CreateStore(Zero, PLowerBound);
  Value *UpperBound = Builder.CreateSub(CLI->getTripCount(), One);
  Builder.CreateStore(UpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  if (!Chunk)
    Chunk = One;

  Value *ThreadNum = getOrCreateThreadID(SrcLoc);

  // TODO: extract scheduling type and map it to OMP constant. This is currently
  // happening in kmp.h and its ilk and needs to be moved to OpenMP.td first.
  constexpr int StaticSchedType = 34;
  Constant *SchedulingType = ConstantInt::get(I32Type, StaticSchedType);

  // Call the "init" function and update the trip count of the loop with the
  // value it produced.
  Builder.CreateCall(StaticInit,
                     {SrcLoc, ThreadNum, SchedulingType, PLastIter, PLowerBound,
                      PUpperBound, PStride, One, Chunk});
  // The bounds were allocated with type IVTy above, so load them back with an
  // explicit IVTy rather than relying on the pointer's pointee type.
  Value *LowerBound = Builder.CreateLoad(IVTy, PLowerBound);
  Value *InclusiveUpperBound = Builder.CreateLoad(IVTy, PUpperBound);
  Value *TripCountMinusOne = Builder.CreateSub(InclusiveUpperBound, LowerBound);
  Value *TripCount = Builder.CreateAdd(TripCountMinusOne, One);
  setCanonicalLoopTripCount(CLI, TripCount);

  // Update all uses of the induction variable except the one in the condition
  // block that compares it with the actual upper bound, and the increment in
  // the latch block.
  // TODO: this can eventually move to CanonicalLoopInfo or to a new
  // CanonicalLoopInfoUpdater interface.
  Builder.SetInsertPoint(CLI->getBody(), CLI->getBody()->getFirstInsertionPt());
  Value *UpdatedIV = Builder.CreateAdd(IV, LowerBound);
  IV->replaceUsesWithIf(UpdatedIV, [&](Use &U) {
    auto *Instr = dyn_cast<Instruction>(U.getUser());
    return !Instr ||
           (Instr->getParent() != CLI->getCond() &&
            Instr->getParent() != CLI->getLatch() && Instr != UpdatedIV);
  });

  // In the "exit" block, call the "fini" function.
  Builder.SetInsertPoint(CLI->getExit(),
                         CLI->getExit()->getTerminator()->getIterator());
  Builder.CreateCall(StaticFini, {SrcLoc, ThreadNum});

  // Add the barrier if requested. The barrier must follow the "fini" call, so
  // describe its location with the builder's current insertion point instead
  // of reusing Loc, whose insertion point designates where the directive was
  // encountered and is documented as unused by this function.
  // NOTE(review): assumes createBarrier emits at the insertion point of the
  // location it is given, and that LocationDescription exposes the debug
  // location as `DL` -- confirm against OMPIRBuilder.h.
  if (NeedsBarrier)
    createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                  omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
                  /* CheckCancelFlag */ false);

  CLI->assertOK();
  return CLI;
}
|
||||
|
||||
void CanonicalLoopInfo::eraseFromParent() {
|
||||
assert(IsValid && "can only erase previously valid loop cfg");
|
||||
IsValid = false;
|
||||
|
|
|
@ -1071,6 +1071,92 @@ TEST_F(OpenMPIRBuilderTest, CanonicalLoopBounds) {
|
|||
EXPECT_FALSE(verifyModule(*M, &errs()));
|
||||
}
|
||||
|
||||
// Checks the IR produced by createStaticWorkshareLoop for a canonical loop
// built from start 10, stop 52 (exclusive), step 2: the allocas for the
// runtime-provided bounds, the initial bounds stored in the preheader, the
// induction variable offset added in the body, and the recomputed trip count.
TEST_F(OpenMPIRBuilderTest, StaticWorkShareLoop) {
  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
  OpenMPIRBuilder OMPBuilder(*M);
  OMPBuilder.initialize();
  IRBuilder<> Builder(BB);
  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});

  Type *LCTy = Type::getInt32Ty(Ctx);
  Value *StartVal = ConstantInt::get(LCTy, 10);
  Value *StopVal = ConstantInt::get(LCTy, 52);
  Value *StepVal = ConstantInt::get(LCTy, 2);
  auto LoopBodyGen = [&](InsertPointTy, llvm::Value *) {};

  CanonicalLoopInfo *CLI = OMPBuilder.createCanonicalLoop(
      Loc, LoopBodyGen, StartVal, StopVal, StepVal,
      /*IsSigned=*/false, /*InclusiveStop=*/false);

  Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
  InsertPointTy AllocaIP = Builder.saveIP();

  CLI = OMPBuilder.createStaticWorkshareLoop(Loc, CLI, AllocaIP,
                                             /*NeedsBarrier=*/true);
  // createStaticWorkshareLoop returns nullptr on an unusable location; stop
  // here instead of dereferencing a null CLI below.
  ASSERT_NE(CLI, nullptr);

  // The four allocas are expected at the very beginning of the entry block.
  auto AllocaIter = BB->begin();
  ASSERT_GE(std::distance(BB->begin(), BB->end()), 4);
  AllocaInst *PLastIter = dyn_cast<AllocaInst>(&*(AllocaIter++));
  AllocaInst *PLowerBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
  AllocaInst *PUpperBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
  AllocaInst *PStride = dyn_cast<AllocaInst>(&*(AllocaIter++));
  EXPECT_NE(PLastIter, nullptr);
  EXPECT_NE(PLowerBound, nullptr);
  EXPECT_NE(PUpperBound, nullptr);
  EXPECT_NE(PStride, nullptr);

  // The preheader starts with the stores of the original bounds and stride.
  auto PreheaderIter = CLI->getPreheader()->begin();
  ASSERT_GE(
      std::distance(CLI->getPreheader()->begin(), CLI->getPreheader()->end()),
      7);
  StoreInst *LowerBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
  StoreInst *UpperBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
  StoreInst *StrideStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
  ASSERT_NE(LowerBoundStore, nullptr);
  ASSERT_NE(UpperBoundStore, nullptr);
  ASSERT_NE(StrideStore, nullptr);

  auto *OrigLowerBound =
      dyn_cast<ConstantInt>(LowerBoundStore->getValueOperand());
  auto *OrigUpperBound =
      dyn_cast<ConstantInt>(UpperBoundStore->getValueOperand());
  auto *OrigStride = dyn_cast<ConstantInt>(StrideStore->getValueOperand());
  ASSERT_NE(OrigLowerBound, nullptr);
  ASSERT_NE(OrigUpperBound, nullptr);
  ASSERT_NE(OrigStride, nullptr);
  EXPECT_EQ(OrigLowerBound->getValue(), 0);
  EXPECT_EQ(OrigUpperBound->getValue(), 20);
  EXPECT_EQ(OrigStride->getValue(), 1);

  // Check that the loop IV is updated to account for the lower bound returned
  // by the OpenMP runtime call.
  BinaryOperator *Add = dyn_cast<BinaryOperator>(&CLI->getBody()->front());
  // Add is dereferenced below, so a failed cast must abort the test.
  ASSERT_NE(Add, nullptr);
  EXPECT_EQ(Add->getOperand(0), CLI->getIndVar());
  auto *LoadedLowerBound = dyn_cast<LoadInst>(Add->getOperand(1));
  ASSERT_NE(LoadedLowerBound, nullptr);
  EXPECT_EQ(LoadedLowerBound->getPointerOperand(), PLowerBound);

  // Check that the trip count is updated to account for the lower and upper
  // bounds returned by the OpenMP runtime call.
  auto *AddOne = dyn_cast<Instruction>(CLI->getTripCount());
  ASSERT_NE(AddOne, nullptr);
  ASSERT_TRUE(AddOne->isBinaryOp());
  auto *One = dyn_cast<ConstantInt>(AddOne->getOperand(1));
  ASSERT_NE(One, nullptr);
  EXPECT_EQ(One->getValue(), 1);
  auto *Difference = dyn_cast<Instruction>(AddOne->getOperand(0));
  ASSERT_NE(Difference, nullptr);
  ASSERT_TRUE(Difference->isBinaryOp());
  EXPECT_EQ(Difference->getOperand(1), LoadedLowerBound);
  auto *LoadedUpperBound = dyn_cast<LoadInst>(Difference->getOperand(0));
  ASSERT_NE(LoadedUpperBound, nullptr);
  EXPECT_EQ(LoadedUpperBound->getPointerOperand(), PUpperBound);

  // The original loop iterator should only be used in the condition, in the
  // increment and in the statement that adds the lower bound to it.
  Value *IV = CLI->getIndVar();
  EXPECT_EQ(std::distance(IV->use_begin(), IV->use_end()), 3);
}
|
||||
|
||||
TEST_F(OpenMPIRBuilderTest, MasterDirective) {
|
||||
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
|
||||
OpenMPIRBuilder OMPBuilder(*M);
|
||||
|
|
Loading…
Reference in New Issue