[OpenMP IRBuilder, MLIR] Add support for OpenMP do schedule dynamic

The implementation supports static schedule for Fortran do loops. This
implements the dynamic variant of the same concept.

Reviewed By: Meinersbur

Differential Revision: https://reviews.llvm.org/D97393
This commit is contained in:
Mats Petersson 2021-04-16 15:08:56 +01:00 committed by Kiran Chandramohan
parent 8628ed0310
commit 517c3aee4d
5 changed files with 302 additions and 14 deletions

View File

@ -107,6 +107,17 @@ inline std::string getAllAssumeClauseOptions() {
return S + "'";
}
/// Scheduling kinds that are passed, as an integer, in the scheduling-type
/// argument of the OpenMP runtime's worksharing-loop init calls.
///
/// \note The numeric values must stay in sync with `enum sched_type` in kmp.h.
/// TODO: Make kmp.h include this file and drop its own copy of these values.
/// Completing that requires moving the remaining enumerators here as well.
enum class OMPScheduleType {
  /// Static schedule, unspecialized.
  Static = 34,

  /// Dynamic schedule with a chunk size.
  DynamicChunked = 35,

  /// Modifier bit: set if the nonmonotonic schedule modifier was present.
  /// It is OR-ed onto one of the schedule kinds above.
  ModifierNonmonotonic = 1 << 30,

  LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierNonmonotonic)
};
} // end namespace omp
} // end namespace llvm

View File

@ -355,7 +355,7 @@ public:
/// \param CLI A descriptor of the canonical loop to workshare.
/// \param AllocaIP An insertion point for Alloca instructions usable in the
/// preheader of the loop.
/// \param NeedsBarrier Indicates whether a barrier must be insterted after
/// \param NeedsBarrier Indicates whether a barrier must be inserted after
/// the loop.
/// \param Chunk The size of loop chunk considered as a unit when
/// scheduling. If \p nullptr, defaults to 1.
@ -367,6 +367,30 @@ public:
bool NeedsBarrier,
Value *Chunk = nullptr);
/// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
///
/// This takes a \p CLI representing a canonical loop, such as the one
/// created by \p createCanonicalLoop, and emits additional instructions to
/// turn it into a workshare loop. In particular, it calls an OpenMP runtime
/// function in the preheader to initialize the dispatch, and wraps the loop
/// in an outer loop that calls a second runtime function to obtain the
/// bounds of the next chunk of iterations to execute.
///
/// After this call \p CLI no longer describes a valid canonical loop, so any
/// information needed from it must be queried beforehand.
///
/// \param Loc The source location description, the insertion location
/// is not used.
/// \param CLI A descriptor of the canonical loop to workshare.
/// \param AllocaIP An insertion point for Alloca instructions usable in the
/// preheader of the loop.
/// \param NeedsBarrier Indicates whether a barrier must be inserted after
/// the loop.
/// \param Chunk The size of loop chunk considered as a unit when
/// scheduling. If \p nullptr, defaults to 1.
///
/// \returns Point where to insert code after the loop.
InsertPointTy createDynamicWorkshareLoop(const LocationDescription &Loc,
CanonicalLoopInfo *CLI,
InsertPointTy AllocaIP,
bool NeedsBarrier,
Value *Chunk = nullptr);
/// Modifies the canonical loop to be a workshare loop.
///
/// This takes a \p LoopInfo representing a canonical loop, such as the one

View File

@ -1168,10 +1168,8 @@ CanonicalLoopInfo *OpenMPIRBuilder::createStaticWorkshareLoop(
Value *ThreadNum = getOrCreateThreadID(SrcLoc);
// TODO: extract scheduling type and map it to OMP constant. This is currently
// happening in kmp.h and its ilk and needs to be moved to OpenMP.td first.
constexpr int StaticSchedType = 34;
Constant *SchedulingType = ConstantInt::get(I32Type, StaticSchedType);
Constant *SchedulingType =
ConstantInt::get(I32Type, static_cast<int>(OMPScheduleType::Static));
// Call the "init" function and update the trip count of the loop with the
// value it produced.
@ -1220,6 +1218,148 @@ CanonicalLoopInfo *OpenMPIRBuilder::createWorkshareLoop(
return createStaticWorkshareLoop(Loc, CLI, AllocaIP, NeedsBarrier);
}
/// Looks up the OpenMP runtime entry point that initializes dynamic loop
/// dispatch for an induction variable of type \p Ty.
///
/// The runtime only offers 32-bit and 64-bit variants; any other width is a
/// programming error. The unsigned variants are always selected, similarly to
/// how CanonicalLoopInfo interprets its integers.
///
/// \param Ty         Integer type of the loop's induction variable.
/// \param M          Module in which the runtime function is declared.
/// \param OMPBuilder Builder used to get or create the declaration.
static FunctionCallee
getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
  omp::RuntimeFunction InitFn;
  switch (Ty->getIntegerBitWidth()) {
  case 32:
    InitFn = omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u;
    break;
  case 64:
    InitFn = omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u;
    break;
  default:
    llvm_unreachable("unknown OpenMP loop iterator bitwidth");
  }
  return OMPBuilder.getOrCreateRuntimeFunction(M, InitFn);
}
/// Looks up the OpenMP runtime entry point that hands out the next chunk of a
/// dynamically scheduled loop whose induction variable has type \p Ty.
///
/// The runtime only offers 32-bit and 64-bit variants; any other width is a
/// programming error. The unsigned variants are always selected, similarly to
/// how CanonicalLoopInfo interprets its integers.
///
/// \param Ty         Integer type of the loop's induction variable.
/// \param M          Module in which the runtime function is declared.
/// \param OMPBuilder Builder used to get or create the declaration.
static FunctionCallee
getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
  omp::RuntimeFunction NextFn;
  switch (Ty->getIntegerBitWidth()) {
  case 32:
    NextFn = omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u;
    break;
  case 64:
    NextFn = omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u;
    break;
  default:
    llvm_unreachable("unknown OpenMP loop iterator bitwidth");
  }
  return OMPBuilder.getOrCreateRuntimeFunction(M, NextFn);
}
/// Rewrites the canonical loop \p CLI into a dynamically scheduled workshare
/// loop.
///
/// The preheader gains a call to the runtime "init" function, and a new
/// ".outer.cond" block is wrapped around the loop: it calls the runtime
/// "next" function and either re-enters the original loop with the bounds of
/// the chunk it was given or leaves through the original exit block.
/// \p CLI is no longer a valid canonical loop once this returns.
///
/// \param Loc          Source location; only its debug location is used.
/// \param CLI          The canonical loop to transform.
/// \param AllocaIP     Where the allocas for the chunk bounds are emitted.
/// \param NeedsBarrier Whether to emit a barrier in the exit block.
/// \param Chunk        Chunk size; a constant 1 is used when null.
///
/// \returns The "after" insertion point that \p CLI reported on entry.
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createDynamicWorkshareLoop(
    const LocationDescription &Loc, CanonicalLoopInfo *CLI,
    InsertPointTy AllocaIP, bool NeedsBarrier, Value *Chunk) {
  // Source location value handed to every runtime call below.
  Builder.SetCurrentDebugLocation(Loc.DL);
  Value *SrcLoc = getOrCreateIdent(getOrCreateSrcLocStr(Loc));

  // The runtime entry points are selected by the induction variable's width.
  Type *IVTy = CLI->getIndVar()->getType();
  FunctionCallee DispatchInit = getKmpcForDynamicInitForType(IVTy, M, *this);
  FunctionCallee DispatchNext = getKmpcForDynamicNextForType(IVTy, M, *this);

  // Stack slots whose addresses are passed to "next"; the lower and upper
  // bound slots are read back below for every chunk.
  Builder.restoreIP(AllocaIP);
  Type *Int32Ty = Type::getInt32Ty(M.getContext());
  Value *LastIterPtr = Builder.CreateAlloca(Int32Ty, nullptr, "p.lastiter");
  Value *LowerBoundPtr = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
  Value *UpperBoundPtr = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
  Value *StridePtr = Builder.CreateAlloca(IVTy, nullptr, "p.stride");

  // At the end of the preheader, seed the slots with the full iteration
  // space. A canonical loop counts from 0 to trip-count with step 1; the
  // runtime is given the same space as lower bound 1 and an inclusive upper
  // bound equal to the trip count.
  BasicBlock *PreHeader = CLI->getPreheader();
  Builder.SetInsertPoint(PreHeader->getTerminator());
  Constant *One = ConstantInt::get(IVTy, 1);
  Value *TripCount = CLI->getTripCount();
  Builder.CreateStore(One, LowerBoundPtr);
  Builder.CreateStore(TripCount, UpperBoundPtr);
  Builder.CreateStore(One, StridePtr);

  // Capture everything still needed from the CLI now: the rewiring below
  // breaks it, as the loop stops being a valid canonical loop.
  BasicBlock *Header = CLI->getHeader();
  BasicBlock *Exit = CLI->getExit();
  BasicBlock *Cond = CLI->getCond();
  InsertPointTy AfterIP = CLI->getAfterIP();

  if (!Chunk)
    Chunk = One;

  Value *ThreadNum = getOrCreateThreadID(SrcLoc);

  const OMPScheduleType SchedKind =
      OMPScheduleType::DynamicChunked | OMPScheduleType::ModifierNonmonotonic;
  Constant *SchedulingType =
      ConstantInt::get(Int32Ty, static_cast<int>(SchedKind));

  // "init" receives the iteration space by value.
  Builder.CreateCall(DispatchInit,
                     {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
                      TripCount, /* step */ One, Chunk});

  // New block that drives an outer loop around the existing one.
  BasicBlock *OuterCond = BasicBlock::Create(
      PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
      PreHeader->getParent());
  Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
  Value *NextResult =
      Builder.CreateCall(DispatchNext, {SrcLoc, ThreadNum, LastIterPtr,
                                        LowerBoundPtr, UpperBoundPtr,
                                        StridePtr});
  // The result of "next" is compared against a 32-bit zero, irrespective of
  // the induction variable type; a non-zero result means there is more work.
  Constant *Zero32 = ConstantInt::get(Int32Ty, 0);
  Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, NextResult, Zero32);
  // Shift the chunk's lower bound down by one to get the 0-based start value
  // for the induction variable.
  Value *ChunkLowerBoundRaw = Builder.CreateLoad(IVTy, LowerBoundPtr);
  Value *ChunkStart = Builder.CreateSub(ChunkLowerBoundRaw, One, "lb");
  Builder.CreateCondBr(MoreWork, Header, Exit);

  // The induction variable PHI is the first instruction of the header; make
  // its entry edge come from the outer condition and start at the chunk's
  // lower bound.
  auto *IndVarPhi = cast<PHINode>(&Header->front());
  IndVarPhi->setIncomingBlock(0, OuterCond);
  IndVarPhi->setIncomingValue(0, ChunkStart);

  // The preheader now falls into the outer condition instead of the header.
  cast<BranchInst>(PreHeader->getTerminator())->setSuccessor(0, OuterCond);

  // Inner condition: compare against the upper bound of the current chunk,
  // reloaded at the top of the condition block, instead of the trip count.
  Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
  Value *ChunkUpperBound = Builder.CreateLoad(IVTy, UpperBoundPtr, "ub");
  auto *InnerCompare = cast<CmpInst>(&*Builder.GetInsertPoint());
  InnerCompare->setOperand(1, ChunkUpperBound);

  // When a chunk is finished, go back to the outer condition for the next
  // one rather than leaving the loop.
  auto *BI = cast<BranchInst>(&Cond->back());
  assert(BI->getSuccessor(1) == Exit);
  BI->setSuccessor(1, OuterCond);

  // Optionally emit the barrier in front of the exit block's last
  // instruction.
  if (NeedsBarrier) {
    Builder.SetInsertPoint(&Exit->back());
    createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                  omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
                  /* CheckCancelFlag */ false);
  }
  return AfterIP;
}
/// Make \p Source branch to \p Target.
///
/// Handles two situations:
@ -1901,7 +2041,7 @@ CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
Function *Fn =
getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
return Builder.CreateCall(Fn, Args);
}

View File

@ -1708,6 +1708,105 @@ TEST_F(OpenMPIRBuilderTest, StaticWorkShareLoop) {
EXPECT_EQ(NumCallsInExitBlock, 3u);
}
// Checks the IR produced by createDynamicWorkshareLoop for a simple canonical
// loop: the allocas at the alloca insertion point, the stores and runtime
// calls added to the preheader, the barrier in the exit block, and that the
// resulting module passes the verifier.
TEST_F(OpenMPIRBuilderTest, DynamicWorkShareLoop) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);
OMPBuilder.initialize();
IRBuilder<> Builder(BB);
OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});
// Unsigned i32 loop from 10 up to (but excluding) 52 in steps of 2, i.e.
// (52 - 10) / 2 = 21 iterations, scheduled with a chunk size of 7.
Type *LCTy = Type::getInt32Ty(Ctx);
Value *StartVal = ConstantInt::get(LCTy, 10);
Value *StopVal = ConstantInt::get(LCTy, 52);
Value *StepVal = ConstantInt::get(LCTy, 2);
Value *ChunkVal = ConstantInt::get(LCTy, 7);
// The loop body is left empty; only the loop structure is under test.
auto LoopBodyGen = [&](InsertPointTy, llvm::Value *) {};
CanonicalLoopInfo *CLI = OMPBuilder.createCanonicalLoop(
Loc, LoopBodyGen, StartVal, StopVal, StepVal,
/*IsSigned=*/false, /*InclusiveStop=*/false);
// Allocas go to the very beginning of BB, which is why they are expected to
// be the first instructions of BB further down.
Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
InsertPointTy AllocaIP = Builder.saveIP();
// Collect all the info from CLI, as it isn't usable after the call to
// createDynamicWorkshareLoop.
InsertPointTy AfterIP = CLI->getAfterIP();
BasicBlock *Preheader = CLI->getPreheader();
BasicBlock *ExitBlock = CLI->getExit();
Value *IV = CLI->getIndVar();
InsertPointTy EndIP =
OMPBuilder.createDynamicWorkshareLoop(Loc, CLI, AllocaIP,
/*NeedsBarrier=*/true, ChunkVal);
// The returned value should be the "after" point.
ASSERT_EQ(EndIP.getBlock(), AfterIP.getBlock());
ASSERT_EQ(EndIP.getPoint(), AfterIP.getPoint());
// BB must start with the four allocas, in the order in which they are
// created: last-iteration flag, lower bound, upper bound, stride.
auto AllocaIter = BB->begin();
ASSERT_GE(std::distance(BB->begin(), BB->end()), 4);
AllocaInst *PLastIter = dyn_cast<AllocaInst>(&*(AllocaIter++));
AllocaInst *PLowerBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
AllocaInst *PUpperBound = dyn_cast<AllocaInst>(&*(AllocaIter++));
AllocaInst *PStride = dyn_cast<AllocaInst>(&*(AllocaIter++));
EXPECT_NE(PLastIter, nullptr);
EXPECT_NE(PLowerBound, nullptr);
EXPECT_NE(PUpperBound, nullptr);
EXPECT_NE(PStride, nullptr);
// The preheader is expected to hold at least six instructions; the first
// five are checked below: three stores, the thread-id call and the
// dispatch-init call.
auto PreheaderIter = Preheader->begin();
ASSERT_GE(std::distance(Preheader->begin(), Preheader->end()), 6);
StoreInst *LowerBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
StoreInst *UpperBoundStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
StoreInst *StrideStore = dyn_cast<StoreInst>(&*(PreheaderIter++));
ASSERT_NE(LowerBoundStore, nullptr);
ASSERT_NE(UpperBoundStore, nullptr);
ASSERT_NE(StrideStore, nullptr);
CallInst *ThreadIdCall = dyn_cast<CallInst>(&*(PreheaderIter++));
ASSERT_NE(ThreadIdCall, nullptr);
EXPECT_EQ(ThreadIdCall->getCalledFunction()->getName(),
"__kmpc_global_thread_num");
// The induction variable is i32, so the 32-bit unsigned init variant must be
// called; its seventh argument (index 6) is the chunk size.
CallInst *InitCall = dyn_cast<CallInst>(&*PreheaderIter);
ASSERT_NE(InitCall, nullptr);
EXPECT_EQ(InitCall->getCalledFunction()->getName(),
"__kmpc_dispatch_init_4u");
EXPECT_EQ(InitCall->getNumArgOperands(), 7U);
EXPECT_EQ(InitCall->getArgOperand(6),
ConstantInt::get(Type::getInt32Ty(Ctx), 7));
// The stored bounds describe the whole iteration space as lower bound 1,
// upper bound 21 (the trip count) and stride 1.
ConstantInt *OrigLowerBound =
dyn_cast<ConstantInt>(LowerBoundStore->getValueOperand());
ConstantInt *OrigUpperBound =
dyn_cast<ConstantInt>(UpperBoundStore->getValueOperand());
ConstantInt *OrigStride =
dyn_cast<ConstantInt>(StrideStore->getValueOperand());
ASSERT_NE(OrigLowerBound, nullptr);
ASSERT_NE(OrigUpperBound, nullptr);
ASSERT_NE(OrigStride, nullptr);
EXPECT_EQ(OrigLowerBound->getValue(), 1);
EXPECT_EQ(OrigUpperBound->getValue(), 21);
EXPECT_EQ(OrigStride->getValue(), 1);
// The original loop iterator should have exactly three uses.
// NOTE(review): the previous wording listed the condition, the increment and
// "the statement that adds the lower bound to it", which looks carried over
// from the static workshare test. The dynamic lowering rewrites the PHI's
// incoming value rather than adding a lower bound, so the third use is
// presumably emitted by createCanonicalLoop -- confirm.
EXPECT_EQ(std::distance(IV->use_begin(), IV->use_end()), 3);
// The exit block should contain the barrier call, plus the call to obtain
// the thread ID.
size_t NumCallsInExitBlock =
count_if(*ExitBlock, [](Instruction &I) { return isa<CallInst>(I); });
EXPECT_EQ(NumCallsInExitBlock, 2u);
// Add a termination to our block and check that it is internally consistent.
Builder.restoreIP(EndIP);
Builder.CreateRetVoid();
OMPBuilder.finalize();
EXPECT_FALSE(verifyModule(*M, &errs()));
}
TEST_F(OpenMPIRBuilderTest, MasterDirective) {
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
OpenMPIRBuilder OMPBuilder(*M);

View File

@ -179,11 +179,17 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
if (loop.getNumLoops() != 1)
return opInst.emitOpError("collapsed loops not yet supported");
if (loop.schedule_val().hasValue() &&
omp::symbolizeClauseScheduleKind(loop.schedule_val().getValue()) !=
omp::ClauseScheduleKind::Static)
return opInst.emitOpError(
"only static (default) loop schedule is currently supported");
bool isStatic = true;
if (loop.schedule_val().hasValue()) {
auto schedule =
omp::symbolizeClauseScheduleKind(loop.schedule_val().getValue());
if (schedule != omp::ClauseScheduleKind::Static &&
schedule != omp::ClauseScheduleKind::Dynamic)
return opInst.emitOpError("only static (default) and dynamic loop "
"schedule is currently supported");
isStatic = (schedule == omp::ClauseScheduleKind::Static);
}
// Find the loop configuration.
llvm::Value *lowerBound = moduleTranslation.lookupValue(loop.lowerBound()[0]);
@ -241,11 +247,19 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
// Put them at the start of the current block for now.
llvm::OpenMPIRBuilder::InsertPointTy allocaIP(
insertBlock, insertBlock->getFirstInsertionPt());
loopInfo = moduleTranslation.getOpenMPBuilder()->createStaticWorkshareLoop(
ompLoc, loopInfo, allocaIP, !loop.nowait(), chunk);
llvm::OpenMPIRBuilder::InsertPointTy afterIP;
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
if (isStatic) {
loopInfo = ompBuilder->createStaticWorkshareLoop(ompLoc, loopInfo, allocaIP,
!loop.nowait(), chunk);
afterIP = loopInfo->getAfterIP();
} else {
afterIP = ompBuilder->createDynamicWorkshareLoop(ompLoc, loopInfo, allocaIP,
!loop.nowait(), chunk);
}
// Continue building IR after the loop.
builder.restoreIP(loopInfo->getAfterIP());
builder.restoreIP(afterIP);
return success();
}