forked from OSchip/llvm-project
[OpenMP IRBuilder, MLIR] Add support for OpenMP do schedule dynamic
The existing implementation supports only the static schedule for Fortran do loops. This commit implements the dynamic variant of the same concept. Reviewed By: Meinersbur. Differential Revision: https://reviews.llvm.org/D97393
This commit is contained in:
parent
8628ed0310
commit
517c3aee4d
|
@ -107,6 +107,17 @@ inline std::string getAllAssumeClauseOptions() {
|
|||
return S + "'";
|
||||
}
|
||||
|
||||
/// \note This needs to be kept in sync with kmp.h enum sched_type.
|
||||
/// Todo: Update kmp.h to include this file, and remove the enums in kmp.h
|
||||
/// To complete this, more enum values will need to be moved here.
|
||||
enum class OMPScheduleType {
|
||||
Static = 34, /**< static unspecialized */
|
||||
DynamicChunked = 35,
|
||||
ModifierNonmonotonic =
|
||||
(1 << 30), /**< Set if the nonmonotonic schedule modifier was present */
|
||||
LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue */ ModifierNonmonotonic)
|
||||
};
|
||||
|
||||
} // end namespace omp
|
||||
|
||||
} // end namespace llvm
|
||||
|
|
|
@ -355,7 +355,7 @@ public:
|
|||
/// \param CLI A descriptor of the canonical loop to workshare.
|
||||
/// \param AllocaIP An insertion point for Alloca instructions usable in the
|
||||
/// preheader of the loop.
|
||||
/// \param NeedsBarrier Indicates whether a barrier must be insterted after
|
||||
/// \param NeedsBarrier Indicates whether a barrier must be inserted after
|
||||
/// the loop.
|
||||
/// \param Chunk The size of loop chunk considered as a unit when
|
||||
/// scheduling. If \p nullptr, defaults to 1.
|
||||
|
@ -367,6 +367,30 @@ public:
|
|||
bool NeedsBarrier,
|
||||
Value *Chunk = nullptr);
|
||||
|
||||
/// Modifies the canonical loop to be a dynamically-scheduled workshare loop.
|
||||
///
|
||||
/// This takes a \p LoopInfo representing a canonical loop, such as the one
|
||||
/// created by \p createCanonicalLoop and emits additional instructions to
|
||||
/// turn it into a workshare loop. In particular, it calls to an OpenMP
|
||||
/// runtime function in the preheader to obtain, and then in each iteration
|
||||
/// to update the loop counter.
|
||||
/// \param Loc The source location description, the insertion location
|
||||
/// is not used.
|
||||
/// \param CLI A descriptor of the canonical loop to workshare.
|
||||
/// \param AllocaIP An insertion point for Alloca instructions usable in the
|
||||
/// preheader of the loop.
|
||||
/// \param NeedsBarrier Indicates whether a barrier must be inserted after
|
||||
/// the loop.
|
||||
/// \param Chunk The size of loop chunk considered as a unit when
|
||||
/// scheduling. If \p nullptr, defaults to 1.
|
||||
///
|
||||
/// \returns Point where to insert code after the loop.
|
||||
InsertPointTy createDynamicWorkshareLoop(const LocationDescription &Loc,
|
||||
CanonicalLoopInfo *CLI,
|
||||
InsertPointTy AllocaIP,
|
||||
bool NeedsBarrier,
|
||||
Value *Chunk = nullptr);
|
||||
|
||||
/// Modifies the canonical loop to be a workshare loop.
|
||||
///
|
||||
/// This takes a \p LoopInfo representing a canonical loop, such as the one
|
||||
|
|
|
@ -1168,10 +1168,8 @@ CanonicalLoopInfo *OpenMPIRBuilder::createStaticWorkshareLoop(
|
|||
|
||||
Value *ThreadNum = getOrCreateThreadID(SrcLoc);
|
||||
|
||||
// TODO: extract scheduling type and map it to OMP constant. This is currently
|
||||
// happening in kmp.h and its ilk and needs to be moved to OpenMP.td first.
|
||||
constexpr int StaticSchedType = 34;
|
||||
Constant *SchedulingType = ConstantInt::get(I32Type, StaticSchedType);
|
||||
Constant *SchedulingType =
|
||||
ConstantInt::get(I32Type, static_cast<int>(OMPScheduleType::Static));
|
||||
|
||||
// Call the "init" function and update the trip count of the loop with the
|
||||
// value it produced.
|
||||
|
@ -1220,6 +1218,148 @@ CanonicalLoopInfo *OpenMPIRBuilder::createWorkshareLoop(
|
|||
return createStaticWorkshareLoop(Loc, CLI, AllocaIP, NeedsBarrier);
|
||||
}
|
||||
|
||||
/// Returns an LLVM function to call for initializing loop bounds using OpenMP
|
||||
/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
|
||||
/// the runtime. Always interpret integers as unsigned similarly to
|
||||
/// CanonicalLoopInfo.
|
||||
static FunctionCallee
|
||||
getKmpcForDynamicInitForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
|
||||
unsigned Bitwidth = Ty->getIntegerBitWidth();
|
||||
if (Bitwidth == 32)
|
||||
return OMPBuilder.getOrCreateRuntimeFunction(
|
||||
M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_4u);
|
||||
if (Bitwidth == 64)
|
||||
return OMPBuilder.getOrCreateRuntimeFunction(
|
||||
M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_init_8u);
|
||||
llvm_unreachable("unknown OpenMP loop iterator bitwidth");
|
||||
}
|
||||
|
||||
/// Returns an LLVM function to call for updating the next loop using OpenMP
|
||||
/// dynamic scheduling depending on `type`. Only i32 and i64 are supported by
|
||||
/// the runtime. Always interpret integers as unsigned similarly to
|
||||
/// CanonicalLoopInfo.
|
||||
static FunctionCallee
|
||||
getKmpcForDynamicNextForType(Type *Ty, Module &M, OpenMPIRBuilder &OMPBuilder) {
|
||||
unsigned Bitwidth = Ty->getIntegerBitWidth();
|
||||
if (Bitwidth == 32)
|
||||
return OMPBuilder.getOrCreateRuntimeFunction(
|
||||
M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_4u);
|
||||
if (Bitwidth == 64)
|
||||
return OMPBuilder.getOrCreateRuntimeFunction(
|
||||
M, omp::RuntimeFunction::OMPRTL___kmpc_dispatch_next_8u);
|
||||
llvm_unreachable("unknown OpenMP loop iterator bitwidth");
|
||||
}
|
||||
|
||||
/// Rewrites the canonical loop \p CLI into a dynamically-scheduled workshare
/// loop.
///
/// A call to the dispatch "init" runtime function is emitted at the end of the
/// preheader, and a new ".outer.cond" block is created that calls the dispatch
/// "next" runtime function. While "next" returns non-zero, the original loop
/// is re-entered with its induction variable starting at the returned lower
/// bound (minus one) and its condition comparing against the returned upper
/// bound; once "next" returns zero, control goes to the loop's exit block.
///
/// \param Loc          Supplies the debug location and the source-location
///                     ident passed to the runtime.
/// \param CLI          Canonical loop to transform. It is no longer a valid
///                     canonical loop afterwards.
/// \param AllocaIP     Where the allocas for the runtime's in/out slots are
///                     emitted.
/// \param NeedsBarrier If true, a barrier is emitted in the exit block.
/// \param Chunk        Chunk size passed to "init"; defaults to 1 when null.
///                     NOTE(review): the value is passed through uncast, so
///                     it presumably has to have the induction variable's
///                     type -- confirm with callers.
///
/// \returns The "after" insertion point of \p CLI, captured before the loop is
///          modified.
OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::createDynamicWorkshareLoop(
    const LocationDescription &Loc, CanonicalLoopInfo *CLI,
    InsertPointTy AllocaIP, bool NeedsBarrier, Value *Chunk) {
  assert(CLI && "Expected a canonical loop to workshare");

  // Set up the source location value for OpenMP runtime.
  Builder.SetCurrentDebugLocation(Loc.DL);

  Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
  Value *SrcLoc = getOrCreateIdent(SrcLocStr);

  // Declare useful OpenMP runtime functions.
  Value *IV = CLI->getIndVar();
  Type *IVTy = IV->getType();
  FunctionCallee DynamicInit = getKmpcForDynamicInitForType(IVTy, M, *this);
  FunctionCallee DynamicNext = getKmpcForDynamicNextForType(IVTy, M, *this);

  // Allocate the slots that the "next" function fills in for every chunk.
  Builder.restoreIP(AllocaIP);
  Type *I32Type = Type::getInt32Ty(M.getContext());
  Value *PLastIter = Builder.CreateAlloca(I32Type, nullptr, "p.lastiter");
  Value *PLowerBound = Builder.CreateAlloca(IVTy, nullptr, "p.lowerbound");
  Value *PUpperBound = Builder.CreateAlloca(IVTy, nullptr, "p.upperbound");
  Value *PStride = Builder.CreateAlloca(IVTy, nullptr, "p.stride");

  // At the end of the preheader, prepare for calling the "init" function by
  // storing the current loop bounds into the allocated space. A canonical loop
  // always iterates from 0 to trip-count with step 1, but the iteration space
  // is described to the runtime as the inclusive range [1, trip-count]; the
  // lower bound produced by "next" is shifted back by one further below.
  BasicBlock *PreHeader = CLI->getPreheader();
  Builder.SetInsertPoint(PreHeader->getTerminator());
  Constant *One = ConstantInt::get(IVTy, 1);
  Builder.CreateStore(One, PLowerBound);
  Value *UpperBound = CLI->getTripCount();
  Builder.CreateStore(UpperBound, PUpperBound);
  Builder.CreateStore(One, PStride);

  // Capture everything needed from the CLI up front: the CLI will be "broken"
  // in the code below, as the loop is no longer a valid canonical loop.
  BasicBlock *Header = CLI->getHeader();
  BasicBlock *Exit = CLI->getExit();
  BasicBlock *Cond = CLI->getCond();
  InsertPointTy AfterIP = CLI->getAfterIP();

  if (!Chunk)
    Chunk = One;

  Value *ThreadNum = getOrCreateThreadID(SrcLoc);

  OMPScheduleType DynamicSchedType =
      OMPScheduleType::DynamicChunked | OMPScheduleType::ModifierNonmonotonic;
  Constant *SchedulingType =
      ConstantInt::get(I32Type, static_cast<int>(DynamicSchedType));

  // Call the "init" function.
  Builder.CreateCall(DynamicInit,
                     {SrcLoc, ThreadNum, SchedulingType, /* LowerBound */ One,
                      UpperBound, /* step */ One, Chunk});

  // An outer loop around the existing one.
  BasicBlock *OuterCond = BasicBlock::Create(
      PreHeader->getContext(), Twine(PreHeader->getName()) + ".outer.cond",
      PreHeader->getParent());
  Builder.SetInsertPoint(OuterCond, OuterCond->getFirstInsertionPt());
  Value *Res =
      Builder.CreateCall(DynamicNext, {SrcLoc, ThreadNum, PLastIter,
                                       PLowerBound, PUpperBound, PStride});
  // The result of "next" is compared against a 32-bit zero regardless of the
  // induction variable's type, so the zero cannot be built from IVTy.
  Constant *Zero32 = ConstantInt::get(I32Type, 0);
  Value *MoreWork = Builder.CreateCmp(CmpInst::ICMP_NE, Res, Zero32);
  // Shift the lower bound produced by "next" back by one to get the starting
  // value of the induction variable for this chunk.
  Value *LowerBound =
      Builder.CreateSub(Builder.CreateLoad(IVTy, PLowerBound), One, "lb");
  Builder.CreateCondBr(MoreWork, Header, Exit);

  // Change PHI-node in loop header to use outer cond rather than preheader,
  // and set IV to the LowerBound.
  auto *PI = cast<PHINode>(&Header->front());
  PI->setIncomingBlock(0, OuterCond);
  PI->setIncomingValue(0, LowerBound);

  // Then set the pre-header to jump to the OuterCond.
  auto *Br = cast<BranchInst>(PreHeader->getTerminator());
  Br->setSuccessor(0, OuterCond);

  // Modify the inner condition:
  // * Use the UpperBound returned from the DynamicNext call.
  // * Jump to the outer loop when done with one of the inner loops.
  Builder.SetInsertPoint(Cond, Cond->getFirstInsertionPt());
  UpperBound = Builder.CreateLoad(IVTy, PUpperBound, "ub");
  // The load was inserted in front of the instruction at the insertion point,
  // which is expected to be the loop's compare.
  auto *CI = cast<CmpInst>(&*Builder.GetInsertPoint());
  CI->setOperand(1, UpperBound);
  // Redirect the inner exit to branch to outer condition.
  auto *BI = cast<BranchInst>(&Cond->back());
  assert(BI->getSuccessor(1) == Exit &&
         "Expected the loop condition's false edge to target the exit block");
  BI->setSuccessor(1, OuterCond);

  // Add the barrier if requested.
  if (NeedsBarrier) {
    Builder.SetInsertPoint(&Exit->back());
    createBarrier(LocationDescription(Builder.saveIP(), Loc.DL),
                  omp::Directive::OMPD_for, /* ForceSimpleCall */ false,
                  /* CheckCancelFlag */ false);
  }

  return AfterIP;
}
|
||||
|
||||
/// Make \p Source branch to \p Target.
|
||||
///
|
||||
/// Handles two situations:
|
||||
|
@ -1901,7 +2041,7 @@ CallInst *OpenMPIRBuilder::createCachedThreadPrivate(
|
|||
llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
|
||||
|
||||
Function *Fn =
|
||||
getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
|
||||
getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
|
||||
|
||||
return Builder.CreateCall(Fn, Args);
|
||||
}
|
||||
|
|
|
@ -1708,6 +1708,105 @@ TEST_F(OpenMPIRBuilderTest, StaticWorkShareLoop) {
|
|||
EXPECT_EQ(NumCallsInExitBlock, 3u);
|
||||
}
|
||||
|
||||
// Checks the IR produced by OpenMPIRBuilder::createDynamicWorkshareLoop for an
// unsigned i32 loop over [10, 52) with step 2 and a chunk size of 7.
TEST_F(OpenMPIRBuilderTest, DynamicWorkShareLoop) {
  using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
  OpenMPIRBuilder OMPBuilder(*M);
  OMPBuilder.initialize();
  IRBuilder<> Builder(BB);
  OpenMPIRBuilder::LocationDescription Loc({Builder.saveIP(), DL});

  // Loop configuration; all values share the 32-bit loop counter type.
  Type *I32Ty = Type::getInt32Ty(Ctx);
  Value *Start = ConstantInt::get(I32Ty, 10);
  Value *Stop = ConstantInt::get(I32Ty, 52);
  Value *Step = ConstantInt::get(I32Ty, 2);
  Value *ChunkSize = ConstantInt::get(I32Ty, 7);
  auto EmptyBody = [&](InsertPointTy, llvm::Value *) {};

  CanonicalLoopInfo *CLI = OMPBuilder.createCanonicalLoop(
      Loc, EmptyBody, Start, Stop, Step,
      /*IsSigned=*/false, /*InclusiveStop=*/false);

  Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
  InsertPointTy AllocaIP = Builder.saveIP();

  // The CLI isn't usable after the call to createDynamicWorkshareLoop, so
  // everything the checks below need is read from it beforehand.
  InsertPointTy ExpectedAfterIP = CLI->getAfterIP();
  BasicBlock *PreheaderBB = CLI->getPreheader();
  BasicBlock *ExitBB = CLI->getExit();
  Value *IndVar = CLI->getIndVar();

  InsertPointTy ResultIP = OMPBuilder.createDynamicWorkshareLoop(
      Loc, CLI, AllocaIP, /*NeedsBarrier=*/true, ChunkSize);
  // The returned value should be the "after" point.
  ASSERT_EQ(ResultIP.getBlock(), ExpectedAfterIP.getBlock());
  ASSERT_EQ(ResultIP.getPoint(), ExpectedAfterIP.getPoint());

  // The entry block has to start with the four allocas.
  auto EntryIt = BB->begin();
  ASSERT_GE(std::distance(BB->begin(), BB->end()), 4);
  auto *LastIterSlot = dyn_cast<AllocaInst>(&*(EntryIt++));
  auto *LowerBoundSlot = dyn_cast<AllocaInst>(&*(EntryIt++));
  auto *UpperBoundSlot = dyn_cast<AllocaInst>(&*(EntryIt++));
  auto *StrideSlot = dyn_cast<AllocaInst>(&*(EntryIt++));
  EXPECT_NE(LastIterSlot, nullptr);
  EXPECT_NE(LowerBoundSlot, nullptr);
  EXPECT_NE(UpperBoundSlot, nullptr);
  EXPECT_NE(StrideSlot, nullptr);

  // The preheader has to hold three stores, the thread-id call and the call
  // to the dispatch "init" function, in that order.
  auto PreheaderIt = PreheaderBB->begin();
  ASSERT_GE(std::distance(PreheaderBB->begin(), PreheaderBB->end()), 6);
  auto *StoreLB = dyn_cast<StoreInst>(&*(PreheaderIt++));
  auto *StoreUB = dyn_cast<StoreInst>(&*(PreheaderIt++));
  auto *StoreStride = dyn_cast<StoreInst>(&*(PreheaderIt++));
  ASSERT_NE(StoreLB, nullptr);
  ASSERT_NE(StoreUB, nullptr);
  ASSERT_NE(StoreStride, nullptr);

  auto *GetThreadId = dyn_cast<CallInst>(&*(PreheaderIt++));
  ASSERT_NE(GetThreadId, nullptr);
  EXPECT_EQ(GetThreadId->getCalledFunction()->getName(),
            "__kmpc_global_thread_num");

  auto *DispatchInit = dyn_cast<CallInst>(&*PreheaderIt);
  ASSERT_NE(DispatchInit, nullptr);
  EXPECT_EQ(DispatchInit->getCalledFunction()->getName(),
            "__kmpc_dispatch_init_4u");
  EXPECT_EQ(DispatchInit->getNumArgOperands(), 7U);
  // The last argument is the chunk size.
  EXPECT_EQ(DispatchInit->getArgOperand(6), ConstantInt::get(I32Ty, 7));

  // The stored bounds are constants: lower bound 1, upper bound equal to the
  // trip count ((52 - 10) / 2 = 21), stride 1.
  auto *StoredLB = dyn_cast<ConstantInt>(StoreLB->getValueOperand());
  auto *StoredUB = dyn_cast<ConstantInt>(StoreUB->getValueOperand());
  auto *StoredStride = dyn_cast<ConstantInt>(StoreStride->getValueOperand());
  ASSERT_NE(StoredLB, nullptr);
  ASSERT_NE(StoredUB, nullptr);
  ASSERT_NE(StoredStride, nullptr);
  EXPECT_EQ(StoredLB->getValue(), 1);
  EXPECT_EQ(StoredUB->getValue(), 21);
  EXPECT_EQ(StoredStride->getValue(), 1);

  // The original loop iterator should only be used in the condition, in the
  // increment and in the statement that adds the lower bound to it.
  EXPECT_EQ(std::distance(IndVar->use_begin(), IndVar->use_end()), 3);

  // The exit block should contain the barrier call, plus the call to obtain
  // the thread ID.
  size_t ExitCallCount = 0;
  for (Instruction &I : *ExitBB)
    if (isa<CallInst>(I))
      ++ExitCallCount;
  EXPECT_EQ(ExitCallCount, 2u);

  // Terminate the block and check that the module is internally consistent.
  Builder.restoreIP(ResultIP);
  Builder.CreateRetVoid();
  OMPBuilder.finalize();
  EXPECT_FALSE(verifyModule(*M, &errs()));
}
|
||||
|
||||
TEST_F(OpenMPIRBuilderTest, MasterDirective) {
|
||||
using InsertPointTy = OpenMPIRBuilder::InsertPointTy;
|
||||
OpenMPIRBuilder OMPBuilder(*M);
|
||||
|
|
|
@ -179,11 +179,17 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
|
|||
if (loop.getNumLoops() != 1)
|
||||
return opInst.emitOpError("collapsed loops not yet supported");
|
||||
|
||||
if (loop.schedule_val().hasValue() &&
|
||||
omp::symbolizeClauseScheduleKind(loop.schedule_val().getValue()) !=
|
||||
omp::ClauseScheduleKind::Static)
|
||||
return opInst.emitOpError(
|
||||
"only static (default) loop schedule is currently supported");
|
||||
bool isStatic = true;
|
||||
|
||||
if (loop.schedule_val().hasValue()) {
|
||||
auto schedule =
|
||||
omp::symbolizeClauseScheduleKind(loop.schedule_val().getValue());
|
||||
if (schedule != omp::ClauseScheduleKind::Static &&
|
||||
schedule != omp::ClauseScheduleKind::Dynamic)
|
||||
return opInst.emitOpError("only static (default) and dynamic loop "
|
||||
"schedule is currently supported");
|
||||
isStatic = (schedule == omp::ClauseScheduleKind::Static);
|
||||
}
|
||||
|
||||
// Find the loop configuration.
|
||||
llvm::Value *lowerBound = moduleTranslation.lookupValue(loop.lowerBound()[0]);
|
||||
|
@ -241,11 +247,19 @@ convertOmpWsLoop(Operation &opInst, llvm::IRBuilderBase &builder,
|
|||
// Put them at the start of the current block for now.
|
||||
llvm::OpenMPIRBuilder::InsertPointTy allocaIP(
|
||||
insertBlock, insertBlock->getFirstInsertionPt());
|
||||
loopInfo = moduleTranslation.getOpenMPBuilder()->createStaticWorkshareLoop(
|
||||
ompLoc, loopInfo, allocaIP, !loop.nowait(), chunk);
|
||||
llvm::OpenMPIRBuilder::InsertPointTy afterIP;
|
||||
llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder();
|
||||
if (isStatic) {
|
||||
loopInfo = ompBuilder->createStaticWorkshareLoop(ompLoc, loopInfo, allocaIP,
|
||||
!loop.nowait(), chunk);
|
||||
afterIP = loopInfo->getAfterIP();
|
||||
} else {
|
||||
afterIP = ompBuilder->createDynamicWorkshareLoop(ompLoc, loopInfo, allocaIP,
|
||||
!loop.nowait(), chunk);
|
||||
}
|
||||
|
||||
// Continue building IR after the loop.
|
||||
builder.restoreIP(loopInfo->getAfterIP());
|
||||
builder.restoreIP(afterIP);
|
||||
return success();
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue