llvm-project/polly/lib/CodeGen/LoopGeneratorsKMP.cpp

558 lines
21 KiB
C++

//===------ LoopGeneratorsKMP.cpp - IR helper to create loops -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains functions to create parallel loops as LLVM-IR.
//
//===----------------------------------------------------------------------===//
#include "polly/CodeGen/LoopGeneratorsKMP.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Module.h"
using namespace llvm;
using namespace polly;
// Emit a call to the runtime function "__kmpc_fork_call", declaring the
// function in the module first if it does not exist yet.
//
// The call receives, in this order: the source location dummy, the constant
// 4 (the number of arguments that follow the task), \p SubFn cast to a
// pointer to the "kmpc_micro" type, and then \p LB, \p UB, \p Stride and
// \p SubFnParam.
void ParallelLoopGeneratorKMP::createCallSpawnThreads(Value *SubFn,
                                                      Value *SubFnParam,
                                                      Value *LB, Value *UB,
                                                      Value *Stride) {
  LLVMContext &Ctx = M->getContext();

  // Reuse a type named "kmpc_micro" if the context knows one; otherwise fall
  // back to the function type
  //   void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...)
  Type *MicroTaskTy = StructType::getTypeByName(Ctx, "kmpc_micro");
  if (MicroTaskTy == nullptr) {
    Type *MicroTaskParamTys[] = {Builder.getInt32Ty()->getPointerTo(),
                                 Builder.getInt32Ty()->getPointerTo()};
    MicroTaskTy = FunctionType::get(Builder.getVoidTy(), MicroTaskParamTys,
                                    /* isVarArg */ true);
  }

  // Look the runtime function up and declare it (variadic, external linkage)
  // when the module has no such function.
  const std::string RuntimeFnName = "__kmpc_fork_call";
  Function *ForkCall = M->getFunction(RuntimeFnName);
  if (ForkCall == nullptr) {
    StructType *LocTy = StructType::getTypeByName(Ctx, "struct.ident_t");
    Type *ForkParamTys[] = {LocTy->getPointerTo(), Builder.getInt32Ty(),
                            MicroTaskTy->getPointerTo()};
    FunctionType *ForkTy = FunctionType::get(Builder.getVoidTy(), ForkParamTys,
                                             /* isVarArg */ true);
    ForkCall = Function::Create(ForkTy, Function::ExternalLinkage,
                                RuntimeFnName, M);
  }

  // The sub-function is handed over as a pointer to the micro task type.
  Value *MicroTask = Builder.CreatePointerBitCastOrAddrSpaceCast(
      SubFn, MicroTaskTy->getPointerTo());

  Value *CallArgs[] = {SourceLocationInfo,
                       Builder.getInt32(4) /* Number of arguments (w/o Task) */,
                       MicroTask,
                       LB,
                       UB,
                       Stride,
                       SubFnParam};
  Builder.CreateCall(ForkCall, CallArgs);
}
// Emit the calls that start the parallel execution of \p SubFn.
//
// When PollyNumThreads is greater than zero, the global thread number is
// queried and the requested thread count is pushed to the runtime first.
// Afterwards the threads are spawned via createCallSpawnThreads(), which
// receives all arguments of this function unchanged.
void ParallelLoopGeneratorKMP::deployParallelExecution(Function *SubFn,
                                                       Value *SubFnParam,
                                                       Value *LB, Value *UB,
                                                       Value *Stride) {
  const bool HasExplicitThreadCount = PollyNumThreads > 0;

  // Only inform the OpenMP runtime about the thread count if one was given.
  if (HasExplicitThreadCount) {
    Value *ThreadID = createCallGlobalThreadNum();
    Value *ThreadCount = Builder.getInt32(PollyNumThreads);
    createCallPushNumThreads(ThreadID, ThreadCount);
  }

  // Hand the parallel loop over to the runtime.
  createCallSpawnThreads(SubFn, SubFnParam, LB, UB, Stride);
}
// Create the (still empty) sub-function for \p F and name its arguments.
//
// The new function has internal linkage, returns void, is named after \p F
// with the suffix "_polly_subfn" and takes six parameters:
//   (i32 *global_tid, i32 *bound_tid, long lb, long ub, long inc, i8 *shared)
// where "long" stands for LongType.
Function *ParallelLoopGeneratorKMP::prepareSubFnDefinition(Function *F) const {
  Type *ParamTys[] = {Builder.getInt32Ty()->getPointerTo(),
                      Builder.getInt32Ty()->getPointerTo(),
                      LongType,
                      LongType,
                      LongType,
                      Builder.getInt8PtrTy()};
  FunctionType *SubFnTy =
      FunctionType::get(Builder.getVoidTy(), ParamTys, /* isVarArg */ false);

  Function *SubFn = Function::Create(SubFnTy, Function::InternalLinkage,
                                     F->getName() + "_polly_subfn", M);

  // Argument names, listed in parameter order.
  const char *const ParamNames[] = {
      "polly.kmpc.global_tid", "polly.kmpc.bound_tid", "polly.kmpc.lb",
      "polly.kmpc.ub",         "polly.kmpc.inc",       "polly.kmpc.shared"};

  Function::arg_iterator ArgIt = SubFn->arg_begin();
  for (const char *ParamName : ParamNames) {
    ArgIt->setName(ParamName);
    ++ArgIt;
  }

  return SubFn;
}
// Create a subfunction of the following (preliminary) structure:
//
//        PrevBB
//           |
//           v
//        HeaderBB
//       /   |    _____
//      /    v   v     |
//     / PreHeaderBB   |
//    |      |         |
//    |      v         |
//    |  CheckNextBB   |
//     \     |   \_____/
//      \    |
//       v   v
//       ExitBB
//
// HeaderBB will hold allocations, loading of variables and kmp-init calls.
// CheckNextBB will check for more work (dynamic / static chunked) or will be
// empty (static non chunked).
// If there is more work to do: go to PreHeaderBB, otherwise go to ExitBB.
// PreHeaderBB loads the new boundaries (& will lead to the loop body later on).
// Just like CheckNextBB: PreHeaderBB is (preliminary) empty in the static non
// chunked scheduling case. ExitBB marks the end of the parallel execution.
// The possibly empty BasicBlocks will automatically be removed.
// Create the sub-function that is executed by the OpenMP runtime threads and
// generate the (still empty) loop inside of it.
//
// @param SequentialLoopStride  Stride that is passed to createLoop() for the
//                              loop generated inside the sub-function.
// @param StructData  Alloca of the struct holding the shared values; its type
//                    is used to cast the "shared" argument of the
//                    sub-function.
// @param Data        Values that are extracted from that struct.
// @param Map         Forwarded to extractValuesFromStruct() together with
//                    \p Data.
//
// @return The induction variable returned by createLoop() and the newly
//         created sub-function. On return, the builder's insert point is the
//         position it had right after createLoop().
std::tuple<Value *, Function *>
ParallelLoopGeneratorKMP::createSubFn(Value *SequentialLoopStride,
AllocaInst *StructData,
SetVector<Value *> Data, ValueMapT &Map) {
Function *SubFn = createSubFnDefinition();
LLVMContext &Context = SubFn->getContext();
// Store the previous basic block.
BasicBlock *PrevBB = Builder.GetInsertBlock();
// Create basic blocks.
BasicBlock *HeaderBB = BasicBlock::Create(Context, "polly.par.setup", SubFn);
BasicBlock *ExitBB = BasicBlock::Create(Context, "polly.par.exit", SubFn);
BasicBlock *CheckNextBB =
BasicBlock::Create(Context, "polly.par.checkNext", SubFn);
BasicBlock *PreHeaderBB =
BasicBlock::Create(Context, "polly.par.loadIVBounds", SubFn);
// Register the new blocks in the dominator tree: HeaderBB below PrevBB, the
// three remaining blocks directly below HeaderBB.
DT.addNewBlock(HeaderBB, PrevBB);
DT.addNewBlock(ExitBB, HeaderBB);
DT.addNewBlock(CheckNextBB, HeaderBB);
DT.addNewBlock(PreHeaderBB, HeaderBB);
// Fill up basic block HeaderBB.
Builder.SetInsertPoint(HeaderBB);
// Stack slots whose addresses are handed to the kmp init / dispatch calls.
Value *LBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.LBPtr");
Value *UBPtr = Builder.CreateAlloca(LongType, nullptr, "polly.par.UBPtr");
Value *IsLastPtr = Builder.CreateAlloca(Builder.getInt32Ty(), nullptr,
"polly.par.lastIterPtr");
Value *StridePtr =
Builder.CreateAlloca(LongType, nullptr, "polly.par.StridePtr");
// Get iterator for retrieving the previously defined parameters.
Function::arg_iterator AI = SubFn->arg_begin();
// First argument holds "global thread ID".
Value *IDPtr = &*AI;
// Skip "bound thread ID" since it is not used (but had to be defined).
std::advance(AI, 2);
// Move iterator to: LB, UB, Stride, Shared variable struct.
Value *LB = &*AI;
std::advance(AI, 1);
Value *UB = &*AI;
std::advance(AI, 1);
Value *Stride = &*AI;
std::advance(AI, 1);
Value *Shared = &*AI;
// Cast the "shared" argument back to the type of the struct alloca and
// extract the values stored in it.
Value *UserContext = Builder.CreateBitCast(Shared, StructData->getType(),
"polly.par.userContext");
extractValuesFromStruct(Data, StructData->getAllocatedType(), UserContext,
Map);
// NOTE(review): this alignment (8 if LongType is 64 bit wide, 4 otherwise)
// is also used for the i32 load of the thread ID and for the i32 store to
// IsLastPtr below -- confirm that 8-byte alignment is guaranteed for those.
const auto Alignment = llvm::Align(is64BitArch() ? 8 : 4);
Value *ID = Builder.CreateAlignedLoad(Builder.getInt32Ty(), IDPtr, Alignment,
"polly.par.global_tid");
Builder.CreateAlignedStore(LB, LBPtr, Alignment);
Builder.CreateAlignedStore(UB, UBPtr, Alignment);
Builder.CreateAlignedStore(Builder.getInt32(0), IsLastPtr, Alignment);
Builder.CreateAlignedStore(Stride, StridePtr, Alignment);
// Subtract one as the upper bound provided by openmp is a < comparison
// whereas the codegenForSequential function creates a <= comparison.
Value *AdjustedUB = Builder.CreateAdd(UB, ConstantInt::get(LongType, -1),
"polly.indvar.UBAdjusted");
// The chunk size handed to the runtime is at least 1, even if PollyChunkSize
// is smaller.
Value *ChunkSize =
ConstantInt::get(LongType, std::max<int>(PollyChunkSize, 1));
OMPGeneralSchedulingType Scheduling =
getSchedType(PollyChunkSize, PollyScheduling);
switch (Scheduling) {
case OMPGeneralSchedulingType::Dynamic:
case OMPGeneralSchedulingType::Guided:
case OMPGeneralSchedulingType::Runtime:
// "DYNAMIC" scheduling types are handled below (including 'runtime')
{
UB = AdjustedUB;
createCallDispatchInit(ID, LB, UB, Stride, ChunkSize);
// HeaderBB: ask for the first chunk; leave right away if there is none.
Value *HasWork =
createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr);
Value *HasIteration =
Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork,
Builder.getInt32(1), "polly.hasIteration");
Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB);
// CheckNextBB: ask for a further chunk after the current one was processed.
Builder.SetInsertPoint(CheckNextBB);
HasWork = createCallDispatchNext(ID, IsLastPtr, LBPtr, UBPtr, StridePtr);
HasIteration =
Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_EQ, HasWork,
Builder.getInt32(1), "polly.hasWork");
Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB);
// PreHeaderBB: load the bounds of the chunk that was handed out.
Builder.SetInsertPoint(PreHeaderBB);
LB = Builder.CreateAlignedLoad(LongType, LBPtr, Alignment,
"polly.indvar.LB");
UB = Builder.CreateAlignedLoad(LongType, UBPtr, Alignment,
"polly.indvar.UB");
}
break;
case OMPGeneralSchedulingType::StaticChunked:
case OMPGeneralSchedulingType::StaticNonChunked:
// "STATIC" scheduling types are handled below
{
Builder.CreateAlignedStore(AdjustedUB, UBPtr, Alignment);
createCallStaticInit(ID, IsLastPtr, LBPtr, UBPtr, StridePtr, ChunkSize);
Value *ChunkedStride = Builder.CreateAlignedLoad(
LongType, StridePtr, Alignment, "polly.kmpc.stride");
LB = Builder.CreateAlignedLoad(LongType, LBPtr, Alignment,
"polly.indvar.LB");
UB = Builder.CreateAlignedLoad(LongType, UBPtr, Alignment,
"polly.indvar.UB.temp");
// Clamp the upper bound written by the init call to AdjustedUB.
Value *UBInRange =
Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SLE, UB, AdjustedUB,
"polly.indvar.UB.inRange");
UB = Builder.CreateSelect(UBInRange, UB, AdjustedUB, "polly.indvar.UB");
Builder.CreateAlignedStore(UB, UBPtr, Alignment);
Value *HasIteration = Builder.CreateICmp(
llvm::CmpInst::Predicate::ICMP_SLE, LB, UB, "polly.hasIteration");
Builder.CreateCondBr(HasIteration, PreHeaderBB, ExitBB);
// In the chunked case the bounds change per chunk, hence they are reloaded
// in PreHeaderBB each time it is entered.
if (Scheduling == OMPGeneralSchedulingType::StaticChunked) {
Builder.SetInsertPoint(PreHeaderBB);
LB = Builder.CreateAlignedLoad(LongType, LBPtr, Alignment,
"polly.indvar.LB.entry");
UB = Builder.CreateAlignedLoad(LongType, UBPtr, Alignment,
"polly.indvar.UB.entry");
}
Builder.SetInsertPoint(CheckNextBB);
if (Scheduling == OMPGeneralSchedulingType::StaticChunked) {
// Advance both bounds by the stride loaded after the init call, clamp the
// new upper bound to AdjustedUB and store both for the next round.
Value *NextLB =
Builder.CreateAdd(LB, ChunkedStride, "polly.indvar.nextLB");
Value *NextUB = Builder.CreateAdd(UB, ChunkedStride);
Value *NextUBOutOfBounds =
Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SGT, NextUB,
AdjustedUB, "polly.indvar.nextUB.outOfBounds");
NextUB = Builder.CreateSelect(NextUBOutOfBounds, AdjustedUB, NextUB,
"polly.indvar.nextUB");
Builder.CreateAlignedStore(NextLB, LBPtr, Alignment);
Builder.CreateAlignedStore(NextUB, UBPtr, Alignment);
Value *HasWork =
Builder.CreateICmp(llvm::CmpInst::Predicate::ICMP_SLE, NextLB,
AdjustedUB, "polly.hasWork");
Builder.CreateCondBr(HasWork, PreHeaderBB, ExitBB);
} else {
// Non-chunked: there is no further work after the single range.
Builder.CreateBr(ExitBB);
}
Builder.SetInsertPoint(PreHeaderBB);
}
break;
}
// Terminate PreHeaderBB with a branch to CheckNextBB and move the insert
// point in front of that branch, so that the loop is created before it.
Builder.CreateBr(CheckNextBB);
Builder.SetInsertPoint(&*--Builder.GetInsertPoint());
BasicBlock *AfterBB;
Value *IV = createLoop(LB, UB, SequentialLoopStride, Builder, LI, DT, AfterBB,
ICmpInst::ICMP_SLE, nullptr, true,
/* UseGuard */ false);
// Remember the position after createLoop(); it is restored before returning.
BasicBlock::iterator LoopBody = Builder.GetInsertPoint();
// Add code to terminate this subfunction.
Builder.SetInsertPoint(ExitBB);
// Static (i.e. non-dynamic) scheduling types, are terminated with a fini-call
if (Scheduling == OMPGeneralSchedulingType::StaticChunked ||
Scheduling == OMPGeneralSchedulingType::StaticNonChunked) {
createCallStaticFini(ID);
}
Builder.CreateRetVoid();
Builder.SetInsertPoint(&*LoopBody);
return std::make_tuple(IV, SubFn);
}
// Emit a call to the runtime function "__kmpc_global_thread_num", declaring
// it in the module first if it does not exist yet.
//
// @return The i32 result of the call; the only argument passed is the source
//         location dummy.
Value *ParallelLoopGeneratorKMP::createCallGlobalThreadNum() {
  const std::string RuntimeFnName = "__kmpc_global_thread_num";
  Function *Callee = M->getFunction(RuntimeFnName);

  // Declare "i32 __kmpc_global_thread_num(ident_t *)" on first use.
  if (Callee == nullptr) {
    StructType *LocTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *ParamTys[] = {LocTy->getPointerTo()};
    FunctionType *CalleeTy =
        FunctionType::get(Builder.getInt32Ty(), ParamTys, /* isVarArg */ false);
    Callee = Function::Create(CalleeTy, Function::ExternalLinkage,
                              RuntimeFnName, M);
  }

  Value *CallArgs[] = {SourceLocationInfo};
  return Builder.CreateCall(Callee, CallArgs);
}
// Emit a call to the runtime function "__kmpc_push_num_threads", declaring
// it in the module first if it does not exist yet.
//
// @param GlobalThreadID  Passed as the second call argument.
// @param NumThreads      Passed as the third call argument.
//
// The first call argument is the source location dummy.
void ParallelLoopGeneratorKMP::createCallPushNumThreads(Value *GlobalThreadID,
                                                        Value *NumThreads) {
  const std::string RuntimeFnName = "__kmpc_push_num_threads";
  Function *Callee = M->getFunction(RuntimeFnName);

  // Declare "void __kmpc_push_num_threads(ident_t *, i32, i32)" on first use.
  if (Callee == nullptr) {
    StructType *LocTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *Int32Ty = Builder.getInt32Ty();
    Type *ParamTys[] = {LocTy->getPointerTo(), Int32Ty, Int32Ty};
    FunctionType *CalleeTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, /* isVarArg */ false);
    Callee = Function::Create(CalleeTy, Function::ExternalLinkage,
                              RuntimeFnName, M);
  }

  Value *CallArgs[] = {SourceLocationInfo, GlobalThreadID, NumThreads};
  Builder.CreateCall(Callee, CallArgs);
}
// Emit a call to "__kmpc_for_static_init_8" (if LongType is 64 bit wide) or
// "__kmpc_for_static_init_4" (otherwise), declaring the function in the
// module first if it does not exist yet.
//
// The call receives, in this order: the source location dummy,
// \p GlobalThreadID, the scheduling type (as i32) computed by getSchedType()
// from PollyChunkSize and PollyScheduling, \p IsLastPtr, \p LBPtr, \p UBPtr,
// \p StridePtr, the LongType constant 1 and \p ChunkSize.
void ParallelLoopGeneratorKMP::createCallStaticInit(Value *GlobalThreadID,
                                                    Value *IsLastPtr,
                                                    Value *LBPtr, Value *UBPtr,
                                                    Value *StridePtr,
                                                    Value *ChunkSize) {
  std::string RuntimeFnName = "__kmpc_for_static_init_4";
  if (is64BitArch())
    RuntimeFnName = "__kmpc_for_static_init_8";

  Function *Callee = M->getFunction(RuntimeFnName);

  // Declare the runtime function on first use.
  if (Callee == nullptr) {
    StructType *LocTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *Int32Ty = Builder.getInt32Ty();
    Type *ParamTys[] = {LocTy->getPointerTo(),
                        Int32Ty,
                        Int32Ty,
                        Int32Ty->getPointerTo(),
                        LongType->getPointerTo(),
                        LongType->getPointerTo(),
                        LongType->getPointerTo(),
                        LongType,
                        LongType};
    FunctionType *CalleeTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, /* isVarArg */ false);
    Callee = Function::Create(CalleeTy, Function::ExternalLinkage,
                              RuntimeFnName, M);
  }

  const OMPGeneralSchedulingType SchedType =
      getSchedType(PollyChunkSize, PollyScheduling);

  // The parameter 'ChunkSize' will hold strictly positive integer values,
  // regardless of PollyChunkSize's value
  Value *CallArgs[] = {SourceLocationInfo,
                       GlobalThreadID,
                       Builder.getInt32(static_cast<int>(SchedType)),
                       IsLastPtr,
                       LBPtr,
                       UBPtr,
                       StridePtr,
                       ConstantInt::get(LongType, 1),
                       ChunkSize};
  Builder.CreateCall(Callee, CallArgs);
}
// Emit a call to the runtime function "__kmpc_for_static_fini", declaring it
// in the module first if it does not exist yet.
//
// @param GlobalThreadID  Passed as the second call argument, after the source
//                        location dummy.
void ParallelLoopGeneratorKMP::createCallStaticFini(Value *GlobalThreadID) {
  const std::string RuntimeFnName = "__kmpc_for_static_fini";
  Function *Callee = M->getFunction(RuntimeFnName);

  // Declare "void __kmpc_for_static_fini(ident_t *, i32)" on first use.
  if (Callee == nullptr) {
    StructType *LocTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *ParamTys[] = {LocTy->getPointerTo(), Builder.getInt32Ty()};
    FunctionType *CalleeTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, /* isVarArg */ false);
    Callee = Function::Create(CalleeTy, Function::ExternalLinkage,
                              RuntimeFnName, M);
  }

  Value *CallArgs[] = {SourceLocationInfo, GlobalThreadID};
  Builder.CreateCall(Callee, CallArgs);
}
// Emit a call to "__kmpc_dispatch_init_8" (if LongType is 64 bit wide) or
// "__kmpc_dispatch_init_4" (otherwise), declaring the function in the module
// first if it does not exist yet.
//
// The call receives, in this order: the source location dummy,
// \p GlobalThreadID, the scheduling type (as i32) computed by getSchedType()
// from PollyChunkSize and PollyScheduling, \p LB, \p UB, \p Inc and
// \p ChunkSize.
void ParallelLoopGeneratorKMP::createCallDispatchInit(Value *GlobalThreadID,
                                                      Value *LB, Value *UB,
                                                      Value *Inc,
                                                      Value *ChunkSize) {
  std::string RuntimeFnName = "__kmpc_dispatch_init_4";
  if (is64BitArch())
    RuntimeFnName = "__kmpc_dispatch_init_8";

  Function *Callee = M->getFunction(RuntimeFnName);

  // Declare the runtime function on first use.
  if (Callee == nullptr) {
    StructType *LocTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *Int32Ty = Builder.getInt32Ty();
    Type *ParamTys[] = {LocTy->getPointerTo(),
                        Int32Ty,
                        Int32Ty,
                        LongType,
                        LongType,
                        LongType,
                        LongType};
    FunctionType *CalleeTy =
        FunctionType::get(Builder.getVoidTy(), ParamTys, /* isVarArg */ false);
    Callee = Function::Create(CalleeTy, Function::ExternalLinkage,
                              RuntimeFnName, M);
  }

  const OMPGeneralSchedulingType SchedType =
      getSchedType(PollyChunkSize, PollyScheduling);

  // The parameter 'ChunkSize' will hold strictly positive integer values,
  // regardless of PollyChunkSize's value
  Value *CallArgs[] = {SourceLocationInfo,
                       GlobalThreadID,
                       Builder.getInt32(static_cast<int>(SchedType)),
                       LB,
                       UB,
                       Inc,
                       ChunkSize};
  Builder.CreateCall(Callee, CallArgs);
}
// Emit a call to "__kmpc_dispatch_next_8" (if LongType is 64 bit wide) or
// "__kmpc_dispatch_next_4" (otherwise), declaring the function in the module
// first if it does not exist yet.
//
// The call receives, in this order: the source location dummy,
// \p GlobalThreadID, \p IsLastPtr, \p LBPtr, \p UBPtr and \p StridePtr.
//
// @return The i32 result of the call.
Value *ParallelLoopGeneratorKMP::createCallDispatchNext(Value *GlobalThreadID,
                                                        Value *IsLastPtr,
                                                        Value *LBPtr,
                                                        Value *UBPtr,
                                                        Value *StridePtr) {
  std::string RuntimeFnName = "__kmpc_dispatch_next_4";
  if (is64BitArch())
    RuntimeFnName = "__kmpc_dispatch_next_8";

  Function *Callee = M->getFunction(RuntimeFnName);

  // Declare the runtime function on first use.
  if (Callee == nullptr) {
    StructType *LocTy =
        StructType::getTypeByName(M->getContext(), "struct.ident_t");
    Type *Int32Ty = Builder.getInt32Ty();
    Type *ParamTys[] = {LocTy->getPointerTo(),
                        Int32Ty,
                        Int32Ty->getPointerTo(),
                        LongType->getPointerTo(),
                        LongType->getPointerTo(),
                        LongType->getPointerTo()};
    FunctionType *CalleeTy =
        FunctionType::get(Int32Ty, ParamTys, /* isVarArg */ false);
    Callee = Function::Create(CalleeTy, Function::ExternalLinkage,
                              RuntimeFnName, M);
  }

  Value *CallArgs[] = {SourceLocationInfo, GlobalThreadID, IsLastPtr,
                       LBPtr,              UBPtr,          StridePtr};
  return Builder.CreateCall(Callee, CallArgs);
}
// Return the global ".loc.dummy" of type "struct.ident_t" that serves as the
// source location argument of the runtime calls. On first use the global is
// created together with the "struct.ident_t" type (if the context does not
// know it yet) and the string global ".str.ident" it points to.
//
// TODO: This function currently creates a source location dummy. It might be
// necessary to (actually) provide information, in the future.
GlobalVariable *ParallelLoopGeneratorKMP::createSourceLocation() {
  const std::string LocName = ".loc.dummy";

  // The dummy is created with private, i.e. local, linkage (see below).
  // Module::getGlobalVariable() skips globals with local linkage unless asked
  // otherwise, so local symbols have to be allowed explicitly. Without this
  // the existing dummy is never found and every call adds another
  // ".loc.dummy" / ".str.ident" pair to the module.
  if (GlobalVariable *Existing =
          M->getGlobalVariable(LocName, /* AllowInternal */ true))
    return Existing;

  const std::string StructName = "struct.ident_t";
  StructType *IdentTy = StructType::getTypeByName(M->getContext(), StructName);

  // If the ident_t StructType is not available, declare it.
  // in LLVM-IR: ident_t = type { i32, i32, i32, i32, i8* }
  if (!IdentTy) {
    Type *LocMembers[] = {Builder.getInt32Ty(), Builder.getInt32Ty(),
                          Builder.getInt32Ty(), Builder.getInt32Ty(),
                          Builder.getInt8PtrTy()};

    IdentTy =
        StructType::create(M->getContext(), LocMembers, StructName, false);
  }

  // Constant Definitions
  Constant *InitStr = ConstantDataArray::getString(
      M->getContext(), "Source location dummy.", true);

  // Take the array type from the initializer itself, so that the length
  // (text plus terminating null character) cannot get out of sync with the
  // string literal.
  Type *StrTy = InitStr->getType();

  // Global Variable Definitions
  GlobalVariable *StrVar =
      new GlobalVariable(*M, StrTy, true, GlobalValue::PrivateLinkage, nullptr,
                         ".str.ident");
  StrVar->setAlignment(llvm::Align(1));

  GlobalVariable *SourceLocDummy = new GlobalVariable(
      *M, IdentTy, true, GlobalValue::PrivateLinkage, nullptr, LocName);
  SourceLocDummy->setAlignment(llvm::Align(8));

  // Address of the first character of the string. All operands are
  // constants; cast<> verifies (in builds with assertions) that the builder
  // indeed returned a constant instead of blindly reinterpreting the value.
  Constant *StrPtr = cast<Constant>(Builder.CreateInBoundsGEP(
      StrTy, StrVar, {Builder.getInt32(0), Builder.getInt32(0)}));

  Constant *LocInitStruct = ConstantStruct::get(
      IdentTy, {Builder.getInt32(0), Builder.getInt32(0), Builder.getInt32(0),
                Builder.getInt32(0), StrPtr});

  // Initialize variables
  StrVar->setInitializer(InitStr);
  SourceLocDummy->setInitializer(LocInitStruct);

  return SourceLocDummy;
}
// Return true if LongType is an integer type that is exactly 64 bit wide.
bool ParallelLoopGeneratorKMP::is64BitArch() {
  const unsigned LongBitWidth = LongType->getIntegerBitWidth();
  return LongBitWidth == 64;
}
// Map the requested scheduling to the one that is actually used: static
// chunked scheduling with a chunk size of zero becomes static non-chunked
// scheduling; every other combination is returned unchanged.
OMPGeneralSchedulingType ParallelLoopGeneratorKMP::getSchedType(
    int ChunkSize, OMPGeneralSchedulingType Scheduling) const {
  const bool IsStaticWithoutChunks =
      Scheduling == OMPGeneralSchedulingType::StaticChunked && ChunkSize == 0;

  return IsStaticWithoutChunks ? OMPGeneralSchedulingType::StaticNonChunked
                               : Scheduling;
}