[OpenMP] Simplify GPU memory globalization

Summary:
Memory globalization is required to maintain OpenMP standard semantics for data sharing between
worker and master threads. The GPU cannot share data between its threads, so it must allocate global or
shared memory to store the data in. Currently this is implemented fully in the frontend using the
`__kmpc_data_sharing_push_stack` and `__kmpc_data_sharing_pop_stack` functions to emulate standard
CPU stack sharing. The frontend scans the target region for variables that escape the region and
must be shared between the threads. Each variable then has a field created for it in a global record
type.
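
As a rough illustration, here is a minimal sketch of the old scheme for a region with one escaped
`int`. It is not the literal generated IR; the declarations are assumptions that mirror the removed
`__kmpc_data_sharing_*` entry points, and `_globalized_locals_ty` is the record type the frontend
builds (it appears under that name in the test diffs below).

#include <cstddef>
#include <cstdint>

// Assumed declarations mirroring the removed device runtime entry points.
extern "C" void *__kmpc_data_sharing_push_stack(size_t Size, int16_t UseSharedMemory);
extern "C" void __kmpc_data_sharing_pop_stack(void *Ptr);

// One record per region, with one field per escaped variable.
struct _globalized_locals_ty { int a; };

void region_sketch() {
  // Prolog: push a single record covering every escaped variable.
  void *Rec = __kmpc_data_sharing_push_stack(sizeof(_globalized_locals_ty),
                                             /*UseSharedMemory=*/0);
  auto *Locals = static_cast<_globalized_locals_ty *>(Rec);
  Locals->a = 10; // master and workers share the variable through the record
  // Epilog: pop the whole record at once.
  __kmpc_data_sharing_pop_stack(Rec);
}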

This patch replaces this functionality with a single allocation call, effectively mimicking an
`alloca` instruction for the variables that must be shared between the threads. This will be much
slower than the current solution, but makes it much easier to optimize, as we can analyze each
variable independently and determine whether it is captured. In the future, we can replace these
calls with an `alloca`, and small allocations can be pushed to shared memory.
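
For comparison, a minimal sketch of the new per-variable scheme, assuming declarations that mirror
the `__kmpc_alloc_shared`/`__kmpc_free_shared` entry points (the updated CHECK lines in the tests
below show the corresponding IR, including the `_on_stack` cast):

#include <cstddef>

// Assumed declarations mirroring the new device runtime entry points.
extern "C" void *__kmpc_alloc_shared(size_t Size);
extern "C" void __kmpc_free_shared(void *Ptr);

void region_sketch() {
  // Prolog: one allocation per escaped variable, sized like an alloca.
  void *A = __kmpc_alloc_shared(sizeof(int));
  int *AOnStack = static_cast<int *>(A); // the "_on_stack" cast in the tests
  *AOnStack = 10; // master and workers share the variable through this pointer
  // Epilog: allocations are freed in reverse order.
  __kmpc_free_shared(A);
}

Because each variable now gets its own allocation call, an optimization pass can prove that a given
pointer is never captured and demote the call pair back to a plain `alloca`.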

Reviewed By: tianshilei1992

Differential Revision: https://reviews.llvm.org/D97680
Joseph Huber 2021-03-22 16:34:11 -04:00 committed by Huber, Joseph
parent 48e2d3a5c2
commit 68d133a3e8
28 changed files with 40465 additions and 42314 deletions


@@ -1102,17 +1102,6 @@ void CGOpenMPRuntimeGPU::emitNonSPMDKernel(const OMPExecutableDirective &D,
} Action(EST, WST);
CodeGen.setAction(Action);
IsInTTDRegion = true;
// Reserve place for the globalized memory.
GlobalizedRecords.emplace_back();
if (!KernelStaticGlobalized) {
KernelStaticGlobalized = new llvm::GlobalVariable(
CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
llvm::GlobalValue::InternalLinkage,
llvm::UndefValue::get(CGM.VoidPtrTy),
"_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
llvm::GlobalValue::NotThreadLocal,
CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
}
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
IsOffloadEntry, CodeGen);
IsInTTDRegion = false;
@@ -1164,10 +1153,6 @@ void CGOpenMPRuntimeGPU::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
CGM.getModule(), OMPRTL___kmpc_kernel_init),
Args);
// For data sharing, we need to initialize the stack.
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack));
emitGenericVarsProlog(CGF, WST.Loc);
}
@@ -1236,17 +1221,6 @@ void CGOpenMPRuntimeGPU::emitSPMDKernel(const OMPExecutableDirective &D,
} Action(*this, EST, D);
CodeGen.setAction(Action);
IsInTTDRegion = true;
// Reserve place for the globalized memory.
GlobalizedRecords.emplace_back();
if (!KernelStaticGlobalized) {
KernelStaticGlobalized = new llvm::GlobalVariable(
CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
llvm::GlobalValue::InternalLinkage,
llvm::UndefValue::get(CGM.VoidPtrTy),
"_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
llvm::GlobalValue::NotThreadLocal,
CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
}
emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
IsOffloadEntry, CodeGen);
IsInTTDRegion = false;
@@ -1268,12 +1242,6 @@ void CGOpenMPRuntimeGPU::emitSPMDEntryHeader(
CGM.getModule(), OMPRTL___kmpc_spmd_kernel_init),
Args);
if (RequiresFullRuntime) {
// For data sharing, we need to initialize the stack.
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_data_sharing_init_stack_spmd));
}
CGF.EmitBranch(ExecuteBB);
CGF.EmitBlock(ExecuteBB);
@@ -1679,16 +1647,13 @@ llvm::Function *CGOpenMPRuntimeGPU::emitTeamsOutlinedFunction(
static_cast<CGOpenMPRuntimeGPU &>(CGF.CGM.getOpenMPRuntime());
if (GlobalizedRD) {
auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
I->getSecond().GlobalRecord = GlobalizedRD;
I->getSecond().MappedParams =
std::make_unique<CodeGenFunction::OMPMapVars>();
DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
for (const auto &Pair : MappedDeclsFields) {
assert(Pair.getFirst()->isCanonicalDecl() &&
"Expected canonical declaration");
Data.insert(std::make_pair(Pair.getFirst(),
MappedVarData(Pair.getSecond(),
/*IsOnePerTeam=*/true)));
Data.insert(std::make_pair(Pair.getFirst(), MappedVarData()));
}
}
Rt.emitGenericVarsProlog(CGF, Loc);
@@ -1717,282 +1682,69 @@ void CGOpenMPRuntimeGPU::emitGenericVarsProlog(CodeGenFunction &CGF,
const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
if (I == FunctionGlobalizedDecls.end())
return;
if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
QualType SecGlobalRecTy;
// Recover pointer to this function's global record. The runtime will
// handle the specifics of the allocation of the memory.
// Use actual memory size of the record including the padding
// for alignment purposes.
unsigned Alignment =
CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
unsigned GlobalRecordSize =
CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
for (auto &Rec : I->getSecond().LocalVarData) {
const auto *VD = cast<VarDecl>(Rec.first);
bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
QualType VarTy = VD->getType();
llvm::PointerType *GlobalRecPtrTy =
CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
llvm::Value *GlobalRecCastAddr;
llvm::Value *IsTTD = nullptr;
if (!IsInTTDRegion &&
(WithSPMDCheck ||
getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
llvm::Value *ThreadID = getThreadID(CGF, Loc);
llvm::Value *PL = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(CGM.getModule(),
OMPRTL___kmpc_parallel_level),
{RTLoc, ThreadID});
IsTTD = Bld.CreateIsNull(PL);
}
llvm::Value *IsSPMD = Bld.CreateIsNotNull(
CGF.EmitNounwindRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_is_spmd_exec_mode)));
Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
// There is no need to emit line number for unconditional branch.
(void)ApplyDebugLocation::CreateEmpty(CGF);
CGF.EmitBlock(SPMDBB);
Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
CharUnits::fromQuantity(Alignment));
CGF.EmitBranch(ExitBB);
// There is no need to emit line number for unconditional branch.
(void)ApplyDebugLocation::CreateEmpty(CGF);
CGF.EmitBlock(NonSPMDBB);
llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
if (const RecordDecl *SecGlobalizedVarsRecord =
I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
SecGlobalRecTy =
CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
// Recover pointer to this function's global record. The runtime will
// handle the specifics of the allocation of the memory.
// Use actual memory size of the record including the padding
// for alignment purposes.
unsigned Alignment =
CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
unsigned GlobalRecordSize =
CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
Size = Bld.CreateSelect(
IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
}
// TODO: allow the usage of shared memory to be controlled by
// the user, for now, default to global.
llvm::Value *GlobalRecordSizeArg[] = {
Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
GlobalRecordSizeArg);
GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
GlobalRecValue, GlobalRecPtrTy);
CGF.EmitBlock(ExitBB);
auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
/*NumReservedValues=*/2, "_select_stack");
Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
GlobalRecCastAddr = Phi;
I->getSecond().GlobalRecordAddr = Phi;
I->getSecond().IsInSPMDModeFlag = IsSPMD;
} else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
assert(GlobalizedRecords.back().Records.size() < 2 &&
"Expected less than 2 globalized records: one for target and one "
"for teams.");
unsigned Offset = 0;
for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
QualType RDTy = CGM.getContext().getRecordType(RD);
unsigned Alignment =
CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
Offset =
llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
}
unsigned Alignment =
CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
Offset = llvm::alignTo(Offset, Alignment);
GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
++GlobalizedRecords.back().RegionCounter;
if (GlobalizedRecords.back().Records.size() == 1) {
assert(KernelStaticGlobalized &&
"Kernel static pointer must be initialized already.");
auto *UseSharedMemory = new llvm::GlobalVariable(
CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
llvm::GlobalValue::InternalLinkage, nullptr,
"_openmp_static_kernel$is_shared");
UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
/*DestWidth=*/16, /*Signed=*/0);
llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
Address(UseSharedMemory,
CGM.getContext().getTypeAlignInChars(Int16Ty)),
/*Volatile=*/false, Int16Ty, Loc);
auto *StaticGlobalized = new llvm::GlobalVariable(
CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
llvm::GlobalValue::CommonLinkage, nullptr);
auto *RecSize = new llvm::GlobalVariable(
CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
llvm::GlobalValue::InternalLinkage, nullptr,
"_openmp_static_kernel$size");
RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
llvm::Value *Ld = CGF.EmitLoadOfScalar(
Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
CGM.getContext().getSizeType(), Loc);
llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
KernelStaticGlobalized, CGM.VoidPtrPtrTy);
llvm::Value *GlobalRecordSizeArg[] = {
llvm::ConstantInt::get(
CGM.Int16Ty,
getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
StaticGlobalized, Ld, IsInSharedMemory, ResAddr};
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_get_team_static_memory),
GlobalRecordSizeArg);
GlobalizedRecords.back().Buffer = StaticGlobalized;
GlobalizedRecords.back().RecSize = RecSize;
GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
GlobalizedRecords.back().Loc = Loc;
}
assert(KernelStaticGlobalized && "Global address must be set already.");
Address FrameAddr = CGF.EmitLoadOfPointer(
Address(KernelStaticGlobalized, CGM.getPointerAlign()),
CGM.getContext()
.getPointerType(CGM.getContext().VoidPtrTy)
.castAs<PointerType>());
llvm::Value *GlobalRecValue =
Bld.CreateConstInBoundsGEP(FrameAddr, Offset).getPointer();
I->getSecond().GlobalRecordAddr = GlobalRecValue;
I->getSecond().IsInSPMDModeFlag = nullptr;
GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
} else {
// TODO: allow the usage of shared memory to be controlled by
// the user, for now, default to global.
bool UseSharedMemory =
IsInTTDRegion && GlobalRecordSize <= SharedMemorySize;
llvm::Value *GlobalRecordSizeArg[] = {
llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
CGF.Builder.getInt16(UseSharedMemory ? 1 : 0)};
llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(),
IsInTTDRegion ? OMPRTL___kmpc_data_sharing_push_stack
: OMPRTL___kmpc_data_sharing_coalesced_push_stack),
GlobalRecordSizeArg);
GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
GlobalRecValue, GlobalRecPtrTy);
I->getSecond().GlobalRecordAddr = GlobalRecValue;
I->getSecond().IsInSPMDModeFlag = nullptr;
// Get the local allocation of a firstprivate variable before sharing
llvm::Value *ParValue;
if (EscapedParam) {
LValue ParLVal =
CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
}
LValue Base =
CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
// Emit the "global alloca" which is a GEP from the global declaration
// record using the pointer returned by the runtime.
LValue SecBase;
decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
if (IsTTD) {
SecIt = I->getSecond().SecondaryLocalVarData->begin();
llvm::PointerType *SecGlobalRecPtrTy =
CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
Bld.CreatePointerBitCastOrAddrSpaceCast(
I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
SecGlobalRecTy);
}
for (auto &Rec : I->getSecond().LocalVarData) {
bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
llvm::Value *ParValue;
if (EscapedParam) {
const auto *VD = cast<VarDecl>(Rec.first);
LValue ParLVal =
CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
}
LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
// Emit VarAddr basing on lane-id if required.
QualType VarTy;
if (Rec.second.IsOnePerTeam) {
VarTy = Rec.second.FD->getType();
} else {
Address Addr = VarAddr.getAddress(CGF);
llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
Addr.getElementType(), Addr.getPointer(),
{Bld.getInt32(0), getNVPTXLaneID(CGF)});
VarTy =
Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
VarAddr = CGF.MakeAddrLValue(
Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
AlignmentSource::Decl);
}
Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
if (!IsInTTDRegion &&
(WithSPMDCheck ||
getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
assert(I->getSecond().IsInSPMDModeFlag &&
"Expected unknown execution mode or required SPMD check.");
if (IsTTD) {
assert(SecIt->second.IsOnePerTeam &&
"Secondary glob data must be one per team.");
LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
VarAddr.setAddress(
Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(CGF),
VarAddr.getPointer(CGF)),
VarAddr.getAlignment()));
Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
}
Address GlobalPtr = Rec.second.PrivateAddr;
Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
Rec.second.PrivateAddr = Address(
Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
LocalAddr.getPointer(), GlobalPtr.getPointer()),
LocalAddr.getAlignment());
}
if (EscapedParam) {
const auto *VD = cast<VarDecl>(Rec.first);
CGF.EmitStoreOfScalar(ParValue, VarAddr);
I->getSecond().MappedParams->setVarAddr(CGF, VD,
VarAddr.getAddress(CGF));
}
if (IsTTD)
++SecIt;
// Allocate space for the variable to be globalized
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
llvm::Instruction *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
// Cast the void pointer and get the address of the globalized variable.
llvm::PointerType *VarPtrTy = CGF.ConvertTypeForMem(VarTy)->getPointerTo();
llvm::Value *CastedVoidPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
VoidPtr, VarPtrTy, VD->getName() + "_on_stack");
LValue VarAddr = CGF.MakeNaturalAlignAddrLValue(CastedVoidPtr, VarTy);
Rec.second.PrivateAddr = VarAddr.getAddress(CGF);
Rec.second.GlobalizedVal = VoidPtr;
// Assign the local allocation to the newly globalized location.
if (EscapedParam) {
CGF.EmitStoreOfScalar(ParValue, VarAddr);
I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress(CGF));
}
if (auto *DI = CGF.getDebugInfo())
VoidPtr->setDebugLoc(DI->SourceLocToDebugLoc(VD->getLocation()));
}
for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
// Recover pointer to this function's global record. The runtime will
// handle the specifics of the allocation of the memory.
// Use actual memory size of the record including the padding
for (const auto *VD : I->getSecond().EscapedVariableLengthDecls) {
// Use actual memory size of the VLA object including the padding
// for alignment purposes.
CGBuilderTy &Bld = CGF.Builder;
llvm::Value *Size = CGF.getTypeSize(VD->getType());
CharUnits Align = CGM.getContext().getDeclAlign(VD);
Size = Bld.CreateNUWAdd(
Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
llvm::Value *AlignVal =
llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
Size = Bld.CreateUDiv(Size, AlignVal);
Size = Bld.CreateNUWMul(Size, AlignVal);
// TODO: allow the usage of shared memory to be controlled by
// the user, for now, default to global.
llvm::Value *GlobalRecordSizeArg[] = {
Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_data_sharing_coalesced_push_stack),
GlobalRecordSizeArg);
llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
// Allocate space for this VLA object to be globalized.
llvm::Value *AllocArgs[] = {CGF.getTypeSize(VD->getType())};
llvm::Instruction *VoidPtr =
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_alloc_shared),
AllocArgs, VD->getName());
I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(VoidPtr);
LValue Base = CGF.MakeAddrLValue(VoidPtr, VD->getType(),
CGM.getContext().getDeclAlign(VD),
AlignmentSource::Decl);
I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
Base.getAddress(CGF));
I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
}
I->getSecond().MappedParams->apply(CGF);
}
@@ -2005,60 +1757,20 @@ void CGOpenMPRuntimeGPU::emitGenericVarsEpilog(CodeGenFunction &CGF,
const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
if (I != FunctionGlobalizedDecls.end()) {
I->getSecond().MappedParams->restore(CGF);
if (!CGF.HaveInsertPoint())
return;
// Deallocate the memory for each globalized VLA object
for (llvm::Value *Addr :
llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
Addr);
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
Addr);
}
if (I->getSecond().GlobalRecordAddr) {
if (!IsInTTDRegion &&
(WithSPMDCheck ||
getExecutionMode() == CGOpenMPRuntimeGPU::EM_Unknown)) {
CGBuilderTy &Bld = CGF.Builder;
llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
// There is no need to emit line number for unconditional branch.
(void)ApplyDebugLocation::CreateEmpty(CGF);
CGF.EmitBlock(NonSPMDBB);
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
CGF.EmitBlock(ExitBB);
} else if (!CGM.getLangOpts().OpenMPCUDATargetParallel && IsInTTDRegion) {
assert(GlobalizedRecords.back().RegionCounter > 0 &&
"region counter must be > 0.");
--GlobalizedRecords.back().RegionCounter;
// Emit the restore function only in the target region.
if (GlobalizedRecords.back().RegionCounter == 0) {
QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
/*DestWidth=*/16, /*Signed=*/0);
llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
Address(GlobalizedRecords.back().UseSharedMemory,
CGM.getContext().getTypeAlignInChars(Int16Ty)),
/*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
llvm::Value *Args[] = {
llvm::ConstantInt::get(
CGM.Int16Ty,
getExecutionMode() == CGOpenMPRuntimeGPU::EM_SPMD ? 1 : 0),
IsInSharedMemory};
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_restore_team_static_memory),
Args);
}
} else {
CGF.EmitRuntimeCall(
OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_data_sharing_pop_stack),
I->getSecond().GlobalRecordAddr);
}
// Deallocate the memory for each globalized value
for (auto &Rec : llvm::reverse(I->getSecond().LocalVarData)) {
I->getSecond().MappedParams->restore(CGF);
CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction(
CGM.getModule(), OMPRTL___kmpc_free_shared),
{Rec.second.GlobalizedVal});
}
}
}
@@ -4183,7 +3895,6 @@ void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
I->getSecond().MappedParams =
std::make_unique<CodeGenFunction::OMPMapVars>();
I->getSecond().GlobalRecord = GlobalizedVarsRecord;
I->getSecond().EscapedParameters.insert(
VarChecker.getEscapedParameters().begin(),
VarChecker.getEscapedParameters().end());
@@ -4192,21 +3903,16 @@ void CGOpenMPRuntimeGPU::emitFunctionProlog(CodeGenFunction &CGF,
DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
assert(VD->isCanonicalDecl() && "Expected canonical declaration");
const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
Data.insert(std::make_pair(VD, MappedVarData()));
}
if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
CheckVarsEscapingDeclContext VarChecker(CGF, llvm::None);
VarChecker.Visit(Body);
I->getSecond().SecondaryGlobalRecord =
VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
I->getSecond().SecondaryLocalVarData.emplace();
DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
assert(VD->isCanonicalDecl() && "Expected canonical declaration");
const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
Data.insert(
std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
Data.insert(std::make_pair(VD, MappedVarData()));
}
}
if (!NeedToDelayGlobalization) {
@@ -4499,187 +4205,8 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(
CGOpenMPRuntime::processRequiresDirective(D);
}
/// Get number of SMs and number of blocks per SM.
static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
std::pair<unsigned, unsigned> Data;
if (CGM.getLangOpts().OpenMPCUDANumSMs)
Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
if (Data.first && Data.second)
return Data;
switch (getCudaArch(CGM)) {
case CudaArch::SM_20:
case CudaArch::SM_21:
case CudaArch::SM_30:
case CudaArch::SM_32:
case CudaArch::SM_35:
case CudaArch::SM_37:
case CudaArch::SM_50:
case CudaArch::SM_52:
case CudaArch::SM_53:
return {16, 16};
case CudaArch::SM_60:
case CudaArch::SM_61:
case CudaArch::SM_62:
return {56, 32};
case CudaArch::SM_70:
case CudaArch::SM_72:
case CudaArch::SM_75:
case CudaArch::SM_80:
case CudaArch::SM_86:
return {84, 32};
case CudaArch::GFX600:
case CudaArch::GFX601:
case CudaArch::GFX602:
case CudaArch::GFX700:
case CudaArch::GFX701:
case CudaArch::GFX702:
case CudaArch::GFX703:
case CudaArch::GFX704:
case CudaArch::GFX705:
case CudaArch::GFX801:
case CudaArch::GFX802:
case CudaArch::GFX803:
case CudaArch::GFX805:
case CudaArch::GFX810:
case CudaArch::GFX900:
case CudaArch::GFX902:
case CudaArch::GFX904:
case CudaArch::GFX906:
case CudaArch::GFX908:
case CudaArch::GFX909:
case CudaArch::GFX90a:
case CudaArch::GFX90c:
case CudaArch::GFX1010:
case CudaArch::GFX1011:
case CudaArch::GFX1012:
case CudaArch::GFX1013:
case CudaArch::GFX1030:
case CudaArch::GFX1031:
case CudaArch::GFX1032:
case CudaArch::GFX1033:
case CudaArch::GFX1034:
case CudaArch::UNUSED:
case CudaArch::UNKNOWN:
break;
case CudaArch::LAST:
llvm_unreachable("Unexpected Cuda arch.");
}
llvm_unreachable("Unexpected NVPTX target without ptx feature.");
}
void CGOpenMPRuntimeGPU::clear() {
if (!GlobalizedRecords.empty() &&
!CGM.getLangOpts().OpenMPCUDATargetParallel) {
ASTContext &C = CGM.getContext();
llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
RecordDecl *StaticRD = C.buildImplicitRecord(
"_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
StaticRD->startDefinition();
RecordDecl *SharedStaticRD = C.buildImplicitRecord(
"_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
SharedStaticRD->startDefinition();
for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
if (Records.Records.empty())
continue;
unsigned Size = 0;
unsigned RecAlignment = 0;
for (const RecordDecl *RD : Records.Records) {
QualType RDTy = C.getRecordType(RD);
unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
RecAlignment = std::max(RecAlignment, Alignment);
unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
Size =
llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
}
Size = llvm::alignTo(Size, RecAlignment);
llvm::APInt ArySize(/*numBits=*/64, Size);
QualType SubTy = C.getConstantArrayType(
C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
const bool UseSharedMemory = Size <= SharedMemorySize;
auto *Field =
FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
SourceLocation(), SourceLocation(), nullptr, SubTy,
C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
/*BW=*/nullptr, /*Mutable=*/false,
/*InitStyle=*/ICIS_NoInit);
Field->setAccess(AS_public);
if (UseSharedMemory) {
SharedStaticRD->addDecl(Field);
SharedRecs.push_back(&Records);
} else {
StaticRD->addDecl(Field);
GlobalRecs.push_back(&Records);
}
Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
Records.UseSharedMemory->setInitializer(
llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
}
// Allocate SharedMemorySize buffer for the shared memory.
// FIXME: nvlink does not handle weak linkage correctly (object with the
// different size are reported as erroneous).
// Restore this code as soon as nvlink is fixed.
if (!SharedStaticRD->field_empty()) {
llvm::APInt ArySize(/*numBits=*/64, SharedMemorySize);
QualType SubTy = C.getConstantArrayType(
C.CharTy, ArySize, nullptr, ArrayType::Normal, /*IndexTypeQuals=*/0);
auto *Field = FieldDecl::Create(
C, SharedStaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
/*BW=*/nullptr, /*Mutable=*/false,
/*InitStyle=*/ICIS_NoInit);
Field->setAccess(AS_public);
SharedStaticRD->addDecl(Field);
}
SharedStaticRD->completeDefinition();
if (!SharedStaticRD->field_empty()) {
QualType StaticTy = C.getRecordType(SharedStaticRD);
llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
auto *GV = new llvm::GlobalVariable(
CGM.getModule(), LLVMStaticTy,
/*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
llvm::UndefValue::get(LLVMStaticTy),
"_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
llvm::GlobalValue::NotThreadLocal,
C.getTargetAddressSpace(LangAS::cuda_shared));
auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
GV, CGM.VoidPtrTy);
for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
Rec->Buffer->replaceAllUsesWith(Replacement);
Rec->Buffer->eraseFromParent();
}
}
StaticRD->completeDefinition();
if (!StaticRD->field_empty()) {
QualType StaticTy = C.getRecordType(StaticRD);
std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
llvm::APInt Size1(32, SMsBlockPerSM.second);
QualType Arr1Ty =
C.getConstantArrayType(StaticTy, Size1, nullptr, ArrayType::Normal,
/*IndexTypeQuals=*/0);
llvm::APInt Size2(32, SMsBlockPerSM.first);
QualType Arr2Ty =
C.getConstantArrayType(Arr1Ty, Size2, nullptr, ArrayType::Normal,
/*IndexTypeQuals=*/0);
llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
// FIXME: nvlink does not handle weak linkage correctly (object with the
// different size are reported as erroneous).
// Restore CommonLinkage as soon as nvlink is fixed.
auto *GV = new llvm::GlobalVariable(
CGM.getModule(), LLVMArr2Ty,
/*isConstant=*/false, llvm::GlobalValue::InternalLinkage,
llvm::Constant::getNullValue(LLVMArr2Ty),
"_openmp_static_glob_rd_$_");
auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
GV, CGM.VoidPtrTy);
for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
Rec->Buffer->replaceAllUsesWith(Replacement);
Rec->Buffer->eraseFromParent();
}
}
}
if (!TeamsReductions.empty()) {
ASTContext &C = CGM.getContext();
RecordDecl *StaticRD = C.buildImplicitRecord(


@@ -440,15 +440,9 @@ private:
/// The data for the single globalized variable.
struct MappedVarData {
/// Corresponding field in the global record.
const FieldDecl *FD = nullptr;
llvm::Value *GlobalizedVal = nullptr;
/// Corresponding address.
Address PrivateAddr = Address::invalid();
/// true, if only one element is required (for lastprivates in SPMD mode),
/// false, if need to create based on the warp-size.
bool IsOnePerTeam = false;
MappedVarData() = delete;
MappedVarData(const FieldDecl *FD, bool IsOnePerTeam = false)
: FD(FD), IsOnePerTeam(IsOnePerTeam) {}
};
/// The map of local variables to their addresses in the global memory.
using DeclToAddrMapTy = llvm::MapVector<const Decl *, MappedVarData>;
@@ -460,29 +454,12 @@ private:
EscapedParamsTy EscapedParameters;
llvm::SmallVector<const ValueDecl*, 4> EscapedVariableLengthDecls;
llvm::SmallVector<llvm::Value *, 4> EscapedVariableLengthDeclsAddrs;
const RecordDecl *GlobalRecord = nullptr;
llvm::Optional<const RecordDecl *> SecondaryGlobalRecord = llvm::None;
llvm::Value *GlobalRecordAddr = nullptr;
llvm::Value *IsInSPMDModeFlag = nullptr;
std::unique_ptr<CodeGenFunction::OMPMapVars> MappedParams;
};
/// Maps the function to the list of the globalized variables with their
/// addresses.
llvm::SmallDenseMap<llvm::Function *, FunctionData> FunctionGlobalizedDecls;
/// List of records for the globalized variables in target/teams/distribute
/// contexts. Inner records are going to be joined into the single record,
/// while those resulting records are going to be joined into the single
/// union. This resulting union (one per CU) is the entry point for the static
/// memory management runtime functions.
struct GlobalPtrSizeRecsTy {
llvm::GlobalVariable *UseSharedMemory = nullptr;
llvm::GlobalVariable *RecSize = nullptr;
llvm::GlobalVariable *Buffer = nullptr;
SourceLocation Loc;
llvm::SmallVector<const RecordDecl *, 2> Records;
unsigned RegionCounter = 0;
};
llvm::SmallVector<GlobalPtrSizeRecsTy, 8> GlobalizedRecords;
llvm::GlobalVariable *KernelTeamsReductionPtr = nullptr;
/// List of the records with the list of fields for the reductions across the
/// teams. Used to build the intermediate buffer for the fast teams


@@ -19,8 +19,6 @@
// CHECK-DAG: declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() [[attr1]]
// CHECK: declare void @__kmpc_kernel_init(i32, i16)
// CHECK-NOT: #
// CHECK: declare void @__kmpc_data_sharing_init_stack()
// CHECK-NOT: #
// CHECK: declare float @_Z3sinf(float) [[attr2:#[0-9]*]]
// CHECK: declare void @__kmpc_kernel_deinit(i16)
// CHECK-NOT: #


@@ -33,7 +33,6 @@ int maini1() {
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@@ -81,39 +80,9 @@ int maini1() {
// CHECK1-LABEL: define {{[^@]+}}@_Z3barv
// CHECK1-SAME: () #[[ATTR2]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[A2:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
// CHECK1-NEXT: [[TMP1:%.*]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @[[GLOB2]], i32 [[TMP0]])
// CHECK1-NEXT: [[TMP2:%.*]] = icmp eq i16 [[TMP1]], 0
// CHECK1-NEXT: [[TMP3:%.*]] = call i8 @__kmpc_is_spmd_exec_mode() #[[ATTR3:[0-9]+]]
// CHECK1-NEXT: [[TMP4:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTSPMD:%.*]], label [[DOTNON_SPMD:%.*]]
// CHECK1: .spmd:
// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
// CHECK1: .non-spmd:
// CHECK1-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], i64 4, i64 128
// CHECK1-NEXT: [[TMP6:%.*]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 [[TMP5]], i16 0)
// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to %struct._globalized_locals_ty*
// CHECK1-NEXT: br label [[DOTEXIT]]
// CHECK1: .exit:
// CHECK1-NEXT: [[_SELECT_STACK:%.*]] = phi %struct._globalized_locals_ty* [ null, [[DOTSPMD]] ], [ [[TMP7]], [[DOTNON_SPMD]] ]
// CHECK1-NEXT: [[TMP8:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to %struct._globalized_locals_ty.0*
// CHECK1-NEXT: [[A:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[_SELECT_STACK]], i32 0, i32 0
// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID]], 31
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* [[A]], i32 0, i32 [[NVPTX_LANE_ID]]
// CHECK1-NEXT: [[A1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP8]], i32 0, i32 0
// CHECK1-NEXT: [[TMP10:%.*]] = select i1 [[TMP2]], i32* [[A1]], i32* [[TMP9]]
// CHECK1-NEXT: [[TMP11:%.*]] = select i1 [[TMP4]], i32* [[A2]], i32* [[TMP10]]
// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[TMP11]]) #[[ATTR4]]
// CHECK1-NEXT: store i32 [[CALL]], i32* [[RETVAL]], align 4
// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTEXIT4:%.*]], label [[DOTNON_SPMD3:%.*]]
// CHECK1: .non-spmd3:
// CHECK1-NEXT: [[TMP12:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to i8*
// CHECK1-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP12]])
// CHECK1-NEXT: br label [[DOTEXIT4]]
// CHECK1: .exit4:
// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[RETVAL]], align 4
// CHECK1-NEXT: ret i32 [[TMP13]]
// CHECK1-NEXT: [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32*
// CHECK1-NEXT: [[CALL:%.*]] = call i32 @_Z3fooRi(i32* nonnull align 4 dereferenceable(4) [[A_ON_STACK]]) #[[ATTR4]]
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[A]])
// CHECK1-NEXT: ret i32 [[CALL]]
//


@@ -3,8 +3,7 @@
///==========================================================================///
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK2
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK
// expected-no-diagnostics
@@ -78,8 +77,6 @@ void test_ds(){
// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15
// CHECK1-SAME: () #[[ATTR1:[0-9]+]] {
// CHECK1-NEXT: entry:
@@ -145,8 +142,6 @@ void test_ds(){
// CHECK1-NEXT: br label [[DOTEXIT]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT: entry:
@@ -159,8 +154,6 @@ void test_ds(){
// CHECK1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK1-NEXT: store i32 1000, i32* [[TMP0]], align 4
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
@@ -178,8 +171,6 @@ void test_ds(){
// CHECK1-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT: entry:
@@ -200,8 +191,6 @@ void test_ds(){
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 10000
// CHECK1-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
@@ -222,8 +211,6 @@ void test_ds(){
// CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8
// CHECK1-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]]
// CHECK1-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15_worker
// CHECK2-SAME: () #[[ATTR0:[0-9]+]] {
// CHECK2-NEXT: entry:
@@ -271,8 +258,6 @@ void test_ds(){
// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l15
// CHECK2-SAME: () #[[ATTR1:[0-9]+]] {
// CHECK2-NEXT: entry:
@@ -333,8 +318,6 @@ void test_ds(){
// CHECK2-NEXT: br label [[DOTEXIT]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT: entry:
@@ -347,8 +330,6 @@ void test_ds(){
// CHECK2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK2-NEXT: store i32 1000, i32* [[TMP0]], align 4
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
@@ -366,8 +347,6 @@ void test_ds(){
// CHECK2-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT: entry:
@@ -388,8 +367,6 @@ void test_ds(){
// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 10000
// CHECK2-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
@@ -410,4 +387,190 @@ void test_ds(){
// CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8
// CHECK2-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]]
// CHECK2-NEXT: ret void
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l14_worker
// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
// CHECK-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK-NEXT: store i8* null, i8** [[WORK_FN]], align 8
// CHECK-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK: .await.work:
// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
// CHECK-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK: .select.workers:
// CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK: .execute.parallel:
// CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
// CHECK-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8
// CHECK-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*)
// CHECK-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
// CHECK: .execute.fn:
// CHECK-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]]
// CHECK-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK: .check.next:
// CHECK-NEXT: [[TMP6:%.*]] = load i8*, i8** [[WORK_FN]], align 8
// CHECK-NEXT: [[WORK_MATCH1:%.*]] = icmp eq i8* [[TMP6]], bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*)
// CHECK-NEXT: br i1 [[WORK_MATCH1]], label [[DOTEXECUTE_FN2:%.*]], label [[DOTCHECK_NEXT3:%.*]]
// CHECK: .execute.fn2:
// CHECK-NEXT: call void @__omp_outlined__1_wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3]]
// CHECK-NEXT: br label [[DOTTERMINATE_PARALLEL]]
// CHECK: .check.next3:
// CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK-NEXT: call void [[TMP7]](i16 0, i32 [[TMP4]])
// CHECK-NEXT: br label [[DOTTERMINATE_PARALLEL]]
// CHECK: .terminate.parallel:
// CHECK-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK: .barrier.parallel:
// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK: .exit:
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l14
// CHECK-SAME: () #[[ATTR1:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [1 x i8*], align 8
// CHECK-NEXT: [[C:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[CAPTURED_VARS_ADDRS7:%.*]] = alloca [2 x i8*], align 8
// CHECK-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK-NEXT: [[TMP0:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK-NEXT: br i1 [[TMP0]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK: .worker:
// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z7test_dsv_l14_worker() #[[ATTR3]]
// CHECK-NEXT: br label [[DOTEXIT:%.*]]
// CHECK: .mastercheck:
// CHECK-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-NEXT: [[TMP1:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK-NEXT: [[TMP3:%.*]] = xor i32 [[TMP1]], -1
// CHECK-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP2]], [[TMP3]]
// CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK-NEXT: br i1 [[TMP4]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK: .master:
// CHECK-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK-NEXT: [[A:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK-NEXT: [[A_ON_STACK:%.*]] = bitcast i8* [[A]] to i32*
// CHECK-NEXT: [[B:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK-NEXT: [[B_ON_STACK:%.*]] = bitcast i8* [[B]] to i32*
// CHECK-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK-NEXT: store i32 10, i32* [[A_ON_STACK]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK-NEXT: [[TMP7:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
// CHECK-NEXT: store i8* [[TMP7]], i8** [[TMP6]], align 8
// CHECK-NEXT: [[TMP8:%.*]] = bitcast [1 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP8]], i64 1)
// CHECK-NEXT: store i32 100, i32* [[B_ON_STACK]], align 4
// CHECK-NEXT: store i32 1000, i32* [[C]], align 4
// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS7]], i64 0, i64 0
// CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[B_ON_STACK]] to i8*
// CHECK-NEXT: store i8* [[TMP10]], i8** [[TMP9]], align 8
// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS7]], i64 0, i64 1
// CHECK-NEXT: [[TMP12:%.*]] = bitcast i32* [[A_ON_STACK]] to i8*
// CHECK-NEXT: store i8* [[TMP12]], i8** [[TMP11]], align 8
// CHECK-NEXT: [[TMP13:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS7]] to i8**
// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB1]], i32 [[TMP5]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, i32*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP13]], i64 2)
// CHECK-NEXT: call void @__kmpc_free_shared(i8* [[B]])
// CHECK-NEXT: call void @__kmpc_free_shared(i8* [[A]])
// CHECK-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK: .termination.notifier:
// CHECK-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK-NEXT: br label [[DOTEXIT]]
// CHECK: .exit:
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK-NEXT: store i32 1000, i32* [[TMP0]], align 4
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
// CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
// CHECK-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
// CHECK-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]]) #[[ATTR3]]
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1
// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], i32* nonnull align 4 dereferenceable(4) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[A:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32*, align 8
// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32*, align 8
// CHECK-NEXT: [[C:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[C1:%.*]] = alloca i32*, align 8
// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK-NEXT: store i32* [[B]], i32** [[B_ADDR]], align 8
// CHECK-NEXT: store i32* [[A]], i32** [[A_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load i32*, i32** [[B_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** [[A_ADDR]], align 8
// CHECK-NEXT: store i32* [[C]], i32** [[C1]], align 8
// CHECK-NEXT: [[TMP2:%.*]] = load i32, i32* [[TMP1]], align 4
// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], 10000
// CHECK-NEXT: store i32 [[ADD]], i32* [[TMP0]], align 4
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__1_wrapper
// CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
// CHECK-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to i32**
// CHECK-NEXT: [[TMP5:%.*]] = load i32*, i32** [[TMP4]], align 8
// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1
// CHECK-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32**
// CHECK-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8
// CHECK-NEXT: call void @__omp_outlined__1(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], i32* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]]
// CHECK-NEXT: ret void
//


@@ -830,7 +830,6 @@ int main(int argc, char **argv) {
// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK2-NEXT: [[TMP7:%.*]] = load %class.anon*, %class.anon** [[TMP]], align 8
// CHECK2-NEXT: [[TMP8:%.*]] = bitcast %class.anon* [[L7]] to i8*
// CHECK2-NEXT: [[TMP9:%.*]] = bitcast %class.anon* [[TMP7]] to i8*
@@ -877,7 +876,6 @@ int main(int argc, char **argv) {
// CHECK2-NEXT: store %class.anon* [[TMP1]], %class.anon** [[TMP]], align 8
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK2: .execute:
// CHECK2-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
@@ -1019,7 +1017,6 @@ int main(int argc, char **argv) {
// CHECK2-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK2-NEXT: [[TMP9:%.*]] = load %class.anon.0*, %class.anon.0** [[_TMP2]], align 8
// CHECK2-NEXT: [[TMP10:%.*]] = bitcast %class.anon.0* [[L9]] to i8*
// CHECK2-NEXT: [[TMP11:%.*]] = bitcast %class.anon.0* [[TMP9]] to i8*
@@ -1086,7 +1083,6 @@ int main(int argc, char **argv) {
// CHECK2-NEXT: store %class.anon.0* [[TMP4]], %class.anon.0** [[_TMP2]], align 8
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK2: .execute:
// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
@@ -1206,7 +1202,6 @@ int main(int argc, char **argv) {
// CHECK2-NEXT: store %class.anon* [[TMP0]], %class.anon** [[TMP]], align 8
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK2: .execute:
// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
@@ -1340,7 +1335,6 @@ int main(int argc, char **argv) {
// CHECK3-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK3-NEXT: [[TMP9:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8
// CHECK3-NEXT: [[TMP10:%.*]] = bitcast %class.anon* [[L9]] to i8*
// CHECK3-NEXT: [[TMP11:%.*]] = bitcast %class.anon* [[TMP9]] to i8*
@@ -1407,7 +1401,6 @@ int main(int argc, char **argv) {
// CHECK3-NEXT: store %class.anon* [[TMP4]], %class.anon** [[_TMP2]], align 8
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK3: .execute:
// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
@@ -1588,7 +1581,6 @@ int main(int argc, char **argv) {
// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK3-NEXT: [[TMP7:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8
// CHECK3-NEXT: [[TMP8:%.*]] = bitcast %class.anon.0* [[L7]] to i8*
// CHECK3-NEXT: [[TMP9:%.*]] = bitcast %class.anon.0* [[TMP7]] to i8*
@@ -1635,7 +1627,6 @@ int main(int argc, char **argv) {
// CHECK3-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK3: .execute:
// CHECK3-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
@ -1697,7 +1688,6 @@ int main(int argc, char **argv) {
// CHECK3-NEXT: store %class.anon.0* [[TMP0]], %class.anon.0** [[TMP]], align 8
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK3: .execute:
// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
@ -1831,7 +1821,6 @@ int main(int argc, char **argv) {
// CHECK4-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK4-NEXT: [[TMP9:%.*]] = load %class.anon*, %class.anon** [[_TMP2]], align 8
// CHECK4-NEXT: [[TMP10:%.*]] = bitcast %class.anon* [[L9]] to i8*
// CHECK4-NEXT: [[TMP11:%.*]] = bitcast %class.anon* [[TMP9]] to i8*
@ -1898,7 +1887,6 @@ int main(int argc, char **argv) {
// CHECK4-NEXT: store %class.anon* [[TMP4]], %class.anon** [[_TMP2]], align 8
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK4: .execute:
// CHECK4-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
@ -2079,7 +2067,6 @@ int main(int argc, char **argv) {
// CHECK4-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK4-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK4-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK4-NEXT: [[TMP7:%.*]] = load %class.anon.0*, %class.anon.0** [[TMP]], align 8
// CHECK4-NEXT: [[TMP8:%.*]] = bitcast %class.anon.0* [[L7]] to i8*
// CHECK4-NEXT: [[TMP9:%.*]] = bitcast %class.anon.0* [[TMP7]] to i8*
@ -2126,7 +2113,6 @@ int main(int argc, char **argv) {
// CHECK4-NEXT: store %class.anon.0* [[TMP1]], %class.anon.0** [[TMP]], align 8
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK4: .execute:
// CHECK4-NEXT: [[TMP2:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
@ -2188,7 +2174,6 @@ int main(int argc, char **argv) {
// CHECK4-NEXT: store %class.anon.0* [[TMP0]], %class.anon.0** [[TMP]], align 8
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK4: .execute:
// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])


@ -31,7 +31,6 @@ int main() {
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 8
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -128,7 +127,6 @@ int main() {
// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK1-NEXT: call void @_Z3usev() #[[ATTR7]]
// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK1: .termination.notifier:
@ -171,7 +169,6 @@ int main() {
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK2: .execute:
// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -268,7 +265,6 @@ int main() {
// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK2-NEXT: call void @_Z3usev() #[[ATTR7]]
// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK2: .termination.notifier:
@ -311,7 +307,6 @@ int main() {
// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [0 x i8*], align 4
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK3: .execute:
// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -408,7 +403,6 @@ int main() {
// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK3-NEXT: call void @_Z3usev() #[[ATTR7]]
// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK3: .termination.notifier:


@ -106,7 +106,6 @@ int main() {
// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK1-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7:[0-9]+]]
// CHECK1-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2)
@ -305,7 +304,6 @@ int main() {
// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK2-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK2-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7:[0-9]+]]
// CHECK2-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2)
@ -504,7 +502,6 @@ int main() {
// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK3-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK3-NEXT: call void @_Z3usePi(i32* [[TMP0]]) #[[ATTR7:[0-9]+]]
// CHECK3-NEXT: call void @__kmpc_push_num_threads(%struct.ident_t* @[[GLOB1]], i32 [[TMP6]], i32 2)

File diff suppressed because it is too large.


@ -1,8 +1,7 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _
// Test target codegen - host bc file has to be created first.
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK1
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns -fopenmp-cuda-parallel-target-regions | FileCheck %s --check-prefix=CHECK2
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - -disable-llvm-optzns | FileCheck %s --check-prefix=CHECK
// expected-no-diagnostics
#ifndef HEADER
#define HEADER
@ -73,8 +72,6 @@ int bar(int n){
// CHECK1-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l14
// CHECK1-SAME: (i64 [[N:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK1-NEXT: entry:
@ -141,8 +138,6 @@ int bar(int n){
// CHECK1-NEXT: br label [[DOTEXIT]]
// CHECK1: .exit:
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK1-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
// CHECK1-NEXT: entry:
@ -231,8 +226,6 @@ int bar(int n){
// CHECK1: omp.dispatch.end:
// CHECK1-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]])
// CHECK1-NEXT: ret void
//
//
// CHECK1-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
// CHECK1-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK1-NEXT: entry:
@ -253,8 +246,6 @@ int bar(int n){
// CHECK1-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8
// CHECK1-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]]
// CHECK1-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l14_worker
// CHECK2-SAME: () #[[ATTR0:[0-9]+]] {
// CHECK2-NEXT: entry:
@ -295,8 +286,6 @@ int bar(int n){
// CHECK2-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l14
// CHECK2-SAME: (i64 [[N:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK2-NEXT: entry:
@ -358,8 +347,6 @@ int bar(int n){
// CHECK2-NEXT: br label [[DOTEXIT]]
// CHECK2: .exit:
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK2-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
// CHECK2-NEXT: entry:
@ -448,8 +435,6 @@ int bar(int n){
// CHECK2: omp.dispatch.end:
// CHECK2-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]])
// CHECK2-NEXT: ret void
//
//
// CHECK2-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
// CHECK2-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK2-NEXT: entry:
@ -470,4 +455,217 @@ int bar(int n){
// CHECK2-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8
// CHECK2-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]]
// CHECK2-NEXT: ret void
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l13_worker
// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[WORK_FN:%.*]] = alloca i8*, align 8
// CHECK-NEXT: [[EXEC_STATUS:%.*]] = alloca i8, align 1
// CHECK-NEXT: store i8* null, i8** [[WORK_FN]], align 8
// CHECK-NEXT: store i8 0, i8* [[EXEC_STATUS]], align 1
// CHECK-NEXT: br label [[DOTAWAIT_WORK:%.*]]
// CHECK: .await.work:
// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK-NEXT: [[TMP0:%.*]] = call i1 @__kmpc_kernel_parallel(i8** [[WORK_FN]])
// CHECK-NEXT: [[TMP1:%.*]] = zext i1 [[TMP0]] to i8
// CHECK-NEXT: store i8 [[TMP1]], i8* [[EXEC_STATUS]], align 1
// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[WORK_FN]], align 8
// CHECK-NEXT: [[SHOULD_TERMINATE:%.*]] = icmp eq i8* [[TMP2]], null
// CHECK-NEXT: br i1 [[SHOULD_TERMINATE]], label [[DOTEXIT:%.*]], label [[DOTSELECT_WORKERS:%.*]]
// CHECK: .select.workers:
// CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[EXEC_STATUS]], align 1
// CHECK-NEXT: [[IS_ACTIVE:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK-NEXT: br i1 [[IS_ACTIVE]], label [[DOTEXECUTE_PARALLEL:%.*]], label [[DOTBARRIER_PARALLEL:%.*]]
// CHECK: .execute.parallel:
// CHECK-NEXT: [[TMP4:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
// CHECK-NEXT: [[TMP5:%.*]] = load i8*, i8** [[WORK_FN]], align 8
// CHECK-NEXT: [[WORK_MATCH:%.*]] = icmp eq i8* [[TMP5]], bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*)
// CHECK-NEXT: br i1 [[WORK_MATCH]], label [[DOTEXECUTE_FN:%.*]], label [[DOTCHECK_NEXT:%.*]]
// CHECK: .execute.fn:
// CHECK-NEXT: call void @__omp_outlined___wrapper(i16 0, i32 [[TMP4]]) #[[ATTR3:[0-9]+]]
// CHECK-NEXT: br label [[DOTTERMINATE_PARALLEL:%.*]]
// CHECK: .check.next:
// CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[TMP2]] to void (i16, i32)*
// CHECK-NEXT: call void [[TMP6]](i16 0, i32 [[TMP4]])
// CHECK-NEXT: br label [[DOTTERMINATE_PARALLEL]]
// CHECK: .terminate.parallel:
// CHECK-NEXT: call void @__kmpc_kernel_end_parallel()
// CHECK-NEXT: br label [[DOTBARRIER_PARALLEL]]
// CHECK: .barrier.parallel:
// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK-NEXT: br label [[DOTAWAIT_WORK]]
// CHECK: .exit:
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l13
// CHECK-SAME: (i64 [[N:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]]) #[[ATTR1:[0-9]+]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i64, align 8
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
// CHECK-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
// CHECK-NEXT: store i64 [[N]], i64* [[N_ADDR]], align 8
// CHECK-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
// CHECK-NEXT: [[CONV:%.*]] = bitcast i64* [[N_ADDR]] to i32*
// CHECK-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
// CHECK-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-NEXT: [[NVPTX_WARP_SIZE:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-NEXT: [[THREAD_LIMIT:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS]], [[NVPTX_WARP_SIZE]]
// CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[NVPTX_TID]], [[THREAD_LIMIT]]
// CHECK-NEXT: br i1 [[TMP1]], label [[DOTWORKER:%.*]], label [[DOTMASTERCHECK:%.*]]
// CHECK: .worker:
// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z9ftemplateIiET_i_l13_worker() #[[ATTR3]]
// CHECK-NEXT: br label [[DOTEXIT:%.*]]
// CHECK: .mastercheck:
// CHECK-NEXT: [[NVPTX_TID1:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK-NEXT: [[NVPTX_NUM_THREADS2:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-NEXT: [[NVPTX_WARP_SIZE3:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-NEXT: [[TMP2:%.*]] = sub nuw i32 [[NVPTX_WARP_SIZE3]], 1
// CHECK-NEXT: [[TMP3:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS2]], 1
// CHECK-NEXT: [[TMP4:%.*]] = xor i32 [[TMP2]], -1
// CHECK-NEXT: [[MASTER_TID:%.*]] = and i32 [[TMP3]], [[TMP4]]
// CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[NVPTX_TID1]], [[MASTER_TID]]
// CHECK-NEXT: br i1 [[TMP5]], label [[DOTMASTER:%.*]], label [[DOTEXIT]]
// CHECK: .master:
// CHECK-NEXT: [[NVPTX_NUM_THREADS4:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK-NEXT: [[D:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK-NEXT: [[D_ON_STACK:%.*]] = bitcast i8* [[D]] to i32*
// CHECK-NEXT: [[TMP6:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2]])
// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[CONV]], align 8
// CHECK-NEXT: store i32 [[TMP7]], i32* [[D_ON_STACK]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK-NEXT: [[TMP9:%.*]] = bitcast [10 x i32]* [[TMP0]] to i8*
// CHECK-NEXT: store i8* [[TMP9]], i8** [[TMP8]], align 8
// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
// CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[D_ON_STACK]] to i8*
// CHECK-NEXT: store i8* [[TMP11]], i8** [[TMP10]], align 8
// CHECK-NEXT: [[TMP12:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB2]], i32 [[TMP6]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, [10 x i32]*, i32*)* @__omp_outlined__ to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined___wrapper to i8*), i8** [[TMP12]], i64 2)
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 3
// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
// CHECK-NEXT: store i32 [[ADD]], i32* [[ARRAYIDX]], align 4
// CHECK-NEXT: call void @__kmpc_free_shared(i8* [[D]])
// CHECK-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK: .termination.notifier:
// CHECK-NEXT: call void @__kmpc_kernel_deinit(i16 1)
// CHECK-NEXT: call void @__kmpc_barrier_simple_spmd(%struct.ident_t* null, i32 0)
// CHECK-NEXT: br label [[DOTEXIT]]
// CHECK: .exit:
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@__omp_outlined__
// CHECK-SAME: (i32* noalias [[DOTGLOBAL_TID_:%.*]], i32* noalias [[DOTBOUND_TID_:%.*]], [10 x i32]* nonnull align 4 dereferenceable(40) [[B:%.*]], i32* nonnull align 4 dereferenceable(4) [[D:%.*]]) #[[ATTR1]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca i32*, align 8
// CHECK-NEXT: [[B_ADDR:%.*]] = alloca [10 x i32]*, align 8
// CHECK-NEXT: [[D_ADDR:%.*]] = alloca i32*, align 8
// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4
// CHECK-NEXT: store i32* [[DOTGLOBAL_TID_]], i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK-NEXT: store i32* [[DOTBOUND_TID_]], i32** [[DOTBOUND_TID__ADDR]], align 8
// CHECK-NEXT: store [10 x i32]* [[B]], [10 x i32]** [[B_ADDR]], align 8
// CHECK-NEXT: store i32* [[D]], i32** [[D_ADDR]], align 8
// CHECK-NEXT: [[TMP0:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
// CHECK-NEXT: [[TMP1:%.*]] = load i32*, i32** [[D_ADDR]], align 8
// CHECK-NEXT: store i32 0, i32* [[DOTOMP_LB]], align 4
// CHECK-NEXT: store i32 9, i32* [[DOTOMP_UB]], align 4
// CHECK-NEXT: store i32 1, i32* [[DOTOMP_STRIDE]], align 4
// CHECK-NEXT: store i32 0, i32* [[DOTOMP_IS_LAST]], align 4
// CHECK-NEXT: [[TMP2:%.*]] = load i32*, i32** [[DOTGLOBAL_TID__ADDR]], align 8
// CHECK-NEXT: [[TMP3:%.*]] = load i32, i32* [[TMP2]], align 4
// CHECK-NEXT: call void @__kmpc_for_static_init_4(%struct.ident_t* @[[GLOB1:[0-9]+]], i32 [[TMP3]], i32 33, i32* [[DOTOMP_IS_LAST]], i32* [[DOTOMP_LB]], i32* [[DOTOMP_UB]], i32* [[DOTOMP_STRIDE]], i32 1, i32 1)
// CHECK-NEXT: br label [[OMP_DISPATCH_COND:%.*]]
// CHECK: omp.dispatch.cond:
// CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP4]], 9
// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
// CHECK: cond.true:
// CHECK-NEXT: br label [[COND_END:%.*]]
// CHECK: cond.false:
// CHECK-NEXT: [[TMP5:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
// CHECK-NEXT: br label [[COND_END]]
// CHECK: cond.end:
// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP5]], [[COND_FALSE]] ]
// CHECK-NEXT: store i32 [[COND]], i32* [[DOTOMP_UB]], align 4
// CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
// CHECK-NEXT: store i32 [[TMP6]], i32* [[DOTOMP_IV]], align 4
// CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
// CHECK-NEXT: [[TMP8:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
// CHECK-NEXT: [[CMP1:%.*]] = icmp sle i32 [[TMP7]], [[TMP8]]
// CHECK-NEXT: br i1 [[CMP1]], label [[OMP_DISPATCH_BODY:%.*]], label [[OMP_DISPATCH_END:%.*]]
// CHECK: omp.dispatch.body:
// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]]
// CHECK: omp.inner.for.cond:
// CHECK-NEXT: [[TMP9:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
// CHECK-NEXT: [[TMP10:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
// CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP9]], [[TMP10]]
// CHECK-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]]
// CHECK: omp.inner.for.body:
// CHECK-NEXT: [[TMP11:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP11]], 1
// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]]
// CHECK-NEXT: store i32 [[ADD]], i32* [[I]], align 4
// CHECK-NEXT: [[TMP12:%.*]] = load i32, i32* [[TMP1]], align 4
// CHECK-NEXT: [[TMP13:%.*]] = load i32, i32* [[I]], align 4
// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP13]] to i64
// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP0]], i64 0, i64 [[IDXPROM]]
// CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP14]], [[TMP12]]
// CHECK-NEXT: store i32 [[ADD3]], i32* [[ARRAYIDX]], align 4
// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]]
// CHECK: omp.body.continue:
// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]]
// CHECK: omp.inner.for.inc:
// CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[DOTOMP_IV]], align 4
// CHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[TMP15]], 1
// CHECK-NEXT: store i32 [[ADD4]], i32* [[DOTOMP_IV]], align 4
// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]]
// CHECK: omp.inner.for.end:
// CHECK-NEXT: br label [[OMP_DISPATCH_INC:%.*]]
// CHECK: omp.dispatch.inc:
// CHECK-NEXT: [[TMP16:%.*]] = load i32, i32* [[DOTOMP_LB]], align 4
// CHECK-NEXT: [[TMP17:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
// CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[TMP16]], [[TMP17]]
// CHECK-NEXT: store i32 [[ADD5]], i32* [[DOTOMP_LB]], align 4
// CHECK-NEXT: [[TMP18:%.*]] = load i32, i32* [[DOTOMP_UB]], align 4
// CHECK-NEXT: [[TMP19:%.*]] = load i32, i32* [[DOTOMP_STRIDE]], align 4
// CHECK-NEXT: [[ADD6:%.*]] = add nsw i32 [[TMP18]], [[TMP19]]
// CHECK-NEXT: store i32 [[ADD6]], i32* [[DOTOMP_UB]], align 4
// CHECK-NEXT: br label [[OMP_DISPATCH_COND]]
// CHECK: omp.dispatch.end:
// CHECK-NEXT: call void @__kmpc_for_static_fini(%struct.ident_t* @[[GLOB1]], i32 [[TMP3]])
// CHECK-NEXT: ret void
//
//
// CHECK-LABEL: define {{[^@]+}}@__omp_outlined___wrapper
// CHECK-SAME: (i16 zeroext [[TMP0:%.*]], i32 [[TMP1:%.*]]) #[[ATTR0]] {
// CHECK-NEXT: entry:
// CHECK-NEXT: [[DOTADDR:%.*]] = alloca i16, align 2
// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[DOTZERO_ADDR:%.*]] = alloca i32, align 4
// CHECK-NEXT: [[GLOBAL_ARGS:%.*]] = alloca i8**, align 8
// CHECK-NEXT: store i32 0, i32* [[DOTZERO_ADDR]], align 4
// CHECK-NEXT: store i16 [[TMP0]], i16* [[DOTADDR]], align 2
// CHECK-NEXT: store i32 [[TMP1]], i32* [[DOTADDR1]], align 4
// CHECK-NEXT: call void @__kmpc_get_shared_variables(i8*** [[GLOBAL_ARGS]])
// CHECK-NEXT: [[TMP2:%.*]] = load i8**, i8*** [[GLOBAL_ARGS]], align 8
// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 0
// CHECK-NEXT: [[TMP4:%.*]] = bitcast i8** [[TMP3]] to [10 x i32]**
// CHECK-NEXT: [[TMP5:%.*]] = load [10 x i32]*, [10 x i32]** [[TMP4]], align 8
// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8*, i8** [[TMP2]], i64 1
// CHECK-NEXT: [[TMP7:%.*]] = bitcast i8** [[TMP6]] to i32**
// CHECK-NEXT: [[TMP8:%.*]] = load i32*, i32** [[TMP7]], align 8
// CHECK-NEXT: call void @__omp_outlined__(i32* [[DOTADDR1]], i32* [[DOTZERO_ADDR]], [10 x i32]* [[TMP5]], i32* [[TMP8]]) #[[ATTR3]]
// CHECK-NEXT: ret void
//
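The added CHECK block above captures the new scheme end to end: the generic-mode kernel wraps the escaping local in a matching __kmpc_alloc_shared/__kmpc_free_shared pair and hands the returned pointer to __kmpc_parallel_51, where the old checks expected a globalized-record push and pop. A minimal source sketch that produces this shape — reconstructed from the CHECK lines, not copied from the test source, with ftemplate_like as an illustrative name — could look like:

    int ftemplate_like(int n) {
      int b[10] = {0};
    #pragma omp target map(tofrom : b)
      {
        int d = n;          // escapes to the parallel workers below, so the
                            // kernel backs it with __kmpc_alloc_shared(4)
    #pragma omp parallel for
        for (int i = 0; i < 10; ++i)
          b[i] += d;        // workers read d through the shared pointer
        b[3] += 1;          // sequential tail in the initial thread
      }                     // __kmpc_free_shared releases d at region end
      return b[3];
    }

Because d is now an ordinary allocation rather than a field in a per-kernel record, each escaping variable gets its own alloc/free pair, which is what makes the later per-variable escape analysis possible.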


@ -155,7 +155,6 @@ void unreachable_call() {
// CHECK1-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 8
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -257,7 +256,6 @@ void unreachable_call() {
// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK1-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK1: .termination.notifier:
// CHECK1-NEXT: call void @__kmpc_kernel_deinit(i16 1)
@ -332,7 +330,6 @@ void unreachable_call() {
// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK1-NEXT: [[TMP5:%.*]] = load i16, i16* [[CONV]], align 8
// CHECK1-NEXT: [[CONV7:%.*]] = sext i16 [[TMP5]] to i32
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV7]], 1
@ -441,7 +438,6 @@ void unreachable_call() {
// CHECK1-NEXT: [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK1-NEXT: [[TMP13:%.*]] = load i32, i32* [[CONV]], align 8
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8
@ -577,7 +573,6 @@ void unreachable_call() {
// CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8
@ -681,7 +676,6 @@ void unreachable_call() {
// CHECK1-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK1-NEXT: [[TMP9:%.*]] = load i32, i32* [[CONV]], align 8
// CHECK1-NEXT: [[CONV9:%.*]] = sitofp i32 [[TMP9]] to double
// CHECK1-NEXT: [[ADD:%.*]] = fadd double [[CONV9]], 1.500000e+00
@ -711,56 +705,27 @@ void unreachable_call() {
//
//
// CHECK1-LABEL: define {{[^@]+}}@_Z3baziRd
// CHECK1-SAME: (i32 [[F3:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] {
// CHECK1-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] {
// CHECK1-NEXT: entry:
// CHECK1-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[F2:%.*]] = alloca i32, align 4
// CHECK1-NEXT: [[A_ADDR:%.*]] = alloca double*, align 8
// CHECK1-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 8
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
// CHECK1-NEXT: [[TMP1:%.*]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]])
// CHECK1-NEXT: [[TMP2:%.*]] = icmp eq i16 [[TMP1]], 0
// CHECK1-NEXT: [[TMP3:%.*]] = call i8 @__kmpc_is_spmd_exec_mode() #[[ATTR2]]
// CHECK1-NEXT: [[TMP4:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTSPMD:%.*]], label [[DOTNON_SPMD:%.*]]
// CHECK1: .spmd:
// CHECK1-NEXT: br label [[DOTEXIT:%.*]]
// CHECK1: .non-spmd:
// CHECK1-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], i64 4, i64 128
// CHECK1-NEXT: [[TMP6:%.*]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 [[TMP5]], i16 0)
// CHECK1-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to %struct._globalized_locals_ty*
// CHECK1-NEXT: br label [[DOTEXIT]]
// CHECK1: .exit:
// CHECK1-NEXT: [[_SELECT_STACK:%.*]] = phi %struct._globalized_locals_ty* [ null, [[DOTSPMD]] ], [ [[TMP7]], [[DOTNON_SPMD]] ]
// CHECK1-NEXT: [[TMP8:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to %struct._globalized_locals_ty.0*
// CHECK1-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[_SELECT_STACK]], i32 0, i32 0
// CHECK1-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK1-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID]], 31
// CHECK1-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F]], i32 0, i32 [[NVPTX_LANE_ID]]
// CHECK1-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP8]], i32 0, i32 0
// CHECK1-NEXT: [[TMP10:%.*]] = select i1 [[TMP2]], i32* [[F1]], i32* [[TMP9]]
// CHECK1-NEXT: [[TMP11:%.*]] = select i1 [[TMP4]], i32* [[F2]], i32* [[TMP10]]
// CHECK1-NEXT: store i32 [[F3]], i32* [[TMP11]], align 4
// CHECK1-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i64 4)
// CHECK1-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32*
// CHECK1-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4
// CHECK1-NEXT: store double* [[A]], double** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP12:%.*]] = load double*, double** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK1-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP11]] to i8*
// CHECK1-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 8
// CHECK1-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
// CHECK1-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP12]] to i8*
// CHECK1-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 8
// CHECK1-NEXT: [[TMP17:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP17]], i64 2)
// CHECK1-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP11]], align 4
// CHECK1-NEXT: store i32 [[TMP18]], i32* [[RETVAL]], align 4
// CHECK1-NEXT: br i1 [[TMP4]], label [[DOTEXIT5:%.*]], label [[DOTNON_SPMD4:%.*]]
// CHECK1: .non-spmd4:
// CHECK1-NEXT: [[TMP19:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to i8*
// CHECK1-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP19]])
// CHECK1-NEXT: br label [[DOTEXIT5]]
// CHECK1: .exit5:
// CHECK1-NEXT: [[TMP20:%.*]] = load i32, i32* [[RETVAL]], align 4
// CHECK1-NEXT: ret i32 [[TMP20]]
// CHECK1-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 8
// CHECK1-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 0
// CHECK1-NEXT: [[TMP3:%.*]] = bitcast i32* [[F_ON_STACK]] to i8*
// CHECK1-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 8
// CHECK1-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i64 0, i64 1
// CHECK1-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP1]] to i8*
// CHECK1-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 8
// CHECK1-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK1-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP6]], i64 2)
// CHECK1-NEXT: [[TMP7:%.*]] = load i32, i32* [[F_ON_STACK]], align 4
// CHECK1-NEXT: call void @__kmpc_free_shared(i8* [[F]])
// CHECK1-NEXT: ret i32 [[TMP7]]
//
//
// CHECK1-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker
@ -825,7 +790,6 @@ void unreachable_call() {
// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK1-NEXT: call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]]
// CHECK1-NEXT: unreachable
// CHECK1: 5:
@ -909,7 +873,6 @@ void unreachable_call() {
// CHECK1-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK1-NEXT: [[TMP6:%.*]] = load i32, i32* [[CONV]], align 8
// CHECK1-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK1-NEXT: store i32 [[ADD]], i32* [[CONV]], align 8
@ -987,7 +950,6 @@ void unreachable_call() {
// CHECK2-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK2: .execute:
// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -1089,7 +1051,6 @@ void unreachable_call() {
// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK2-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK2: .termination.notifier:
// CHECK2-NEXT: call void @__kmpc_kernel_deinit(i16 1)
@ -1164,7 +1125,6 @@ void unreachable_call() {
// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK2-NEXT: [[TMP5:%.*]] = load i16, i16* [[CONV]], align 4
// CHECK2-NEXT: [[CONV7:%.*]] = sext i16 [[TMP5]] to i32
// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV7]], 1
@ -1272,7 +1232,6 @@ void unreachable_call() {
// CHECK2-NEXT: [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK2-NEXT: [[TMP13:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
// CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
@ -1407,7 +1366,6 @@ void unreachable_call() {
// CHECK2-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
@ -1510,7 +1468,6 @@ void unreachable_call() {
// CHECK2-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK2-NEXT: [[TMP9:%.*]] = load i32, i32* [[B_ADDR]], align 4
// CHECK2-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP9]] to double
// CHECK2-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
@ -1540,56 +1497,27 @@ void unreachable_call() {
//
//
// CHECK2-LABEL: define {{[^@]+}}@_Z3baziRd
// CHECK2-SAME: (i32 [[F3:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] {
// CHECK2-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] {
// CHECK2-NEXT: entry:
// CHECK2-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[F2:%.*]] = alloca i32, align 4
// CHECK2-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4
// CHECK2-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
// CHECK2-NEXT: [[TMP1:%.*]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]])
// CHECK2-NEXT: [[TMP2:%.*]] = icmp eq i16 [[TMP1]], 0
// CHECK2-NEXT: [[TMP3:%.*]] = call i8 @__kmpc_is_spmd_exec_mode() #[[ATTR2]]
// CHECK2-NEXT: [[TMP4:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTSPMD:%.*]], label [[DOTNON_SPMD:%.*]]
// CHECK2: .spmd:
// CHECK2-NEXT: br label [[DOTEXIT:%.*]]
// CHECK2: .non-spmd:
// CHECK2-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], i32 4, i32 128
// CHECK2-NEXT: [[TMP6:%.*]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i32 [[TMP5]], i16 0)
// CHECK2-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to %struct._globalized_locals_ty*
// CHECK2-NEXT: br label [[DOTEXIT]]
// CHECK2: .exit:
// CHECK2-NEXT: [[_SELECT_STACK:%.*]] = phi %struct._globalized_locals_ty* [ null, [[DOTSPMD]] ], [ [[TMP7]], [[DOTNON_SPMD]] ]
// CHECK2-NEXT: [[TMP8:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to %struct._globalized_locals_ty.0*
// CHECK2-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[_SELECT_STACK]], i32 0, i32 0
// CHECK2-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK2-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID]], 31
// CHECK2-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F]], i32 0, i32 [[NVPTX_LANE_ID]]
// CHECK2-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP8]], i32 0, i32 0
// CHECK2-NEXT: [[TMP10:%.*]] = select i1 [[TMP2]], i32* [[F1]], i32* [[TMP9]]
// CHECK2-NEXT: [[TMP11:%.*]] = select i1 [[TMP4]], i32* [[F2]], i32* [[TMP10]]
// CHECK2-NEXT: store i32 [[F3]], i32* [[TMP11]], align 4
// CHECK2-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK2-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32*
// CHECK2-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4
// CHECK2-NEXT: store double* [[A]], double** [[A_ADDR]], align 4
// CHECK2-NEXT: [[TMP12:%.*]] = load double*, double** [[A_ADDR]], align 4
// CHECK2-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK2-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP11]] to i8*
// CHECK2-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
// CHECK2-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
// CHECK2-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP12]] to i8*
// CHECK2-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4
// CHECK2-NEXT: [[TMP17:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP17]], i32 2)
// CHECK2-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP11]], align 4
// CHECK2-NEXT: store i32 [[TMP18]], i32* [[RETVAL]], align 4
// CHECK2-NEXT: br i1 [[TMP4]], label [[DOTEXIT5:%.*]], label [[DOTNON_SPMD4:%.*]]
// CHECK2: .non-spmd4:
// CHECK2-NEXT: [[TMP19:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to i8*
// CHECK2-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP19]])
// CHECK2-NEXT: br label [[DOTEXIT5]]
// CHECK2: .exit5:
// CHECK2-NEXT: [[TMP20:%.*]] = load i32, i32* [[RETVAL]], align 4
// CHECK2-NEXT: ret i32 [[TMP20]]
// CHECK2-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 4
// CHECK2-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK2-NEXT: [[TMP3:%.*]] = bitcast i32* [[F_ON_STACK]] to i8*
// CHECK2-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
// CHECK2-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
// CHECK2-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP1]] to i8*
// CHECK2-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
// CHECK2-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK2-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP6]], i32 2)
// CHECK2-NEXT: [[TMP7:%.*]] = load i32, i32* [[F_ON_STACK]], align 4
// CHECK2-NEXT: call void @__kmpc_free_shared(i8* [[F]])
// CHECK2-NEXT: ret i32 [[TMP7]]
//
//
// CHECK2-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker
@ -1654,7 +1582,6 @@ void unreachable_call() {
// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK2-NEXT: call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]]
// CHECK2-NEXT: unreachable
// CHECK2: 5:
@ -1737,7 +1664,6 @@ void unreachable_call() {
// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK2-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK2-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK2-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
@ -1815,7 +1741,6 @@ void unreachable_call() {
// CHECK3-NEXT: [[TMP0:%.*]] = load i32**, i32*** [[PTR2_ADDR]], align 4
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK3: .execute:
// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -1917,7 +1842,6 @@ void unreachable_call() {
// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK3-NEXT: br label [[DOTTERMINATION_NOTIFIER:%.*]]
// CHECK3: .termination.notifier:
// CHECK3-NEXT: call void @__kmpc_kernel_deinit(i16 1)
@ -1992,7 +1916,6 @@ void unreachable_call() {
// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK3-NEXT: [[TMP5:%.*]] = load i16, i16* [[CONV]], align 4
// CHECK3-NEXT: [[CONV7:%.*]] = sext i16 [[TMP5]] to i32
// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[CONV7]], 1
@ -2100,7 +2023,6 @@ void unreachable_call() {
// CHECK3-NEXT: [[NVPTX_WARP_SIZE9:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT10:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS8]], [[NVPTX_WARP_SIZE9]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT10]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK3-NEXT: [[TMP13:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP13]], 1
// CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
@ -2235,7 +2157,6 @@ void unreachable_call() {
// CHECK3-NEXT: [[NVPTX_WARP_SIZE6:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT7:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS5]], [[NVPTX_WARP_SIZE6]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT7]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
@ -2338,7 +2259,6 @@ void unreachable_call() {
// CHECK3-NEXT: [[NVPTX_WARP_SIZE7:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT8:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS6]], [[NVPTX_WARP_SIZE7]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT8]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK3-NEXT: [[TMP9:%.*]] = load i32, i32* [[B_ADDR]], align 4
// CHECK3-NEXT: [[CONV:%.*]] = sitofp i32 [[TMP9]] to double
// CHECK3-NEXT: [[ADD:%.*]] = fadd double [[CONV]], 1.500000e+00
@ -2368,56 +2288,27 @@ void unreachable_call() {
//
//
// CHECK3-LABEL: define {{[^@]+}}@_Z3baziRd
// CHECK3-SAME: (i32 [[F3:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] {
// CHECK3-SAME: (i32 [[F1:%.*]], double* nonnull align 8 dereferenceable(8) [[A:%.*]]) #[[ATTR5]] {
// CHECK3-NEXT: entry:
// CHECK3-NEXT: [[RETVAL:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[F2:%.*]] = alloca i32, align 4
// CHECK3-NEXT: [[A_ADDR:%.*]] = alloca double*, align 4
// CHECK3-NEXT: [[CAPTURED_VARS_ADDRS:%.*]] = alloca [2 x i8*], align 4
// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
// CHECK3-NEXT: [[TMP1:%.*]] = call i16 @__kmpc_parallel_level(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]])
// CHECK3-NEXT: [[TMP2:%.*]] = icmp eq i16 [[TMP1]], 0
// CHECK3-NEXT: [[TMP3:%.*]] = call i8 @__kmpc_is_spmd_exec_mode() #[[ATTR2]]
// CHECK3-NEXT: [[TMP4:%.*]] = icmp ne i8 [[TMP3]], 0
// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTSPMD:%.*]], label [[DOTNON_SPMD:%.*]]
// CHECK3: .spmd:
// CHECK3-NEXT: br label [[DOTEXIT:%.*]]
// CHECK3: .non-spmd:
// CHECK3-NEXT: [[TMP5:%.*]] = select i1 [[TMP2]], i32 4, i32 128
// CHECK3-NEXT: [[TMP6:%.*]] = call i8* @__kmpc_data_sharing_coalesced_push_stack(i32 [[TMP5]], i16 0)
// CHECK3-NEXT: [[TMP7:%.*]] = bitcast i8* [[TMP6]] to %struct._globalized_locals_ty*
// CHECK3-NEXT: br label [[DOTEXIT]]
// CHECK3: .exit:
// CHECK3-NEXT: [[_SELECT_STACK:%.*]] = phi %struct._globalized_locals_ty* [ null, [[DOTSPMD]] ], [ [[TMP7]], [[DOTNON_SPMD]] ]
// CHECK3-NEXT: [[TMP8:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to %struct._globalized_locals_ty.0*
// CHECK3-NEXT: [[F:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY:%.*]], %struct._globalized_locals_ty* [[_SELECT_STACK]], i32 0, i32 0
// CHECK3-NEXT: [[NVPTX_TID:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
// CHECK3-NEXT: [[NVPTX_LANE_ID:%.*]] = and i32 [[NVPTX_TID]], 31
// CHECK3-NEXT: [[TMP9:%.*]] = getelementptr inbounds [32 x i32], [32 x i32]* [[F]], i32 0, i32 [[NVPTX_LANE_ID]]
// CHECK3-NEXT: [[F1:%.*]] = getelementptr inbounds [[STRUCT__GLOBALIZED_LOCALS_TY_0:%.*]], %struct._globalized_locals_ty.0* [[TMP8]], i32 0, i32 0
// CHECK3-NEXT: [[TMP10:%.*]] = select i1 [[TMP2]], i32* [[F1]], i32* [[TMP9]]
// CHECK3-NEXT: [[TMP11:%.*]] = select i1 [[TMP4]], i32* [[F2]], i32* [[TMP10]]
// CHECK3-NEXT: store i32 [[F3]], i32* [[TMP11]], align 4
// CHECK3-NEXT: [[F:%.*]] = call i8* @__kmpc_alloc_shared(i32 4)
// CHECK3-NEXT: [[F_ON_STACK:%.*]] = bitcast i8* [[F]] to i32*
// CHECK3-NEXT: store i32 [[F1]], i32* [[F_ON_STACK]], align 4
// CHECK3-NEXT: store double* [[A]], double** [[A_ADDR]], align 4
// CHECK3-NEXT: [[TMP12:%.*]] = load double*, double** [[A_ADDR]], align 4
// CHECK3-NEXT: [[TMP13:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK3-NEXT: [[TMP14:%.*]] = bitcast i32* [[TMP11]] to i8*
// CHECK3-NEXT: store i8* [[TMP14]], i8** [[TMP13]], align 4
// CHECK3-NEXT: [[TMP15:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
// CHECK3-NEXT: [[TMP16:%.*]] = bitcast double* [[TMP12]] to i8*
// CHECK3-NEXT: store i8* [[TMP16]], i8** [[TMP15]], align 4
// CHECK3-NEXT: [[TMP17:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP17]], i32 2)
// CHECK3-NEXT: [[TMP18:%.*]] = load i32, i32* [[TMP11]], align 4
// CHECK3-NEXT: store i32 [[TMP18]], i32* [[RETVAL]], align 4
// CHECK3-NEXT: br i1 [[TMP4]], label [[DOTEXIT5:%.*]], label [[DOTNON_SPMD4:%.*]]
// CHECK3: .non-spmd4:
// CHECK3-NEXT: [[TMP19:%.*]] = bitcast %struct._globalized_locals_ty* [[_SELECT_STACK]] to i8*
// CHECK3-NEXT: call void @__kmpc_data_sharing_pop_stack(i8* [[TMP19]])
// CHECK3-NEXT: br label [[DOTEXIT5]]
// CHECK3: .exit5:
// CHECK3-NEXT: [[TMP20:%.*]] = load i32, i32* [[RETVAL]], align 4
// CHECK3-NEXT: ret i32 [[TMP20]]
// CHECK3-NEXT: [[TMP1:%.*]] = load double*, double** [[A_ADDR]], align 4
// CHECK3-NEXT: [[TMP2:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 0
// CHECK3-NEXT: [[TMP3:%.*]] = bitcast i32* [[F_ON_STACK]] to i8*
// CHECK3-NEXT: store i8* [[TMP3]], i8** [[TMP2]], align 4
// CHECK3-NEXT: [[TMP4:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[CAPTURED_VARS_ADDRS]], i32 0, i32 1
// CHECK3-NEXT: [[TMP5:%.*]] = bitcast double* [[TMP1]] to i8*
// CHECK3-NEXT: store i8* [[TMP5]], i8** [[TMP4]], align 4
// CHECK3-NEXT: [[TMP6:%.*]] = bitcast [2 x i8*]* [[CAPTURED_VARS_ADDRS]] to i8**
// CHECK3-NEXT: call void @__kmpc_parallel_51(%struct.ident_t* @[[GLOB3]], i32 [[TMP0]], i32 1, i32 -1, i32 -1, i8* bitcast (void (i32*, i32*, i32*, double*)* @__omp_outlined__1 to i8*), i8* bitcast (void (i16, i32)* @__omp_outlined__1_wrapper to i8*), i8** [[TMP6]], i32 2)
// CHECK3-NEXT: [[TMP7:%.*]] = load i32, i32* [[F_ON_STACK]], align 4
// CHECK3-NEXT: call void @__kmpc_free_shared(i8* [[F]])
// CHECK3-NEXT: ret i32 [[TMP7]]
//
//
// CHECK3-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z16unreachable_callv_l142_worker
@ -2482,7 +2373,6 @@ void unreachable_call() {
// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK3-NEXT: call void @_Z6asserti(i32 0) #[[ATTR8:[0-9]+]]
// CHECK3-NEXT: unreachable
// CHECK3: 5:
@ -2565,7 +2455,6 @@ void unreachable_call() {
// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK3-NEXT: [[TMP6:%.*]] = load i32, i32* [[A_ADDR]], align 4
// CHECK3-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP6]], 1
// CHECK3-NEXT: store i32 [[ADD]], i32* [[A_ADDR]], align 4
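The _Z3baziRd rewrite above (repeated once per check prefix) is the clearest before/after in this file: the parallel-level query, the SPMD-mode branch, and the __kmpc_data_sharing_coalesced_push_stack/__kmpc_data_sharing_pop_stack pair all collapse into one allocation and one free. Written out by hand, the emitted pattern amounts to the sketch below; the declarations are inferred from the call sites in the IR (i8* @__kmpc_alloc_shared(i64/i32), void @__kmpc_free_shared(i8*)), not quoted from the device runtime headers, and baz_by_hand is an illustrative name:

    #include <cstddef>

    // Signatures inferred from the call sites above; the real declarations
    // live in the device runtime library.
    extern "C" void *__kmpc_alloc_shared(size_t bytes);
    extern "C" void __kmpc_free_shared(void *ptr);

    // Hand-written equivalent of the new _Z3baziRd body: the parameter f
    // escapes into the parallel region, so it lives in globalized memory
    // for exactly the span between the alloc and the free.
    int baz_by_hand(int f, double &a) {
      int *f_on_stack = static_cast<int *>(__kmpc_alloc_shared(sizeof(int)));
      *f_on_stack = f;
      // ... __kmpc_parallel_51(...) runs the outlined region with
      // {f_on_stack, &a} as its captured-variable array ...
      int result = *f_on_stack;
      __kmpc_free_shared(f_on_stack);
      return result;
    }

Pairing the free with the alloc in the same function keeps the allocation's lifetime explicit, so a small allocation that provably does not escape can later be demoted to an alloca or promoted to shared memory.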


@ -60,7 +60,6 @@ int bar(int n){
// CHECK1-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -110,7 +109,6 @@ int bar(int n){
// CHECK1-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -173,7 +171,6 @@ int bar(int n){
// CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK2: .execute:
// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -223,7 +220,6 @@ int bar(int n){
// CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK2: .execute:
// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -286,7 +282,6 @@ int bar(int n){
// CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK3: .execute:
// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -336,7 +331,6 @@ int bar(int n){
// CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK3: .execute:
// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -399,7 +393,6 @@ int bar(int n){
// CHECK4-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK4: .execute:
// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -449,7 +442,6 @@ int bar(int n){
// CHECK4-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 8
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK4: .execute:
// CHECK4-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -512,7 +504,6 @@ int bar(int n){
// CHECK5-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK5: .execute:
// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -562,7 +553,6 @@ int bar(int n){
// CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK5: .execute:
// CHECK5-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -625,7 +615,6 @@ int bar(int n){
// CHECK6-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK6: .execute:
// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -675,7 +664,6 @@ int bar(int n){
// CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK6: .execute:
// CHECK6-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
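
Across every check prefix in this file the change is identical: the `__kmpc_data_sharing_init_stack`/`__kmpc_data_sharing_init_stack_spmd` call that followed kernel initialization is gone, since the runtime no longer maintains a data-sharing stack to set up. A sketch of the resulting SPMD entry prologue (assumed shape, reduced from the checks above):

declare void @__kmpc_spmd_kernel_init(i32, i16)
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x()

define void @spmd_entry_sketch() {
entry:
  ; Kernel initialization is now the only runtime setup in the prologue.
  %nvptx_num_threads = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
  call void @__kmpc_spmd_kernel_init(i32 %nvptx_num_threads, i16 1)
  br label %execute

execute:
  ret void
}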


@ -55,7 +55,6 @@ int bar(int n){
// CHECK1-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -108,7 +107,6 @@ int bar(int n){
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32*
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -172,7 +170,6 @@ int bar(int n){
// CHECK2-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK2: .execute:
// CHECK2-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -224,7 +221,6 @@ int bar(int n){
// CHECK2-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK2: .execute:
// CHECK2-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -288,7 +284,6 @@ int bar(int n){
// CHECK3-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK3: .execute:
// CHECK3-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -340,7 +335,6 @@ int bar(int n){
// CHECK3-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK3: .execute:
// CHECK3-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -404,7 +398,6 @@ int bar(int n){
// CHECK4-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 8
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK4: .execute:
// CHECK4-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -457,7 +450,6 @@ int bar(int n){
// CHECK4-NEXT: [[CONV:%.*]] = bitcast i64* [[DOTCAPTURE_EXPR__ADDR]] to i32*
// CHECK4-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK4-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK4-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK4-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK4: .execute:
// CHECK4-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -521,7 +513,6 @@ int bar(int n){
// CHECK5-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK5: .execute:
// CHECK5-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -573,7 +564,6 @@ int bar(int n){
// CHECK5-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK5-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK5-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK5-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK5-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK5: .execute:
// CHECK5-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
@ -637,7 +627,6 @@ int bar(int n){
// CHECK6-NEXT: [[TMP0:%.*]] = load i16*, i16** [[AA_ADDR]], align 4
// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK6: .execute:
// CHECK6-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -689,7 +678,6 @@ int bar(int n){
// CHECK6-NEXT: [[TMP2:%.*]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align 4
// CHECK6-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK6-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK6-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK6-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK6: .execute:
// CHECK6-NEXT: [[TMP3:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])


@ -55,7 +55,6 @@ int bar(int n){
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l29}}(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXEC:.+]]
//
// CHECK: [[EXEC]]
@ -73,7 +72,6 @@ int bar(int n){
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXEC:.+]]
//
// CHECK: [[EXEC]]
@ -91,7 +89,6 @@ int bar(int n){
// CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXEC:.+]]
//
// CHECK: [[EXEC]]


@ -55,7 +55,6 @@ int bar(int n){
// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l27}}(
//
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXECUTE:.+]]
//
// CHECK: [[EXECUTE]]
@ -239,7 +238,6 @@ int bar(int n){
// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l32}}(
//
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXECUTE:.+]]
//
// CHECK: [[EXECUTE]]
@ -501,7 +499,6 @@ int bar(int n){
// CHECK: define {{.*}}void {{@__omp_offloading_.+template.+l38}}(
//
// CHECK: call void @__kmpc_spmd_kernel_init(i32 {{.+}}, i16 1)
// CHECK: call void @__kmpc_data_sharing_init_stack_spmd
// CHECK: br label {{%?}}[[EXECUTE:.+]]
//
// CHECK: [[EXECUTE]]


@ -118,7 +118,6 @@ int bar(int n){
// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT: [[TMP6:%.*]] = load i8, i8* [[CONV]], align 8
// CHECK1-NEXT: [[CONV7:%.*]] = bitcast i64* [[A_CASTED]] to i8*
@ -218,7 +217,6 @@ int bar(int n){
// CHECK1-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK1-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK1-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK1-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK1-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 8
// CHECK1-NEXT: [[CONV7:%.*]] = bitcast i64* [[AA_CASTED]] to i16*
@ -261,7 +259,6 @@ int bar(int n){
// CHECK1-NEXT: [[CONV:%.*]] = bitcast i64* [[AA_ADDR]] to i16*
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
@ -404,7 +401,6 @@ int bar(int n){
// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK2-NEXT: [[TMP6:%.*]] = load i8, i8* [[CONV]], align 4
// CHECK2-NEXT: [[CONV7:%.*]] = bitcast i32* [[A_CASTED]] to i8*
@ -504,7 +500,6 @@ int bar(int n){
// CHECK2-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK2-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK2-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK2-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK2-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 4
// CHECK2-NEXT: [[CONV7:%.*]] = bitcast i32* [[AA_CASTED]] to i16*
@ -547,7 +542,6 @@ int bar(int n){
// CHECK2-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
// CHECK2-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK2-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK2-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK2-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK2: .execute:
// CHECK2-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])
@ -690,7 +684,6 @@ int bar(int n){
// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK3-NEXT: [[TMP6:%.*]] = load i8, i8* [[CONV]], align 4
// CHECK3-NEXT: [[CONV7:%.*]] = bitcast i32* [[A_CASTED]] to i8*
@ -790,7 +783,6 @@ int bar(int n){
// CHECK3-NEXT: [[NVPTX_WARP_SIZE5:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
// CHECK3-NEXT: [[THREAD_LIMIT6:%.*]] = sub nuw i32 [[NVPTX_NUM_THREADS4]], [[NVPTX_WARP_SIZE5]]
// CHECK3-NEXT: call void @__kmpc_kernel_init(i32 [[THREAD_LIMIT6]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack()
// CHECK3-NEXT: [[TMP5:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1]])
// CHECK3-NEXT: [[TMP6:%.*]] = load i16, i16* [[CONV]], align 4
// CHECK3-NEXT: [[CONV7:%.*]] = bitcast i32* [[AA_CASTED]] to i16*
@ -833,7 +825,6 @@ int bar(int n){
// CHECK3-NEXT: [[CONV:%.*]] = bitcast i32* [[AA_ADDR]] to i16*
// CHECK3-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
// CHECK3-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1)
// CHECK3-NEXT: call void @__kmpc_data_sharing_init_stack_spmd()
// CHECK3-NEXT: br label [[DOTEXECUTE:%.*]]
// CHECK3: .execute:
// CHECK3-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB2:[0-9]+]])

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large


@ -97,7 +97,6 @@ int main() {
// CHECK1-NEXT: [[TMP7:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG45]]
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG45]]
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1), !dbg [[DBG45]]
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd(), !dbg [[DBG45]]
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]], !dbg [[DBG45]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP8:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB1:[0-9]+]])
@ -321,7 +320,6 @@ int main() {
// CHECK1-NEXT: [[TMP8:%.*]] = load i8*, i8** [[_TMP2]], align 8, !dbg [[DBG135]]
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG135]]
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1), !dbg [[DBG135]]
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd(), !dbg [[DBG135]]
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]], !dbg [[DBG135]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP9:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB3:[0-9]+]])
@ -539,7 +537,6 @@ int main() {
// CHECK1-NEXT: [[TMP11:%.*]] = load i8*, i8** [[_TMP3]], align 8, !dbg [[DBG210]]
// CHECK1-NEXT: [[NVPTX_NUM_THREADS:%.*]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg [[DBG210]]
// CHECK1-NEXT: call void @__kmpc_spmd_kernel_init(i32 [[NVPTX_NUM_THREADS]], i16 1), !dbg [[DBG210]]
// CHECK1-NEXT: call void @__kmpc_data_sharing_init_stack_spmd(), !dbg [[DBG210]]
// CHECK1-NEXT: br label [[DOTEXECUTE:%.*]], !dbg [[DBG210]]
// CHECK1: .execute:
// CHECK1-NEXT: [[TMP12:%.*]] = call i32 @__kmpc_global_thread_num(%struct.ident_t* @[[GLOB5:[0-9]+]])


@ -429,20 +429,14 @@ __OMP_RTL(__kmpc_nvptx_teams_reduce_nowait_v2, false, Int32, IdentPtr, Int32,
GlobalListPtr, GlobalListPtr, GlobalListPtr, GlobalListPtr)
__OMP_RTL(__kmpc_shuffle_int64, false, Int64, Int64, Int16, Int16)
__OMP_RTL(__kmpc_data_sharing_init_stack, false, Void, )
__OMP_RTL(__kmpc_data_sharing_init_stack_spmd, false, Void, )
__OMP_RTL(__kmpc_data_sharing_coalesced_push_stack, false, VoidPtr, SizeTy, Int16)
__OMP_RTL(__kmpc_data_sharing_push_stack, false, VoidPtr, SizeTy, Int16)
__OMP_RTL(__kmpc_data_sharing_pop_stack, false, Void, VoidPtr)
__OMP_RTL(__kmpc_alloc_shared, false, VoidPtr, SizeTy)
__OMP_RTL(__kmpc_free_shared, false, Void, VoidPtr)
__OMP_RTL(__kmpc_begin_sharing_variables, false, Void, VoidPtrPtrPtr, SizeTy)
__OMP_RTL(__kmpc_end_sharing_variables, false, Void, )
__OMP_RTL(__kmpc_get_shared_variables, false, Void, VoidPtrPtrPtr)
__OMP_RTL(__kmpc_parallel_level, false, Int16, IdentPtr, Int32)
__OMP_RTL(__kmpc_is_spmd_exec_mode, false, Int8, )
__OMP_RTL(__kmpc_get_team_static_memory, false, Void, Int16, VoidPtr, SizeTy,
Int16, VoidPtrPtr)
__OMP_RTL(__kmpc_restore_team_static_memory, false, Void, Int16, Int16)
__OMP_RTL(__kmpc_barrier_simple_spmd, false, Void, IdentPtr, Int32)
__OMP_RTL(__kmpc_warp_active_thread_mask, false, LanemaskTy,)
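
Each `__OMP_RTL(Name, IsVarArg, ReturnType, ParamTypes...)` entry above declares one device runtime function, so this table change deletes the data-sharing stack interface outright and replaces it with a malloc/free-style pair. At the IR level the two new entries correspond to the following declarations (a sketch; the real declarations are generated from this table):

declare i8* @__kmpc_alloc_shared(i64)  ; VoidPtr __kmpc_alloc_shared(SizeTy)
declare void @__kmpc_free_shared(i8*)  ; Void __kmpc_free_shared(VoidPtr)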


@ -1122,9 +1122,8 @@ private:
}
void analysisGlobalization() {
RuntimeFunction GlobalizationRuntimeIDs[] = {
OMPRTL___kmpc_data_sharing_coalesced_push_stack,
OMPRTL___kmpc_data_sharing_push_stack};
RuntimeFunction GlobalizationRuntimeIDs[] = {OMPRTL___kmpc_alloc_shared,
OMPRTL___kmpc_free_shared};
for (const auto GlobalizationCallID : GlobalizationRuntimeIDs) {
auto &RFI = OMPInfoCache.RFIs[GlobalizationCallID];
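
With the allocation and its release visible as a matched call pair, OpenMPOpt can reason about each globalized variable independently, which is the follow-up the commit message anticipates: an allocation whose pointer is never captured can later be rewritten into a plain `alloca`. A sketch of that future rewrite (assumed, not part of this patch; `@use_nocapture` is a hypothetical consumer):

declare i8* @__kmpc_alloc_shared(i64)
declare void @__kmpc_free_shared(i8*)
declare void @use_nocapture(i8* nocapture)

; Before: runtime-managed globalized storage.
define void @before() {
entry:
  %p = call i8* @__kmpc_alloc_shared(i64 4)
  call void @use_nocapture(i8* %p)
  call void @__kmpc_free_shared(i8* %p)
  ret void
}

; After: the pointer does not escape, so the pair folds into a stack slot.
define void @after() {
entry:
  %p = alloca i8, i64 4, align 4
  call void @use_nocapture(i8* %p)
  ret void
}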


@ -2,144 +2,41 @@
; ModuleID = 'declare_target_codegen_globalization.cpp'
source_filename = "declare_target_codegen_globalization.cpp"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
target triple = "nvptx64"
%struct.ident_t = type { i32, i32, i32, i32, i8* }
%struct._globalized_locals_ty = type { [32 x i32] }
; CHECK: remark: globalization_remarks.c:5:7: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.
@0 = private unnamed_addr constant [56 x i8] c";declare_target_codegen_globalization.cpp;maini1;17;1;;\00", align 1
@1 = private unnamed_addr constant %struct.ident_t { i32 0, i32 2, i32 1, i32 0, i8* getelementptr inbounds ([56 x i8], [56 x i8]* @0, i32 0, i32 0) }, align 8
@__omp_offloading_801_3022563__Z6maini1v_l17_exec_mode = weak constant i8 0
@llvm.compiler.used = appending global [1 x i8*] [i8* @__omp_offloading_801_3022563__Z6maini1v_l17_exec_mode], section "llvm.metadata"
@S = external local_unnamed_addr global i8*
; CHECK: remark: declare_target_codegen_globalization.cpp:17:1: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.
; CHECK: remark: declare_target_codegen_globalization.cpp:10:1: Found thread data sharing on the GPU. Expect degraded performance due to data globalization.
; Function Attrs: norecurse nounwind
define weak void @__omp_offloading_801_3022563__Z6maini1v_l17(i32* nonnull align 4 dereferenceable(4) %a) local_unnamed_addr #0 !dbg !10 {
define void @foo() {
entry:
%nvptx_num_threads = tail call i32 @llvm.nvvm.read.ptx.sreg.ntid.x(), !dbg !12, !range !13
tail call void @__kmpc_spmd_kernel_init(i32 %nvptx_num_threads, i16 1, i16 0) #4, !dbg !12
tail call void @__kmpc_data_sharing_init_stack_spmd() #4, !dbg !12
%0 = tail call i32 @__kmpc_global_thread_num(%struct.ident_t* nonnull @1)
%1 = tail call i8 @__kmpc_is_spmd_exec_mode() #4
%.not.i.i = icmp eq i8 %1, 0
br i1 %.not.i.i, label %.non-spmd2.i.i, label %__omp_outlined__.exit
.non-spmd2.i.i: ; preds = %entry
%2 = tail call i8* @__kmpc_data_sharing_coalesced_push_stack(i64 128, i16 0) #4, !dbg !12
tail call void @__kmpc_data_sharing_pop_stack(i8* %2) #4, !dbg !14
br label %__omp_outlined__.exit, !dbg !14
__omp_outlined__.exit: ; preds = %entry, %.non-spmd2.i.i
tail call void @__kmpc_spmd_kernel_deinit_v2(i16 1) #4, !dbg !19
ret void, !dbg !20
%0 = call i8* @__kmpc_alloc_shared(i64 4), !dbg !8
%x_on_stack = bitcast i8* %0 to i32*
%1 = bitcast i32* %x_on_stack to i8*
call void @share(i8* %1)
call void @__kmpc_free_shared(i8* %0)
ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1
declare void @__kmpc_spmd_kernel_init(i32, i16, i16) local_unnamed_addr
declare void @__kmpc_data_sharing_init_stack_spmd() local_unnamed_addr
; Function Attrs: norecurse nounwind readonly
define hidden i32 @_Z3fooRi(i32* nocapture nonnull readonly align 4 dereferenceable(4) %a) local_unnamed_addr #2 !dbg !21 {
define void @share(i8* %x) {
entry:
%0 = load i32, i32* %a, align 4, !dbg !22, !tbaa !23
ret i32 %0, !dbg !27
store i8* %x, i8** @S
ret void
}
; Function Attrs: nounwind
define hidden i32 @_Z3barv() local_unnamed_addr #3 !dbg !15 {
entry:
%a1 = alloca i32, align 4
%0 = tail call i8 @__kmpc_is_spmd_exec_mode() #4
%.not = icmp eq i8 %0, 0
br i1 %.not, label %.non-spmd, label %.exit
declare i8* @__kmpc_alloc_shared(i64)
.non-spmd: ; preds = %entry
%1 = tail call i8* @__kmpc_data_sharing_push_stack(i64 128, i16 0) #4, !dbg !31
%2 = bitcast i8* %1 to %struct._globalized_locals_ty*
br label %.exit
declare void @__kmpc_free_shared(i8*)
.exit: ; preds = %entry, %.non-spmd
%_select_stack = phi %struct._globalized_locals_ty* [ %2, %.non-spmd ], [ null, %entry ]
%nvptx_tid = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !28
%nvptx_lane_id = and i32 %nvptx_tid, 31
%3 = zext i32 %nvptx_lane_id to i64
%4 = getelementptr inbounds %struct._globalized_locals_ty, %struct._globalized_locals_ty* %_select_stack, i64 0, i32 0, i64 %3
%5 = select i1 %.not, i32* %4, i32* %a1
%6 = load i32, i32* %5, align 4, !dbg !29, !tbaa !23
br i1 %.not, label %.non-spmd2, label %.exit3, !dbg !31
.non-spmd2: ; preds = %.exit
%7 = bitcast %struct._globalized_locals_ty* %_select_stack to i8*, !dbg !31
tail call void @__kmpc_data_sharing_pop_stack(i8* %7) #4, !dbg !31
br label %.exit3, !dbg !31
.exit3: ; preds = %.non-spmd2, %.exit
ret i32 %6, !dbg !31
}
declare i8 @__kmpc_is_spmd_exec_mode() local_unnamed_addr
declare i8* @__kmpc_data_sharing_coalesced_push_stack(i64, i16) local_unnamed_addr
declare i8* @__kmpc_data_sharing_push_stack(i64, i16) local_unnamed_addr
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
declare void @__kmpc_data_sharing_pop_stack(i8*) local_unnamed_addr
; Function Attrs: nounwind
declare i32 @__kmpc_global_thread_num(%struct.ident_t*) local_unnamed_addr #4
declare void @__kmpc_spmd_kernel_deinit_v2(i16) local_unnamed_addr
attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_35" "target-features"="+ptx32,+sm_35" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #4 = { nounwind }
!llvm.dbg.cu = !{!0}
!omp_offload.info = !{!3}
!nvvm.annotations = !{!4}
!llvm.module.flags = !{!5, !6, !7, !8}
!llvm.ident = !{!9}
!llvm.module.flags = !{!3, !4}
!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 12.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: DebugDirectivesOnly, enums: !2, splitDebugInlining: false, nameTableKind: None)
!1 = !DIFile(filename: "declare_target_codegen_globalization.cpp", directory: "/home/jhuber/Documents/llvm-project/clang/test/OpenMP")
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
!1 = !DIFile(filename: "globalization_remarks.c", directory: "/tmp/globalization_remarks.c")
!2 = !{}
!3 = !{i32 0, i32 2049, i32 50472291, !"_Z6maini1v", i32 17, i32 0}
!4 = !{void (i32*)* @__omp_offloading_801_3022563__Z6maini1v_l17, !"kernel", i32 1}
!5 = !{i32 7, !"Dwarf Version", i32 2}
!6 = !{i32 2, !"Debug Info Version", i32 3}
!7 = !{i32 1, !"wchar_size", i32 4}
!8 = !{i32 7, !"PIC Level", i32 2}
!9 = !{!"clang version 12.0.0"}
!10 = distinct !DISubprogram(name: "__omp_offloading_801_3022563__Z6maini1v_l17", scope: !1, file: !1, line: 17, type: !11, scopeLine: 17, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!11 = !DISubroutineType(types: !2)
!12 = !DILocation(line: 17, column: 1, scope: !10)
!13 = !{i32 1, i32 1025}
!14 = !DILocation(line: 10, column: 1, scope: !15, inlinedAt: !16)
!15 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 7, type: !11, scopeLine: 7, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!16 = distinct !DILocation(line: 20, column: 18, scope: !17, inlinedAt: !18)
!17 = distinct !DISubprogram(name: "__omp_outlined__", scope: !1, file: !1, line: 17, type: !11, scopeLine: 17, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!18 = distinct !DILocation(line: 17, column: 1, scope: !10)
!19 = !DILocation(line: 17, column: 40, scope: !10)
!20 = !DILocation(line: 21, column: 3, scope: !10)
!21 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 5, type: !11, scopeLine: 5, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!22 = !DILocation(line: 5, column: 26, scope: !21)
!23 = !{!24, !24, i64 0}
!24 = !{!"int", !25, i64 0}
!25 = !{!"omnipotent char", !26, i64 0}
!26 = !{!"Simple C++ TBAA"}
!27 = !DILocation(line: 5, column: 19, scope: !21)
!28 = !{i32 0, i32 1024}
!29 = !DILocation(line: 5, column: 26, scope: !21, inlinedAt: !30)
!30 = distinct !DILocation(line: 9, column: 10, scope: !15)
!31 = !DILocation(line: 10, column: 1, scope: !15)
!3 = !{i32 2, !"Debug Info Version", i32 3}
!4 = !{i32 1, !"wchar_size", i32 4}
!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !7, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
!7 = !DISubroutineType(types: !2)
!8 = !DILocation(line: 5, column: 7, scope: !6)


@ -7,11 +7,11 @@ target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
define void @foo() {
entry:
%x = call i8* @__kmpc_data_sharing_push_stack(i64 4, i16 0), !dbg !7
%x = call i8* @__kmpc_alloc_shared(i64 4), !dbg !7
%x_on_stack = bitcast i8* %x to i32*
%0 = bitcast i32* %x_on_stack to i8*
call void @use(i8* %0)
call void @__kmpc_data_sharing_pop_stack(i8* %x)
call void @__kmpc_free_shared(i8* %x)
ret void
}
@ -22,7 +22,7 @@ entry:
ret void
}
define internal i8* @__kmpc_data_sharing_push_stack(i64 %DataSize, i16 %shared) {
define internal i8* @__kmpc_alloc_shared(i64 %DataSize) {
entry:
%call = call i8* @_Z10SafeMallocmPKc(i64 %DataSize, i8* getelementptr inbounds ([13 x i8], [13 x i8]* @.str, i64 0, i64 0)) #11
ret i8* %call
@ -31,7 +31,7 @@ entry:
; Function Attrs: convergent nounwind mustprogress
declare i8* @_Z10SafeMallocmPKc(i64 %size, i8* nocapture readnone %msg)
declare void @__kmpc_data_sharing_pop_stack(i8*)
declare void @__kmpc_free_shared(i8*)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!3, !4}
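
The test above keeps an internal definition of `__kmpc_alloc_shared` (wrapping `_Z10SafeMallocmPKc`) so the call resolves inside the module. A standalone malloc-backed stand-in of the same shape (a sketch for host-side experimentation only; the real device runtime serves these calls from global or shared memory):

declare i8* @malloc(i64)
declare void @free(i8*)

define i8* @__kmpc_alloc_shared(i64 %size) {
entry:
  ; Emulate the device-side allocation with a heap allocation.
  %p = call i8* @malloc(i64 %size)
  ret i8* %p
}

define void @__kmpc_free_shared(i8* %ptr) {
entry:
  call void @free(i8* %ptr)
  ret void
}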