[OPENMP50]Codegen for use_device_addr clauses.

Summary:
Added codegen for the use_device_addr clause. The components of the list
items are mapped as RETURN components, and the returned base address is
then used in place of the real address of the base declaration referenced
in the use_device_addr expressions.

Reviewers: jdoerfert

Subscribers: yaxunl, guansong, sstefan1, cfe-commits, caomhin

Tags: #clang

Differential Revision: https://reviews.llvm.org/D80730
This commit is contained in:
Alexey Bataev 2020-05-25 16:06:31 -04:00
parent 69bdfb075b
commit 90b54fa045
6 changed files with 433 additions and 54 deletions

View File

@ -1207,8 +1207,8 @@ OMPUseDevicePtrClause *OMPUseDevicePtrClause::Create(
Sizes.NumComponents = getComponentsTotalNumber(ComponentLists);
// We need to allocate:
// 3 x NumVars x Expr* - we have an original list expression for each clause
// list entry and an equal number of private copies and inits.
// NumVars x Expr* - we have an original list expression for each clause
// list entry.
// NumUniqueDeclarations x ValueDecl* - unique base declarations associated
// with each component list.
// (NumUniqueDeclarations + NumComponentLists) x unsigned - we specify the

View File

@ -7031,7 +7031,7 @@ public:
OMP_MAP_TARGET_PARAM = 0x20,
/// Signal that the runtime library has to return the device pointer
/// in the current position for the data being mapped. Used when we have the
/// use_device_ptr clause.
/// use_device_ptr or use_device_addr clause.
OMP_MAP_RETURN_PARAM = 0x40,
/// This flag signals that the reference being passed is a pointer to
/// private data.
@ -7099,26 +7099,30 @@ private:
ArrayRef<OpenMPMapModifierKind> MapModifiers;
bool ReturnDevicePointer = false;
bool IsImplicit = false;
bool ForDeviceAddr = false;
MapInfo() = default;
MapInfo(
OMPClauseMappableExprCommon::MappableExprComponentListRef Components,
OpenMPMapClauseKind MapType,
ArrayRef<OpenMPMapModifierKind> MapModifiers,
bool ReturnDevicePointer, bool IsImplicit)
ArrayRef<OpenMPMapModifierKind> MapModifiers, bool ReturnDevicePointer,
bool IsImplicit, bool ForDeviceAddr = false)
: Components(Components), MapType(MapType), MapModifiers(MapModifiers),
ReturnDevicePointer(ReturnDevicePointer), IsImplicit(IsImplicit) {}
ReturnDevicePointer(ReturnDevicePointer), IsImplicit(IsImplicit),
ForDeviceAddr(ForDeviceAddr) {}
};
/// If use_device_ptr is used on a pointer which is a struct member and there
/// is no map information about it, then emission of that entry is deferred
/// until the whole struct has been processed.
/// If use_device_ptr or use_device_addr is used on a decl which is a struct
/// member and there is no map information about it, then emission of that
/// entry is deferred until the whole struct has been processed.
struct DeferredDevicePtrEntryTy {
const Expr *IE = nullptr;
const ValueDecl *VD = nullptr;
bool ForDeviceAddr = false;
DeferredDevicePtrEntryTy(const Expr *IE, const ValueDecl *VD)
: IE(IE), VD(VD) {}
DeferredDevicePtrEntryTy(const Expr *IE, const ValueDecl *VD,
bool ForDeviceAddr)
: IE(IE), VD(VD), ForDeviceAddr(ForDeviceAddr) {}
};
/// The target directive from where the mappable clauses were extracted. It
@ -7306,13 +7310,12 @@ private:
/// \a IsFirstComponent should be set to true if the provided set of
/// components is the first associated with a capture.
void generateInfoForComponentList(
OpenMPMapClauseKind MapType,
ArrayRef<OpenMPMapModifierKind> MapModifiers,
OpenMPMapClauseKind MapType, ArrayRef<OpenMPMapModifierKind> MapModifiers,
OMPClauseMappableExprCommon::MappableExprComponentListRef Components,
MapBaseValuesArrayTy &BasePointers, MapValuesArrayTy &Pointers,
MapValuesArrayTy &Sizes, MapFlagsArrayTy &Types,
StructRangeInfoTy &PartialStruct, bool IsFirstComponentList,
bool IsImplicit,
bool IsImplicit, bool ForDeviceAddr = false,
ArrayRef<OMPClauseMappableExprCommon::MappableExprComponentListRef>
OverlappedElements = llvm::None) const {
// The following summarizes what has to be generated for each map and the
@ -7623,8 +7626,8 @@ private:
// If this component is a pointer inside the base struct then we don't
// need to create any entry for it - it will be combined with the object
// it is pointing to into a single PTR_AND_OBJ entry.
bool IsMemberPointer =
IsPointer && EncounteredME &&
bool IsMemberPointerOrAddr =
(IsPointer || ForDeviceAddr) && EncounteredME &&
(dyn_cast<MemberExpr>(I->getAssociatedExpression()) ==
EncounteredME);
if (!OverlappedElements.empty()) {
@ -7691,7 +7694,7 @@ private:
break;
}
llvm::Value *Size = getExprTypeSize(I->getAssociatedExpression());
if (!IsMemberPointer) {
if (!IsMemberPointerOrAddr) {
BasePointers.push_back(BP.getPointer());
Pointers.push_back(LB.getPointer());
Sizes.push_back(
@ -7952,17 +7955,18 @@ public:
// Helper function to fill the information map for the different supported
// clauses.
auto &&InfoGen = [&Info](
const ValueDecl *D,
OMPClauseMappableExprCommon::MappableExprComponentListRef L,
OpenMPMapClauseKind MapType,
ArrayRef<OpenMPMapModifierKind> MapModifiers,
bool ReturnDevicePointer, bool IsImplicit) {
const ValueDecl *VD =
D ? cast<ValueDecl>(D->getCanonicalDecl()) : nullptr;
Info[VD].emplace_back(L, MapType, MapModifiers, ReturnDevicePointer,
IsImplicit);
};
auto &&InfoGen =
[&Info](const ValueDecl *D,
OMPClauseMappableExprCommon::MappableExprComponentListRef L,
OpenMPMapClauseKind MapType,
ArrayRef<OpenMPMapModifierKind> MapModifiers,
bool ReturnDevicePointer, bool IsImplicit,
bool ForDeviceAddr = false) {
const ValueDecl *VD =
D ? cast<ValueDecl>(D->getCanonicalDecl()) : nullptr;
Info[VD].emplace_back(L, MapType, MapModifiers, ReturnDevicePointer,
IsImplicit, ForDeviceAddr);
};
assert(CurDir.is<const OMPExecutableDirective *>() &&
"Expect a executable directive");
@ -8032,7 +8036,7 @@ public:
// partial struct.
InfoGen(nullptr, L.second, OMPC_MAP_unknown, llvm::None,
/*ReturnDevicePointer=*/false, C->isImplicit());
DeferredInfo[nullptr].emplace_back(IE, VD);
DeferredInfo[nullptr].emplace_back(IE, VD, /*ForDeviceAddr=*/false);
} else {
llvm::Value *Ptr =
CGF.EmitLoadOfScalar(CGF.EmitLValue(IE), IE->getExprLoc());
@ -8044,6 +8048,70 @@ public:
}
}
// Look at the use_device_addr clause information and mark the existing map
// entries as such. If there is no map information for an entry in the
// use_device_addr list, we create one with map type 'alloc' and zero size
section. It is the user's fault if that was not mapped before. If there is
// no map information and the pointer is a struct member, then we defer the
// emission of that entry until the whole struct has been processed.
llvm::SmallDenseSet<CanonicalDeclPtr<const Decl>, 4> Processed;
for (const auto *C :
CurExecDir->getClausesOfKind<OMPUseDeviceAddrClause>()) {
for (const auto L : C->component_lists()) {
assert(!L.second.empty() && "Not expecting empty list of components!");
const ValueDecl *VD = L.second.back().getAssociatedDeclaration();
if (!Processed.insert(VD).second)
continue;
VD = cast<ValueDecl>(VD->getCanonicalDecl());
const Expr *IE = L.second.back().getAssociatedExpression();
// If the first component is a member expression, we have to look into
// 'this', which maps to null in the map of map information. Otherwise
// look directly for the information.
auto It = Info.find(isa<MemberExpr>(IE) ? nullptr : VD);
// We potentially have map information for this declaration already.
// Look for the first set of components that refer to it.
if (It != Info.end()) {
auto *CI = llvm::find_if(It->second, [VD](const MapInfo &MI) {
return MI.Components.back().getAssociatedDeclaration() == VD;
});
// If we found a map entry, signal that the pointer has to be returned
// and move on to the next declaration.
if (CI != It->second.end()) {
CI->ReturnDevicePointer = true;
continue;
}
}
// We didn't find any match in our map information - generate a zero
// size array section - if the pointer is a struct member we defer this
// action until the whole struct has been processed.
if (isa<MemberExpr>(IE)) {
// Insert the pointer into Info to be processed by
// generateInfoForComponentList. Because it is a member pointer
// without a pointee, no entry will be generated for it, therefore
// we need to generate one after the whole struct has been processed.
// Nonetheless, generateInfoForComponentList must be called to take
// the pointer into account for the calculation of the range of the
// partial struct.
InfoGen(nullptr, L.second, OMPC_MAP_unknown, llvm::None,
/*ReturnDevicePointer=*/false, C->isImplicit(),
/*ForDeviceAddr=*/true);
DeferredInfo[nullptr].emplace_back(IE, VD, /*ForDeviceAddr=*/true);
} else {
llvm::Value *Ptr;
if (IE->isGLValue())
Ptr = CGF.EmitLValue(IE).getPointer(CGF);
else
Ptr = CGF.EmitScalarExpr(IE);
BasePointers.emplace_back(Ptr, VD);
Pointers.push_back(Ptr);
Sizes.push_back(llvm::Constant::getNullValue(CGF.Int64Ty));
Types.push_back(OMP_MAP_RETURN_PARAM | OMP_MAP_TARGET_PARAM);
}
}
}
for (const auto &M : Info) {
// We need to know when we generate information for the first component
// associated with a capture, because the mapping flags depend on it.
@ -8062,10 +8130,10 @@ public:
// Remember the current base pointer index.
unsigned CurrentBasePointersIdx = CurBasePointers.size();
generateInfoForComponentList(L.MapType, L.MapModifiers, L.Components,
CurBasePointers, CurPointers, CurSizes,
CurTypes, PartialStruct,
IsFirstComponentList, L.IsImplicit);
generateInfoForComponentList(
L.MapType, L.MapModifiers, L.Components, CurBasePointers,
CurPointers, CurSizes, CurTypes, PartialStruct,
IsFirstComponentList, L.IsImplicit, L.ForDeviceAddr);
// If this entry relates with a device pointer, set the relevant
// declaration and add the 'return pointer' flag.
@ -8085,21 +8153,35 @@ public:
}
// Append any pending zero-length pointers which are struct members and
// used with use_device_ptr.
// used with use_device_ptr or use_device_addr.
auto CI = DeferredInfo.find(M.first);
if (CI != DeferredInfo.end()) {
for (const DeferredDevicePtrEntryTy &L : CI->second) {
llvm::Value *BasePtr = this->CGF.EmitLValue(L.IE).getPointer(CGF);
llvm::Value *Ptr = this->CGF.EmitLoadOfScalar(
this->CGF.EmitLValue(L.IE), L.IE->getExprLoc());
llvm::Value *BasePtr;
llvm::Value *Ptr;
if (L.ForDeviceAddr) {
if (L.IE->isGLValue())
Ptr = this->CGF.EmitLValue(L.IE).getPointer(CGF);
else
Ptr = this->CGF.EmitScalarExpr(L.IE);
BasePtr = Ptr;
// Entry is RETURN_PARAM. Also, set the placeholder value
// MEMBER_OF=FFFF so that the entry is later updated with the
// correct value of MEMBER_OF.
CurTypes.push_back(OMP_MAP_RETURN_PARAM | OMP_MAP_MEMBER_OF);
} else {
BasePtr = this->CGF.EmitLValue(L.IE).getPointer(CGF);
Ptr = this->CGF.EmitLoadOfScalar(this->CGF.EmitLValue(L.IE),
L.IE->getExprLoc());
// Entry is PTR_AND_OBJ and RETURN_PARAM. Also, set the placeholder
// value MEMBER_OF=FFFF so that the entry is later updated with the
// correct value of MEMBER_OF.
CurTypes.push_back(OMP_MAP_PTR_AND_OBJ | OMP_MAP_RETURN_PARAM |
OMP_MAP_MEMBER_OF);
}
CurBasePointers.emplace_back(BasePtr, L.VD);
CurPointers.push_back(Ptr);
CurSizes.push_back(llvm::Constant::getNullValue(this->CGF.Int64Ty));
// Entry is PTR_AND_OBJ and RETURN_PARAM. Also, set the placeholder
// value MEMBER_OF=FFFF so that the entry is later updated with the
// correct value of MEMBER_OF.
CurTypes.push_back(OMP_MAP_PTR_AND_OBJ | OMP_MAP_RETURN_PARAM |
OMP_MAP_MEMBER_OF);
}
}
@ -8168,10 +8250,10 @@ public:
for (const MapInfo &L : M.second) {
assert(!L.Components.empty() &&
"Not expecting declaration with no component lists.");
generateInfoForComponentList(L.MapType, L.MapModifiers, L.Components,
CurBasePointers, CurPointers, CurSizes,
CurTypes, PartialStruct,
IsFirstComponentList, L.IsImplicit);
generateInfoForComponentList(
L.MapType, L.MapModifiers, L.Components, CurBasePointers,
CurPointers, CurSizes, CurTypes, PartialStruct,
IsFirstComponentList, L.IsImplicit, L.ForDeviceAddr);
IsFirstComponentList = false;
}
@ -8437,10 +8519,10 @@ public:
ArrayRef<OMPClauseMappableExprCommon::MappableExprComponentListRef>
OverlappedComponents = Pair.getSecond();
bool IsFirstComponentList = true;
generateInfoForComponentList(MapType, MapModifiers, Components,
BasePointers, Pointers, Sizes, Types,
PartialStruct, IsFirstComponentList,
IsImplicit, OverlappedComponents);
generateInfoForComponentList(
MapType, MapModifiers, Components, BasePointers, Pointers, Sizes,
Types, PartialStruct, IsFirstComponentList, IsImplicit,
/*ForDeviceAddr=*/false, OverlappedComponents);
}
// Go through other elements without overlapped elements.
bool IsFirstComponentList = OverlappedData.empty();

View File

@ -31,6 +31,8 @@ using namespace clang;
using namespace CodeGen;
using namespace llvm::omp;
static const VarDecl *getBaseDecl(const Expr *Ref);
namespace {
/// Lexical scope for OpenMP executable constructs, that handles correct codegen
/// for captured expressions.
@ -220,6 +222,12 @@ public:
if (const auto *OED = dyn_cast<OMPCapturedExprDecl>(D))
CGF.EmitVarDecl(*OED);
}
} else if (const auto *UDP = dyn_cast<OMPUseDeviceAddrClause>(C)) {
for (const Expr *E : UDP->varlists()) {
const Decl *D = getBaseDecl(E);
if (const auto *OED = dyn_cast<OMPCapturedExprDecl>(D))
CGF.EmitVarDecl(*OED);
}
}
}
if (!isOpenMPSimdDirective(S.getDirectiveKind()))
@ -5804,9 +5812,8 @@ CodeGenFunction::getOMPCancelDestination(OpenMPDirectiveKind Kind) {
}
void CodeGenFunction::EmitOMPUseDevicePtrClause(
const OMPClause &NC, OMPPrivateScope &PrivateScope,
const OMPUseDevicePtrClause &C, OMPPrivateScope &PrivateScope,
const llvm::DenseMap<const ValueDecl *, Address> &CaptureDeviceAddrMap) {
const auto &C = cast<OMPUseDevicePtrClause>(NC);
auto OrigVarIt = C.varlist_begin();
auto InitIt = C.inits().begin();
for (const Expr *PvtVarIt : C.private_copies()) {
@ -5867,6 +5874,60 @@ void CodeGenFunction::EmitOMPUseDevicePtrClause(
}
}
/// Strip parens/implicit casts and any OpenMP array-section or array
/// subscript wrappers from \p Ref to reach the underlying variable
/// reference, and return its declaration.
static const VarDecl *getBaseDecl(const Expr *Ref) {
  const Expr *E = Ref->IgnoreParenImpCasts();
  // Peel off array sections first, then array subscripts, re-stripping
  // parens/implicit casts after each step.
  while (const auto *Section = dyn_cast<OMPArraySectionExpr>(E))
    E = Section->getBase()->IgnoreParenImpCasts();
  while (const auto *Subscript = dyn_cast<ArraySubscriptExpr>(E))
    E = Subscript->getBase()->IgnoreParenImpCasts();
  // What remains must be a direct reference to a variable.
  return cast<VarDecl>(cast<DeclRefExpr>(E)->getDecl());
}
/// Privatize the base declarations of a use_device_addr clause: each list
/// item's base variable is re-bound in \p PrivateScope to the device
/// address the runtime returned for it (looked up in
/// \p CaptureDeviceAddrMap). List items whose declaration has no entry in
/// the map are skipped.
void CodeGenFunction::EmitOMPUseDeviceAddrClause(
const OMPUseDeviceAddrClause &C, OMPPrivateScope &PrivateScope,
const llvm::DenseMap<const ValueDecl *, Address> &CaptureDeviceAddrMap) {
// Privatize each base declaration at most once, even if it appears in
// several list items of the clause.
llvm::SmallDenseSet<CanonicalDeclPtr<const Decl>, 4> Processed;
for (const Expr *Ref : C.varlists()) {
const VarDecl *OrigVD = getBaseDecl(Ref);
if (!Processed.insert(OrigVD).second)
continue;
// In order to identify the right initializer we need to match the
// declaration used by the mapping logic. In some cases we may get
// OMPCapturedExprDecl that refers to the original declaration.
const ValueDecl *MatchingVD = OrigVD;
if (const auto *OED = dyn_cast<OMPCapturedExprDecl>(MatchingVD)) {
// OMPCapturedExprDecl are used to privatize fields of the current
// structure.
const auto *ME = cast<MemberExpr>(OED->getInit());
assert(isa<CXXThisExpr>(ME->getBase()) &&
"Base should be the current struct!");
MatchingVD = ME->getMemberDecl();
}
// If we don't have information about the current list item, move on to
// the next one.
auto InitAddrIt = CaptureDeviceAddrMap.find(MatchingVD);
if (InitAddrIt == CaptureDeviceAddrMap.end())
continue;
Address PrivAddr = InitAddrIt->getSecond();
// For declrefs and variable length array need to load the pointer for
// correct mapping, since the pointer to the data was passed to the runtime.
if (isa<DeclRefExpr>(Ref->IgnoreParenImpCasts()) ||
MatchingVD->getType()->isArrayType())
PrivAddr =
EmitLoadOfPointer(PrivAddr, getContext()
.getPointerType(OrigVD->getType())
->castAs<PointerType>());
// Cast the device address back to a pointer to the variable's original
// (non-reference) type so subsequent uses type-check.
llvm::Type *RealTy =
ConvertTypeForMem(OrigVD->getType().getNonReferenceType())
->getPointerTo();
PrivAddr = Builder.CreatePointerBitCastOrAddrSpaceCast(PrivAddr, RealTy);
// Register the privatized address; the lambda is evaluated when the
// private scope is applied.
(void)PrivateScope.addPrivate(OrigVD, [PrivAddr]() { return PrivAddr; });
}
}
// Generate the instructions for '#pragma omp target data' directive.
void CodeGenFunction::EmitOMPTargetDataDirective(
const OMPTargetDataDirective &S) {
@ -5911,6 +5972,9 @@ void CodeGenFunction::EmitOMPTargetDataDirective(
for (const auto *C : S.getClausesOfKind<OMPUseDevicePtrClause>())
CGF.EmitOMPUseDevicePtrClause(*C, PrivateScope,
Info.CaptureDeviceAddrMap);
for (const auto *C : S.getClausesOfKind<OMPUseDeviceAddrClause>())
CGF.EmitOMPUseDeviceAddrClause(*C, PrivateScope,
Info.CaptureDeviceAddrMap);
(void)PrivateScope.Privatize();
RCG(CGF);
} else {

View File

@ -76,6 +76,8 @@ class ObjCAtTryStmt;
class ObjCAtThrowStmt;
class ObjCAtSynchronizedStmt;
class ObjCAutoreleasePoolStmt;
class OMPUseDevicePtrClause;
class OMPUseDeviceAddrClause;
class ReturnsNonNullAttr;
class SVETypeFlags;
@ -3173,7 +3175,10 @@ public:
void EmitOMPPrivateClause(const OMPExecutableDirective &D,
OMPPrivateScope &PrivateScope);
void EmitOMPUseDevicePtrClause(
const OMPClause &C, OMPPrivateScope &PrivateScope,
const OMPUseDevicePtrClause &C, OMPPrivateScope &PrivateScope,
const llvm::DenseMap<const ValueDecl *, Address> &CaptureDeviceAddrMap);
void EmitOMPUseDeviceAddrClause(
const OMPUseDeviceAddrClause &C, OMPPrivateScope &PrivateScope,
const llvm::DenseMap<const ValueDecl *, Address> &CaptureDeviceAddrMap);
/// Emit code for copyin clause in \a D directive. The next code is
/// generated at the start of outlined functions for directives:

View File

@ -18513,8 +18513,12 @@ OMPClause *Sema::ActOnOpenMPUseDeviceAddrClause(ArrayRef<Expr *> VarList,
// only need a component.
MVLI.VarBaseDeclarations.push_back(D);
MVLI.VarComponents.emplace_back();
Expr *Component = SimpleRefExpr;
if (VD && (isa<OMPArraySectionExpr>(RefExpr->IgnoreParenImpCasts()) ||
isa<ArraySubscriptExpr>(RefExpr->IgnoreParenImpCasts())))
Component = DefaultFunctionArrayLvalueConversion(SimpleRefExpr).get();
MVLI.VarComponents.back().push_back(
OMPClauseMappableExprCommon::MappableComponent(SimpleRefExpr, D));
OMPClauseMappableExprCommon::MappableComponent(Component, D));
}
if (MVLI.ProcessedVarList.empty())

View File

@ -0,0 +1,224 @@
// RUN: %clang_cc1 -DCK1 -verify -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s
// RUN: %clang_cc1 -DCK1 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
// RUN: %clang_cc1 -fopenmp -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
// RUN: %clang_cc1 -DCK1 -verify -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck --check-prefix SIMD-ONLY0 %s
// RUN: %clang_cc1 -DCK1 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
// RUN: %clang_cc1 -fopenmp-simd -fopenmp-version=50 -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck --check-prefix SIMD-ONLY0 %s
// SIMD-ONLY0-NOT: {{__kmpc|__tgt}}
// expected-no-diagnostics
#ifndef HEADER
#define HEADER
// CHECK-DAG: [[SIZES1:@.+]] = private unnamed_addr constant [5 x i64] zeroinitializer
// 96 = 0x60 = OMP_MAP_TARGET_PARAM | OMP_MAP_RETURN_PARAM
// CHECK-DAG: [[MAPTYPES1:@.+]] = private unnamed_addr constant [5 x i64] [i64 96, i64 96, i64 96, i64 96, i64 96]
// 32 = 0x20 = OMP_MAP_TARGET_PARAM
// 281474976710720 = 0x1000000000040 = OMP_MAP_MEMBER_OF | OMP_MAP_RETURN_PARAM
// CHECK-DAG: [[MAPTYPES2:@.+]] = private unnamed_addr constant [5 x i64] [i64 32, i64 281474976710720, i64 281474976710720, i64 281474976710720, i64 281474976710720]
// Test fixture: a struct whose members of every interesting kind (scalar,
// pointer with section/subscript, reference, fixed-size array) are named in
// a use_device_addr clause from a member function, so the mapping is rooted
// at 'this'. NOTE: member order and statement order are pinned by the CHECK
// lines below — do not reorder.
struct S {
int a = 0;
int *ptr = &a;
int &ref = a;
int arr[4];
S() {}
void foo() {
#pragma omp target data use_device_addr(a, ptr [3:4], ref, ptr[0], arr[:a])
++a, ++*ptr, ++ref, ++arr[0];
}
};
// Exercises use_device_addr on local variables of every interesting kind:
// scalar, pointer (both section and subscript forms), reference, fixed-size
// array, and a VLA. NOTE: declaration order and statement order are pinned
// by the CHECK lines below — do not reorder.
int main() {
float a = 0;
float *ptr = &a;
float &ref = a;
float arr[4];
float vla[(int)a];
S s;
s.foo();
#pragma omp target data use_device_addr(a, ptr [3:4], ref, ptr[0], arr[:(int)a], vla[0])
++a, ++*ptr, ++ref, ++arr[0], ++vla[0];
return a;
}
// CHECK-LABEL: @main()
// CHECK: [[A_ADDR:%.+]] = alloca float,
// CHECK: [[PTR_ADDR:%.+]] = alloca float*,
// CHECK: [[REF_ADDR:%.+]] = alloca float*,
// CHECK: [[ARR_ADDR:%.+]] = alloca [4 x float],
// CHECK: [[BPTRS:%.+]] = alloca [5 x i8*],
// CHECK: [[PTRS:%.+]] = alloca [5 x i8*],
// CHECK: [[VLA_ADDR:%.+]] = alloca float, i64 %{{.+}},
// CHECK: [[PTR:%.+]] = load float*, float** [[PTR_ADDR]],
// CHECK: [[REF:%.+]] = load float*, float** [[REF_ADDR]],
// CHECK: [[ARR:%.+]] = getelementptr inbounds [4 x float], [4 x float]* [[ARR_ADDR]], i64 0, i64 0
// CHECK: [[BPTR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BPTRS]], i32 0, i32 0
// CHECK: [[BPTR0_A_ADDR:%.+]] = bitcast i8** [[BPTR0]] to float**
// CHECK: store float* [[A_ADDR]], float** [[BPTR0_A_ADDR]],
// CHECK: [[PTR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS]], i32 0, i32 0
// CHECK: [[PTR0_A_ADDR:%.+]] = bitcast i8** [[PTR0]] to float**
// CHECK: store float* [[A_ADDR]], float** [[PTR0_A_ADDR]],
// CHECK: [[BPTR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BPTRS]], i32 0, i32 1
// CHECK: [[BPTR1_PTR_ADDR:%.+]] = bitcast i8** [[BPTR1]] to float**
// CHECK: store float* [[PTR]], float** [[BPTR1_PTR_ADDR]],
// CHECK: [[PTR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS]], i32 0, i32 1
// CHECK: [[PTR1_PTR_ADDR:%.+]] = bitcast i8** [[PTR1]] to float**
// CHECK: store float* [[PTR]], float** [[PTR1_PTR_ADDR]],
// CHECK: [[BPTR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BPTRS]], i32 0, i32 2
// CHECK: [[BPTR2_REF_ADDR:%.+]] = bitcast i8** [[BPTR2]] to float**
// CHECK: store float* [[REF]], float** [[BPTR2_REF_ADDR]],
// CHECK: [[PTR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS]], i32 0, i32 2
// CHECK: [[PTR2_REF_ADDR:%.+]] = bitcast i8** [[PTR2]] to float**
// CHECK: store float* [[REF]], float** [[PTR2_REF_ADDR]],
// CHECK: [[BPTR3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BPTRS]], i32 0, i32 3
// CHECK: [[BPTR3_ARR_ADDR:%.+]] = bitcast i8** [[BPTR3]] to float**
// CHECK: store float* [[ARR]], float** [[BPTR3_ARR_ADDR]],
// CHECK: [[PTR3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS]], i32 0, i32 3
// CHECK: [[PTR3_ARR_ADDR:%.+]] = bitcast i8** [[PTR3]] to float**
// CHECK: store float* [[ARR]], float** [[PTR3_ARR_ADDR]],
// CHECK: [[BPTR4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BPTRS]], i32 0, i32 4
// CHECK: [[BPTR4_VLA_ADDR:%.+]] = bitcast i8** [[BPTR4]] to float**
// CHECK: store float* [[VLA_ADDR]], float** [[BPTR4_VLA_ADDR]],
// CHECK: [[PTR4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS]], i32 0, i32 4
// CHECK: [[PTR4_VLA_ADDR:%.+]] = bitcast i8** [[PTR4]] to float**
// CHECK: store float* [[VLA_ADDR]], float** [[PTR4_VLA_ADDR]],
// CHECK: [[BPTR:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BPTRS]], i32 0, i32 0
// CHECK: [[PTR:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS]], i32 0, i32 0
// CHECK: call void @__tgt_target_data_begin(i64 -1, i32 5, i8** [[BPTR]], i8** [[PTR]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[SIZES1]], i32 0, i32 0), i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[MAPTYPES1]], i32 0, i32 0))
// CHECK: [[A_REF:%.+]] = load float*, float** [[BPTR0_A_ADDR]],
// CHECK: [[REF_REF:%.+]] = load float*, float** [[BPTR2_REF_ADDR]],
// CHECK: store float* [[REF_REF]], float** [[TMP_REF_ADDR:%.+]],
// CHECK: [[ARR:%.+]] = load float*, float** [[BPTR3_ARR_ADDR]],
// CHECK: [[ARR_REF:%.+]] = bitcast float* [[ARR]] to [4 x float]*
// CHECK: [[VLA_REF:%.+]] = load float*, float** [[BPTR4_VLA_ADDR]],
// CHECK: [[A:%.+]] = load float, float* [[A_REF]],
// CHECK: [[INC:%.+]] = fadd float [[A]], 1.000000e+00
// CHECK: store float [[INC]], float* [[A_REF]],
// CHECK: [[PTR_ADDR:%.+]] = load float*, float** [[BPTR1_PTR_ADDR]],
// CHECK: [[VAL:%.+]] = load float, float* [[PTR_ADDR]],
// CHECK: [[INC:%.+]] = fadd float [[VAL]], 1.000000e+00
// CHECK: store float [[INC]], float* [[PTR_ADDR]],
// CHECK: [[REF_ADDR:%.+]] = load float*, float** [[TMP_REF_ADDR]],
// CHECK: [[REF:%.+]] = load float, float* [[REF_ADDR]],
// CHECK: [[INC:%.+]] = fadd float [[REF]], 1.000000e+00
// CHECK: store float [[INC]], float* [[REF_ADDR]],
// CHECK: [[ARR0_ADDR:%.+]] = getelementptr inbounds [4 x float], [4 x float]* [[ARR_REF]], i64 0, i64 0
// CHECK: [[ARR0:%.+]] = load float, float* [[ARR0_ADDR]],
// CHECK: [[INC:%.+]] = fadd float [[ARR0]], 1.000000e+00
// CHECK: store float [[INC]], float* [[ARR0_ADDR]],
// CHECK: [[VLA0_ADDR:%.+]] = getelementptr inbounds float, float* [[VLA_REF]], i64 0
// CHECK: [[VLA0:%.+]] = load float, float* [[VLA0_ADDR]],
// CHECK: [[INC:%.+]] = fadd float [[VLA0]], 1.000000e+00
// CHECK: store float [[INC]], float* [[VLA0_ADDR]],
// CHECK: [[BPTR:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BPTRS]], i32 0, i32 0
// CHECK: [[PTR:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS]], i32 0, i32 0
// CHECK: call void @__tgt_target_data_end(i64 -1, i32 5, i8** [[BPTR]], i8** [[PTR]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[SIZES1]], i32 0, i32 0), i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[MAPTYPES1]], i32 0, i32 0))
// CHECK: foo
// %this.addr = alloca %struct.S*, align 8
// CHECK: [[BPTRS:%.+]] = alloca [5 x i8*],
// CHECK: [[PTRS:%.+]] = alloca [5 x i8*],
// CHECK: [[SIZES:%.+]] = alloca [5 x i64],
// %tmp = alloca i32*, align 8
// %tmp6 = alloca i32**, align 8
// %tmp7 = alloca i32*, align 8
// %tmp8 = alloca i32**, align 8
// %tmp9 = alloca [4 x i32]*, align 8
// store %struct.S* %this, %struct.S** %this.addr, align 8
// %this1 = load %struct.S*, %struct.S** %this.addr, align 8
// CHECK: [[A_ADDR:%.+]] = getelementptr inbounds %struct.S, %struct.S* [[THIS:%.+]], i32 0, i32 0
// %ptr = getelementptr inbounds %struct.S, %struct.S* %this1, i32 0, i32 1
// %ref = getelementptr inbounds %struct.S, %struct.S* %this1, i32 0, i32 2
// %0 = load i32*, i32** %ref, align 8
// CHECK: [[ARR_ADDR:%.+]] = getelementptr inbounds %struct.S, %struct.S* [[THIS]], i32 0, i32 3
// CHECK: [[A_ADDR2:%.+]] = getelementptr inbounds %struct.S, %struct.S* [[THIS]], i32 0, i32 0
// CHECK: [[PTR_ADDR:%.+]] = getelementptr inbounds %struct.S, %struct.S* [[THIS]], i32 0, i32 1
// CHECK: [[REF_REF:%.+]] = getelementptr inbounds %struct.S, %struct.S* [[THIS]], i32 0, i32 2
// CHECK: [[REF_PTR:%.+]] = load i32*, i32** [[REF_REF]],
// CHECK: [[ARR_ADDR2:%.+]] = getelementptr inbounds %struct.S, %struct.S* [[THIS]], i32 0, i32 3
// CHECK: [[ARR_END:%.+]] = getelementptr [4 x i32], [4 x i32]* [[ARR_ADDR]], i32 1
// CHECK: [[BEGIN:%.+]] = bitcast i32* [[A_ADDR]] to i8*
// CHECK: [[END:%.+]] = bitcast [4 x i32]* [[ARR_END]] to i8*
// CHECK: [[E:%.+]] = ptrtoint i8* [[END]] to i64
// CHECK: [[B:%.+]] = ptrtoint i8* [[BEGIN]] to i64
// CHECK: [[DIFF:%.+]] = sub i64 [[E]], [[B]]
// CHECK: [[SZ:%.+]] = sdiv exact i64 [[DIFF]], ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64)
// CHECK: [[BPTR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BPTRS]], i32 0, i32 0
// CHECK: [[BPTR0_S:%.+]] = bitcast i8** [[BPTR0]] to %struct.S**
// CHECK: store %struct.S* [[THIS]], %struct.S** [[BPTR0_S]],
// CHECK: [[PTR0:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS]], i32 0, i32 0
// CHECK: [[PTR0_BEGIN:%.+]] = bitcast i8** [[PTR0]] to i32**
// CHECK: store i32* [[A_ADDR]], i32** [[PTR0_BEGIN]],
// CHECK: [[SIZE0:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[SIZES]], i32 0, i32 0
// CHECK: store i64 [[SZ]], i64* [[SIZE0]],
// CHECK: [[BPTR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BPTRS]], i32 0, i32 1
// CHECK: [[BPTR1_A_ADDR:%.+]] = bitcast i8** [[BPTR1]] to i32**
// CHECK: store i32* [[A_ADDR2]], i32** [[BPTR1_A_ADDR]],
// CHECK: [[PTR1:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS]], i32 0, i32 1
// CHECK: [[PTR1_A_ADDR:%.+]] = bitcast i8** [[PTR1]] to i32**
// CHECK: store i32* [[A_ADDR2]], i32** [[PTR1_A_ADDR]],
// CHECK: [[SIZE1:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[SIZES]], i32 0, i32 1
// CHECK: store i64 0, i64* [[SIZE1]],
// CHECK: [[BPTR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BPTRS]], i32 0, i32 2
// CHECK: [[BPTR2_PTR_ADDR:%.+]] = bitcast i8** [[BPTR2]] to i32***
// CHECK: store i32** [[PTR_ADDR]], i32*** [[BPTR2_PTR_ADDR]],
// CHECK: [[PTR2:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS]], i32 0, i32 2
// CHECK: [[PTR2_PTR_ADDR:%.+]] = bitcast i8** [[PTR2]] to i32***
// CHECK: store i32** [[PTR_ADDR]], i32*** [[PTR2_PTR_ADDR]],
// CHECK: [[SIZE2:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[SIZES]], i32 0, i32 2
// CHECK: store i64 0, i64* [[SIZE2]],
// CHECK: [[BPTR3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BPTRS]], i32 0, i32 3
// CHECK: [[BPTR3_REF_PTR:%.+]] = bitcast i8** [[BPTR3]] to i32**
// CHECK: store i32* [[REF_PTR]], i32** [[BPTR3_REF_PTR]],
// CHECK: [[PTR3:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS]], i32 0, i32 3
// CHECK: [[PTR3_REF_PTR:%.+]] = bitcast i8** [[PTR3]] to i32**
// CHECK: store i32* [[REF_PTR]], i32** [[PTR3_REF_PTR]],
// CHECK: [[SIZE3:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[SIZES]], i32 0, i32 3
// CHECK: store i64 0, i64* [[SIZE3]],
// CHECK: [[BPTR4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BPTRS]], i32 0, i32 4
// CHECK: [[BPTR4_ARR_ADDR:%.+]] = bitcast i8** [[BPTR4]] to [4 x i32]**
// CHECK: store [4 x i32]* [[ARR_ADDR2]], [4 x i32]** [[BPTR4_ARR_ADDR]],
// CHECK: [[PTR4:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS]], i32 0, i32 4
// CHECK: [[PTR4_ARR_ADDR:%.+]] = bitcast i8** [[PTR4]] to [4 x i32]**
// CHECK: store [4 x i32]* [[ARR_ADDR2]], [4 x i32]** [[PTR4_ARR_ADDR]],
// CHECK: [[SIZE4:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[SIZES]], i32 0, i32 4
// CHECK: store i64 0, i64* [[SIZE4]],
// CHECK: [[BPTR:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BPTRS]], i32 0, i32 0
// CHECK: [[PTR:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS]], i32 0, i32 0
// CHECK: [[SIZE:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[SIZES]], i32 0, i32 0
// CHECK: call void @__tgt_target_data_begin(i64 -1, i32 5, i8** [[BPTR]], i8** [[PTR]], i64* [[SIZE]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[MAPTYPES2]], i32 0, i32 0))
// CHECK: [[A_ADDR:%.+]] = load i32*, i32** [[BPTR1_A_ADDR]],
// CHECK: store i32* [[A_ADDR]], i32** [[A_REF:%.+]],
// CHECK: [[PTR_ADDR:%.+]] = load i32**, i32*** [[BPTR2_PTR_ADDR]],
// CHECK: store i32** [[PTR_ADDR]], i32*** [[PTR_REF:%.+]],
// CHECK: [[REF_PTR:%.+]] = load i32*, i32** [[BPTR3_REF_PTR]],
// CHECK: store i32* [[REF_PTR]], i32** [[REF_REF:%.+]],
// CHECK: [[PTR_ADDR:%.+]] = load i32**, i32*** [[BPTR2_PTR_ADDR]],
// CHECK: store i32** [[PTR_ADDR]], i32*** [[PTR_REF2:%.+]],
// CHECK: [[ARR_ADDR:%.+]] = load [4 x i32]*, [4 x i32]** [[BPTR4_ARR_ADDR]],
// CHECK: store [4 x i32]* [[ARR_ADDR]], [4 x i32]** [[ARR_REF:%.+]],
// CHECK: [[A_ADDR:%.+]] = load i32*, i32** [[A_REF]],
// CHECK: [[A:%.+]] = load i32, i32* [[A_ADDR]],
// CHECK: [[INC:%.+]] = add nsw i32 [[A]], 1
// CHECK: store i32 [[INC]], i32* [[A_ADDR]],
// CHECK: [[PTR_PTR:%.+]] = load i32**, i32*** [[PTR_REF2]],
// CHECK: [[PTR:%.+]] = load i32*, i32** [[PTR_PTR]],
// CHECK: [[VAL:%.+]] = load i32, i32* [[PTR]],
// CHECK: [[INC:%.+]] = add nsw i32 [[VAL]], 1
// CHECK: store i32 [[INC]], i32* [[PTR]],
// CHECK: [[REF_PTR:%.+]] = load i32*, i32** [[REF_REF]],
// CHECK: [[VAL:%.+]] = load i32, i32* [[REF_PTR]],
// CHECK: [[INC:%.+]] = add nsw i32 [[VAL]], 1
// CHECK: store i32 [[INC]], i32* [[REF_PTR]],
// CHECK: [[ARR_ADDR:%.+]] = load [4 x i32]*, [4 x i32]** [[ARR_REF]],
// CHECK: [[ARR0_ADDR:%.+]] = getelementptr inbounds [4 x i32], [4 x i32]* [[ARR_ADDR]], i64 0, i64 0
// CHECK: [[VAL:%.+]] = load i32, i32* [[ARR0_ADDR]],
// CHECK: [[INC:%.+]] = add nsw i32 [[VAL]], 1
// CHECK: store i32 [[INC]], i32* [[ARR0_ADDR]],
// CHECK: [[BPTR:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[BPTRS]], i32 0, i32 0
// CHECK: [[PTR:%.+]] = getelementptr inbounds [5 x i8*], [5 x i8*]* [[PTRS]], i32 0, i32 0
// CHECK: [[SIZE:%.+]] = getelementptr inbounds [5 x i64], [5 x i64]* [[SIZES]], i32 0, i32 0
// CHECK: call void @__tgt_target_data_end(i64 -1, i32 5, i8** [[BPTR]], i8** [[PTR]], i64* [[SIZE]], i64* getelementptr inbounds ([5 x i64], [5 x i64]* [[MAPTYPES2]], i32 0, i32 0))
#endif