forked from OSchip/llvm-project
[OpenMP][libomptarget] Add support for unified memory for regular maps
Summary: This patch adds support for using unified memory in the case of regular maps that happen when a target region is offloaded to the device. For cases where only a single version of the data is required then the host address can be used. When variables need to be privatized in any way or globalized, then the copy to the device is still required for correctness. Reviewers: ABataev, jdoerfert, Hahnfeld, AlexEichenberger, caomhin, grokos Reviewed By: Hahnfeld Subscribers: mgorny, guansong, openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D65001 llvm-svn: 368192
This commit is contained in:
parent
d8c3c17394
commit
a1d20506e7
|
@ -113,7 +113,15 @@ EXTERN int omp_target_is_present(void *ptr, int device_num) {
|
|||
|
||||
DeviceTy& Device = Devices[device_num];
|
||||
bool IsLast; // not used
|
||||
int rc = (Device.getTgtPtrBegin(ptr, 0, IsLast, false) != NULL);
|
||||
bool IsHostPtr;
|
||||
void *TgtPtr = Device.getTgtPtrBegin(ptr, 0, IsLast, false, IsHostPtr);
|
||||
int rc = (TgtPtr != NULL);
|
||||
// Under unified memory the host pointer can be returned by the
|
||||
// getTgtPtrBegin() function which means that there is no device
|
||||
// corresponding point for ptr. This function should return false
|
||||
// in that situation.
|
||||
if (Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)
|
||||
rc = !IsHostPtr;
|
||||
DP("Call to omp_target_is_present returns %d\n", rc);
|
||||
return rc;
|
||||
}
|
||||
|
|
|
@ -157,12 +157,17 @@ LookupResult DeviceTy::lookupMapping(void *HstPtrBegin, int64_t Size) {
|
|||
// If NULL is returned, then either data allocation failed or the user tried
|
||||
// to do an illegal mapping.
|
||||
void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase,
|
||||
int64_t Size, bool &IsNew, bool IsImplicit, bool UpdateRefCount) {
|
||||
int64_t Size, bool &IsNew, bool &IsHostPtr, bool IsImplicit,
|
||||
bool UpdateRefCount) {
|
||||
void *rc = NULL;
|
||||
IsHostPtr = false;
|
||||
DataMapMtx.lock();
|
||||
LookupResult lr = lookupMapping(HstPtrBegin, Size);
|
||||
|
||||
// Check if the pointer is contained.
|
||||
// If a variable is mapped to the device manually by the user - which would
|
||||
// lead to the IsContained flag to be true - then we must ensure that the
|
||||
// device address is returned even under unified memory conditions.
|
||||
if (lr.Flags.IsContained ||
|
||||
((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && IsImplicit)) {
|
||||
auto &HT = *lr.Entry;
|
||||
|
@ -183,15 +188,28 @@ void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase,
|
|||
// Explicit extension of mapped data - not allowed.
|
||||
DP("Explicit extension of mapping is not allowed.\n");
|
||||
} else if (Size) {
|
||||
// If it is not contained and Size > 0 we should create a new entry for it.
|
||||
IsNew = true;
|
||||
uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin);
|
||||
DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", "
|
||||
"HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase),
|
||||
DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp));
|
||||
HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase,
|
||||
(uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp));
|
||||
rc = (void *)tp;
|
||||
// If unified shared memory is active, implicitly mapped variables that are not
|
||||
// privatized use host address. Any explicitly mapped variables also use
|
||||
// host address where correctness is not impeded. In all other cases
|
||||
// maps are respected.
|
||||
// TODO: In addition to the mapping rules above, when the close map
|
||||
// modifier is implemented, foce the mapping of the variable to the device.
|
||||
if (RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
|
||||
DP("Return HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n",
|
||||
DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? " updated" : ""));
|
||||
IsHostPtr = true;
|
||||
rc = HstPtrBegin;
|
||||
} else {
|
||||
// If it is not contained and Size > 0 we should create a new entry for it.
|
||||
IsNew = true;
|
||||
uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin);
|
||||
DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", "
|
||||
"HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase),
|
||||
DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp));
|
||||
HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase,
|
||||
(uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp));
|
||||
rc = (void *)tp;
|
||||
}
|
||||
}
|
||||
|
||||
DataMapMtx.unlock();
|
||||
|
@ -202,8 +220,10 @@ void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase,
|
|||
// Return the target pointer begin (where the data will be moved).
|
||||
// Decrement the reference counter if called from target_data_end.
|
||||
void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
|
||||
bool UpdateRefCount) {
|
||||
bool UpdateRefCount, bool &IsHostPtr) {
|
||||
void *rc = NULL;
|
||||
IsHostPtr = false;
|
||||
IsLast = false;
|
||||
DataMapMtx.lock();
|
||||
LookupResult lr = lookupMapping(HstPtrBegin, Size);
|
||||
|
||||
|
@ -221,8 +241,14 @@ void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
|
|||
(CONSIDERED_INF(HT.RefCount)) ? "INF" :
|
||||
std::to_string(HT.RefCount).c_str());
|
||||
rc = (void *)tp;
|
||||
} else {
|
||||
IsLast = false;
|
||||
} else if (RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
|
||||
// If the value isn't found in the mapping and unified shared memory
|
||||
// is on then it means we have stumbled upon a value which we need to
|
||||
// use directly from the host.
|
||||
DP("Get HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n",
|
||||
DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? " updated" : ""));
|
||||
IsHostPtr = true;
|
||||
rc = HstPtrBegin;
|
||||
}
|
||||
|
||||
DataMapMtx.unlock();
|
||||
|
@ -244,6 +270,8 @@ void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size) {
|
|||
}
|
||||
|
||||
int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete) {
|
||||
if (RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)
|
||||
return OFFLOAD_SUCCESS;
|
||||
// Check if the pointer is contained in any sub-nodes.
|
||||
int rc;
|
||||
DataMapMtx.lock();
|
||||
|
|
|
@ -137,10 +137,10 @@ struct DeviceTy {
|
|||
long getMapEntryRefCnt(void *HstPtrBegin);
|
||||
LookupResult lookupMapping(void *HstPtrBegin, int64_t Size);
|
||||
void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size,
|
||||
bool &IsNew, bool IsImplicit, bool UpdateRefCount = true);
|
||||
bool &IsNew, bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount = true);
|
||||
void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size);
|
||||
void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
|
||||
bool UpdateRefCount);
|
||||
bool UpdateRefCount, bool &IsHostPtr);
|
||||
int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete);
|
||||
int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size);
|
||||
int disassociatePtr(void *HstPtrBegin);
|
||||
|
|
|
@ -242,6 +242,7 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num,
|
|||
// Address of pointer on the host and device, respectively.
|
||||
void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin;
|
||||
bool IsNew, Pointer_IsNew;
|
||||
bool IsHostPtr = false;
|
||||
bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT;
|
||||
// UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we
|
||||
// have reached this point via __tgt_target_data_begin and not __tgt_target
|
||||
|
@ -253,7 +254,7 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num,
|
|||
DP("Has a pointer entry: \n");
|
||||
// base is address of pointer.
|
||||
Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase,
|
||||
sizeof(void *), Pointer_IsNew, IsImplicit, UpdateRef);
|
||||
sizeof(void *), Pointer_IsNew, IsHostPtr, IsImplicit, UpdateRef);
|
||||
if (!Pointer_TgtPtrBegin) {
|
||||
DP("Call to getOrAllocTgtPtr returned null pointer (device failure or "
|
||||
"illegal mapping).\n");
|
||||
|
@ -269,7 +270,7 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num,
|
|||
}
|
||||
|
||||
void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase,
|
||||
data_size, IsNew, IsImplicit, UpdateRef);
|
||||
data_size, IsNew, IsHostPtr, IsImplicit, UpdateRef);
|
||||
if (!TgtPtrBegin && data_size) {
|
||||
// If data_size==0, then the argument could be a zero-length pointer to
|
||||
// NULL, so getOrAlloc() returning NULL is not an error.
|
||||
|
@ -289,19 +290,21 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num,
|
|||
|
||||
if (arg_types[i] & OMP_TGT_MAPTYPE_TO) {
|
||||
bool copy = false;
|
||||
if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
|
||||
copy = true;
|
||||
} else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) {
|
||||
// Copy data only if the "parent" struct has RefCount==1.
|
||||
int32_t parent_idx = member_of(arg_types[i]);
|
||||
long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
|
||||
assert(parent_rc > 0 && "parent struct not found");
|
||||
if (parent_rc == 1) {
|
||||
if (!(Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)) {
|
||||
if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) {
|
||||
copy = true;
|
||||
} else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) {
|
||||
// Copy data only if the "parent" struct has RefCount==1.
|
||||
int32_t parent_idx = member_of(arg_types[i]);
|
||||
long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
|
||||
assert(parent_rc > 0 && "parent struct not found");
|
||||
if (parent_rc == 1) {
|
||||
copy = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (copy) {
|
||||
if (copy && !IsHostPtr) {
|
||||
DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n",
|
||||
data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin));
|
||||
int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size);
|
||||
|
@ -312,7 +315,7 @@ int target_data_begin(DeviceTy &Device, int32_t arg_num,
|
|||
}
|
||||
}
|
||||
|
||||
if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
|
||||
if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) {
|
||||
DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n",
|
||||
DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
|
||||
uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
|
||||
|
@ -363,14 +366,14 @@ int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
|
|||
}
|
||||
}
|
||||
|
||||
bool IsLast;
|
||||
bool IsLast, IsHostPtr;
|
||||
bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) ||
|
||||
(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ);
|
||||
bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE;
|
||||
|
||||
// If PTR_AND_OBJ, HstPtrBegin is address of pointee
|
||||
void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast,
|
||||
UpdateRef);
|
||||
UpdateRef, IsHostPtr);
|
||||
DP("There are %" PRId64 " bytes allocated at target address " DPxMOD
|
||||
" - is%s last\n", data_size, DPxPTR(TgtPtrBegin),
|
||||
(IsLast ? "" : " not"));
|
||||
|
@ -387,18 +390,22 @@ int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base,
|
|||
if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
|
||||
bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS;
|
||||
bool CopyMember = false;
|
||||
if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
|
||||
!(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
|
||||
// Copy data only if the "parent" struct has RefCount==1.
|
||||
int32_t parent_idx = member_of(arg_types[i]);
|
||||
long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
|
||||
assert(parent_rc > 0 && "parent struct not found");
|
||||
if (parent_rc == 1) {
|
||||
CopyMember = true;
|
||||
if (!(Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY)) {
|
||||
if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) &&
|
||||
!(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) {
|
||||
// Copy data only if the "parent" struct has RefCount==1.
|
||||
int32_t parent_idx = member_of(arg_types[i]);
|
||||
long parent_rc = Device.getMapEntryRefCnt(args[parent_idx]);
|
||||
assert(parent_rc > 0 && "parent struct not found");
|
||||
if (parent_rc == 1) {
|
||||
CopyMember = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (DelEntry || Always || CopyMember) {
|
||||
if ((DelEntry || Always || CopyMember) &&
|
||||
!(Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
|
||||
TgtPtrBegin == HstPtrBegin)) {
|
||||
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
|
||||
data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
|
||||
int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size);
|
||||
|
@ -471,14 +478,21 @@ int target_data_update(DeviceTy &Device, int32_t arg_num,
|
|||
|
||||
void *HstPtrBegin = args[i];
|
||||
int64_t MapSize = arg_sizes[i];
|
||||
bool IsLast;
|
||||
bool IsLast, IsHostPtr;
|
||||
void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast,
|
||||
false);
|
||||
false, IsHostPtr);
|
||||
if (!TgtPtrBegin) {
|
||||
DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
|
||||
TgtPtrBegin == HstPtrBegin) {
|
||||
DP("hst data:" DPxMOD " unified and shared, becomes a noop\n",
|
||||
DPxPTR(HstPtrBegin));
|
||||
continue;
|
||||
}
|
||||
|
||||
if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) {
|
||||
DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n",
|
||||
arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin));
|
||||
|
@ -514,6 +528,7 @@ int target_data_update(DeviceTy &Device, int32_t arg_num,
|
|||
DP("Copying data to device failed.\n");
|
||||
return OFFLOAD_FAIL;
|
||||
}
|
||||
|
||||
uintptr_t lb = (uintptr_t) HstPtrBegin;
|
||||
uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize;
|
||||
Device.ShadowMtx.lock();
|
||||
|
@ -640,19 +655,26 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num,
|
|||
void *HstPtrVal = args[i];
|
||||
void *HstPtrBegin = args_base[i];
|
||||
void *HstPtrBase = args[idx];
|
||||
bool IsLast; // unused.
|
||||
bool IsLast, IsHostPtr; // unused.
|
||||
void *TgtPtrBase =
|
||||
(void *)((intptr_t)tgt_args[tgtIdx] + tgt_offsets[tgtIdx]);
|
||||
DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase));
|
||||
uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase;
|
||||
void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta);
|
||||
void *Pointer_TgtPtrBegin =
|
||||
Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false);
|
||||
Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false,
|
||||
IsHostPtr);
|
||||
if (!Pointer_TgtPtrBegin) {
|
||||
DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n",
|
||||
DPxPTR(HstPtrVal));
|
||||
continue;
|
||||
}
|
||||
if (Device.RTLRequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
|
||||
TgtPtrBegin == HstPtrBegin) {
|
||||
DP("Unified memory is active, no need to map lambda captured"
|
||||
"variable (" DPxMOD ")\n", DPxPTR(HstPtrVal));
|
||||
continue;
|
||||
}
|
||||
DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n",
|
||||
DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin));
|
||||
int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin,
|
||||
|
@ -668,7 +690,7 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num,
|
|||
void *HstPtrBase = args_base[i];
|
||||
void *TgtPtrBegin;
|
||||
ptrdiff_t TgtBaseOffset;
|
||||
bool IsLast; // unused.
|
||||
bool IsLast, IsHostPtr; // unused.
|
||||
if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) {
|
||||
DP("Forwarding first-private value " DPxMOD " to the target construct\n",
|
||||
DPxPTR(HstPtrBase));
|
||||
|
@ -705,14 +727,14 @@ int target(int64_t device_id, void *host_ptr, int32_t arg_num,
|
|||
}
|
||||
} else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) {
|
||||
TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast,
|
||||
false);
|
||||
false, IsHostPtr);
|
||||
TgtBaseOffset = 0; // no offset for ptrs.
|
||||
DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to "
|
||||
"object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase),
|
||||
DPxPTR(HstPtrBase));
|
||||
} else {
|
||||
TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast,
|
||||
false);
|
||||
false, IsHostPtr);
|
||||
TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin;
|
||||
#ifdef OMPTARGET_DEBUG
|
||||
void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset);
|
||||
|
|
|
@ -43,4 +43,4 @@ int main() {
|
|||
{}
|
||||
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -0,0 +1,164 @@
|
|||
// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
|
||||
// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
|
||||
// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
|
||||
// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
|
||||
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Various definitions copied from OpenMP RTL
|
||||
|
||||
extern void __tgt_register_requires(int64_t);
|
||||
|
||||
// End of definitions copied from OpenMP RTL.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#pragma omp requires unified_shared_memory
|
||||
|
||||
#define N 1024
|
||||
|
||||
void init(int A[], int B[], int C[]) {
|
||||
for (int i = 0; i < N; ++i) {
|
||||
A[i] = 0;
|
||||
B[i] = 1;
|
||||
C[i] = i;
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
const int device = omp_get_default_device();
|
||||
|
||||
// Manual registration of requires flags for Clang versions
|
||||
// that do not support requires.
|
||||
__tgt_register_requires(8);
|
||||
|
||||
// CHECK: Initial device: -10
|
||||
printf("Initial device: %d\n", omp_get_initial_device());
|
||||
|
||||
//
|
||||
// Target alloc & target memcpy
|
||||
//
|
||||
int A[N], B[N], C[N];
|
||||
|
||||
// Init
|
||||
init(A, B, C);
|
||||
|
||||
int *pA, *pB, *pC;
|
||||
|
||||
// map ptrs
|
||||
pA = &A[0];
|
||||
pB = &B[0];
|
||||
pC = &C[0];
|
||||
|
||||
int *d_A = (int *)omp_target_alloc(N * sizeof(int), device);
|
||||
int *d_B = (int *)omp_target_alloc(N * sizeof(int), device);
|
||||
int *d_C = (int *)omp_target_alloc(N * sizeof(int), device);
|
||||
|
||||
// CHECK: omp_target_alloc succeeded
|
||||
printf("omp_target_alloc %s\n", d_A && d_B && d_C ? "succeeded" : "failed");
|
||||
|
||||
omp_target_memcpy(d_B, pB, N * sizeof(int), 0, 0, device,
|
||||
omp_get_initial_device());
|
||||
omp_target_memcpy(d_C, pC, N * sizeof(int), 0, 0, device,
|
||||
omp_get_initial_device());
|
||||
|
||||
#pragma omp target is_device_ptr(d_A, d_B, d_C) device(device)
|
||||
{
|
||||
#pragma omp parallel for schedule(static, 1)
|
||||
for (int i = 0; i < N; i++) {
|
||||
d_A[i] = d_B[i] + d_C[i] + 1;
|
||||
}
|
||||
}
|
||||
|
||||
omp_target_memcpy(pA, d_A, N * sizeof(int), 0, 0, omp_get_initial_device(),
|
||||
device);
|
||||
|
||||
// CHECK: Test omp_target_memcpy: Succeeded
|
||||
int fail = 0;
|
||||
for (int i = 0; i < N; ++i) {
|
||||
if (A[i] != i + 2)
|
||||
fail++;
|
||||
}
|
||||
if (fail) {
|
||||
printf("Test omp_target_memcpy: Failed\n");
|
||||
} else {
|
||||
printf("Test omp_target_memcpy: Succeeded\n");
|
||||
}
|
||||
|
||||
//
|
||||
// target_is_present and target_associate/disassociate_ptr
|
||||
//
|
||||
init(A, B, C);
|
||||
|
||||
// CHECK: B is not present, associating it...
|
||||
// CHECK: omp_target_associate_ptr B succeeded
|
||||
if (!omp_target_is_present(B, device)) {
|
||||
printf("B is not present, associating it...\n");
|
||||
int rc = omp_target_associate_ptr(B, d_B, N * sizeof(int), 0, device);
|
||||
printf("omp_target_associate_ptr B %s\n", !rc ? "succeeded" : "failed");
|
||||
}
|
||||
|
||||
// CHECK: C is not present, associating it...
|
||||
// CHECK: omp_target_associate_ptr C succeeded
|
||||
if (!omp_target_is_present(C, device)) {
|
||||
printf("C is not present, associating it...\n");
|
||||
int rc = omp_target_associate_ptr(C, d_C, N * sizeof(int), 0, device);
|
||||
printf("omp_target_associate_ptr C %s\n", !rc ? "succeeded" : "failed");
|
||||
}
|
||||
|
||||
// CHECK: Inside target data: A is not present
|
||||
// CHECK: Inside target data: B is present
|
||||
// CHECK: Inside target data: C is present
|
||||
#pragma omp target data map(from : B, C) device(device)
|
||||
{
|
||||
printf("Inside target data: A is%s present\n",
|
||||
omp_target_is_present(A, device) ? "" : " not");
|
||||
printf("Inside target data: B is%s present\n",
|
||||
omp_target_is_present(B, device) ? "" : " not");
|
||||
printf("Inside target data: C is%s present\n",
|
||||
omp_target_is_present(C, device) ? "" : " not");
|
||||
|
||||
#pragma omp target map(from : A) device(device)
|
||||
{
|
||||
#pragma omp parallel for schedule(static, 1)
|
||||
for (int i = 0; i < N; i++)
|
||||
A[i] = B[i] + C[i] + 1;
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK: B is present, disassociating it...
|
||||
// CHECK: omp_target_disassociate_ptr B succeeded
|
||||
// CHECK: C is present, disassociating it...
|
||||
// CHECK: omp_target_disassociate_ptr C succeeded
|
||||
if (omp_target_is_present(B, device)) {
|
||||
printf("B is present, disassociating it...\n");
|
||||
int rc = omp_target_disassociate_ptr(B, device);
|
||||
printf("omp_target_disassociate_ptr B %s\n", !rc ? "succeeded" : "failed");
|
||||
}
|
||||
if (omp_target_is_present(C, device)) {
|
||||
printf("C is present, disassociating it...\n");
|
||||
int rc = omp_target_disassociate_ptr(C, device);
|
||||
printf("omp_target_disassociate_ptr C %s\n", !rc ? "succeeded" : "failed");
|
||||
}
|
||||
|
||||
// CHECK: Test omp_target_associate_ptr: Succeeded
|
||||
fail = 0;
|
||||
for (int i = 0; i < N; ++i) {
|
||||
if (A[i] != i + 2)
|
||||
fail++;
|
||||
}
|
||||
if (fail) {
|
||||
printf("Test omp_target_associate_ptr: Failed\n");
|
||||
} else {
|
||||
printf("Test omp_target_associate_ptr: Succeeded\n");
|
||||
}
|
||||
|
||||
omp_target_free(d_A, device);
|
||||
omp_target_free(d_B, device);
|
||||
omp_target_free(d_C, device);
|
||||
|
||||
printf("Done!\n");
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,114 @@
|
|||
// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
|
||||
// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
|
||||
// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
|
||||
// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
|
||||
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Various definitions copied from OpenMP RTL
|
||||
|
||||
extern void __tgt_register_requires(int64_t);
|
||||
|
||||
// End of definitions copied from OpenMP RTL.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
#pragma omp requires unified_shared_memory
|
||||
|
||||
#define N 1024
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int fails;
|
||||
void *host_alloc, *device_alloc;
|
||||
void *host_data, *device_data;
|
||||
int *alloc = (int *)malloc(N * sizeof(int));
|
||||
int data[N];
|
||||
|
||||
// Manual registration of requires flags for Clang versions
|
||||
// that do not support requires.
|
||||
__tgt_register_requires(8);
|
||||
|
||||
for (int i = 0; i < N; ++i) {
|
||||
alloc[i] = 10;
|
||||
data[i] = 1;
|
||||
}
|
||||
|
||||
host_data = &data[0];
|
||||
host_alloc = &alloc[0];
|
||||
|
||||
// implicit mapping of data
|
||||
#pragma omp target map(tofrom : device_data, device_alloc)
|
||||
{
|
||||
device_data = &data[0];
|
||||
device_alloc = &alloc[0];
|
||||
|
||||
for (int i = 0; i < N; i++) {
|
||||
alloc[i] += 1;
|
||||
data[i] += 1;
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK: Address of alloc on device matches host address.
|
||||
if (device_alloc == host_alloc)
|
||||
printf("Address of alloc on device matches host address.\n");
|
||||
|
||||
// CHECK: Address of data on device matches host address.
|
||||
if (device_data == host_data)
|
||||
printf("Address of data on device matches host address.\n");
|
||||
|
||||
// On the host, check that the arrays have been updated.
|
||||
// CHECK: Alloc device values updated: Succeeded
|
||||
fails = 0;
|
||||
for (int i = 0; i < N; i++) {
|
||||
if (alloc[i] != 11)
|
||||
fails++;
|
||||
}
|
||||
printf("Alloc device values updated: %s\n",
|
||||
(fails == 0) ? "Succeeded" : "Failed");
|
||||
|
||||
// CHECK: Data device values updated: Succeeded
|
||||
fails = 0;
|
||||
for (int i = 0; i < N; i++) {
|
||||
if (data[i] != 2)
|
||||
fails++;
|
||||
}
|
||||
printf("Data device values updated: %s\n",
|
||||
(fails == 0) ? "Succeeded" : "Failed");
|
||||
|
||||
//
|
||||
// Test that updates on the host snd on the device are both visible.
|
||||
//
|
||||
|
||||
// Update on the host.
|
||||
for (int i = 0; i < N; ++i) {
|
||||
alloc[i] += 1;
|
||||
data[i] += 1;
|
||||
}
|
||||
|
||||
#pragma omp target
|
||||
{
|
||||
// CHECK: Alloc host values updated: Succeeded
|
||||
fails = 0;
|
||||
for (int i = 0; i < N; i++) {
|
||||
if (alloc[i] != 12)
|
||||
fails++;
|
||||
}
|
||||
printf("Alloc host values updated: %s\n",
|
||||
(fails == 0) ? "Succeeded" : "Failed");
|
||||
// CHECK: Data host values updated: Succeeded
|
||||
fails = 0;
|
||||
for (int i = 0; i < N; i++) {
|
||||
if (data[i] != 3)
|
||||
fails++;
|
||||
}
|
||||
printf("Data host values updated: %s\n",
|
||||
(fails == 0) ? "Succeeded" : "Failed");
|
||||
}
|
||||
|
||||
free(alloc);
|
||||
|
||||
printf("Done!\n");
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue