From 03ff643d2e9ebbf319d71b3a17d2ed0320a6a25b Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 9 Apr 2020 22:40:30 -0400 Subject: [PATCH] [OpenMP] Put old APIs back and added new _async series for backward compatibility Summary: Following comments at the bi-weekly meeting, this patch puts back the old APIs and adds a new `_async` series Reviewers: jdoerfert Reviewed By: jdoerfert Subscribers: yaxunl, guansong, openmp-commits Tags: #openmp Differential Revision: https://reviews.llvm.org/D77822 --- openmp/libomptarget/include/omptargetplugin.h | 44 ++++++---- openmp/libomptarget/plugins/cuda/src/rtl.cpp | 82 +++++++++++++------ openmp/libomptarget/plugins/exports | 4 + .../plugins/generic-elf-64bit/src/rtl.cpp | 26 +++--- openmp/libomptarget/src/device.cpp | 38 ++++++--- openmp/libomptarget/src/device.h | 9 +- openmp/libomptarget/src/interface.cpp | 16 ++-- openmp/libomptarget/src/omptarget.cpp | 5 +- openmp/libomptarget/src/private.h | 4 +- openmp/libomptarget/src/rtl.cpp | 60 ++++++++------ openmp/libomptarget/src/rtl.h | 31 +++++-- 11 files changed, 198 insertions(+), 121 deletions(-) diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h index b330c1935282..083e422aac16 100644 --- a/openmp/libomptarget/include/omptargetplugin.h +++ b/openmp/libomptarget/include/omptargetplugin.h @@ -58,21 +58,24 @@ __tgt_target_table *__tgt_rtl_load_binary(int32_t ID, // case an error occurred on the target device. void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr); -// Pass the data content to the target device using the target address. If -// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous. -// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that -// case, it is synchronous. In case of success, return zero. Otherwise, return -// an error code. +// Pass the data content to the target device using the target address. In case +// of success, return zero. 
Otherwise, return an error code. int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, - int64_t Size, __tgt_async_info *AsyncInfoPtr); + int64_t Size); -// Retrieve the data content from the target device using its address. If -// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous. -// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that -// case, it is synchronous. In case of success, return zero. Otherwise, return -// an error code. +int32_t __tgt_rtl_data_submit_async(int32_t ID, void *TargetPtr, void *HostPtr, + int64_t Size, + __tgt_async_info *AsyncInfoPtr); + +// Retrieve the data content from the target device using its address. In case +// of success, return zero. Otherwise, return an error code. int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, - int64_t Size, __tgt_async_info *AsyncInfoPtr); + int64_t Size); + +// Asynchronous version of __tgt_rtl_data_retrieve +int32_t __tgt_rtl_data_retrieve_async(int32_t ID, void *HostPtr, + void *TargetPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr); // De-allocate the data referenced by target ptr on the device. In case of // success, return zero. Otherwise, return an error code. @@ -86,8 +89,12 @@ int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr); // ignored on some platforms, like x86_64. In that case, it is synchronous. In // case of success, return zero. Otherwise, return an error code. 
int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args, - ptrdiff_t *Offsets, int32_t NumArgs, - __tgt_async_info *AsyncInfoPtr); + ptrdiff_t *Offsets, int32_t NumArgs); + +// Asynchronous version of __tgt_rtl_run_target_region +int32_t __tgt_rtl_run_target_region_async(int32_t ID, void *Entry, void **Args, + ptrdiff_t *Offsets, int32_t NumArgs, + __tgt_async_info *AsyncInfoPtr); // Similar to __tgt_rtl_run_target_region, but additionally specify the // number of teams to be created and a number of threads in each team. If @@ -97,8 +104,13 @@ int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args, int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args, ptrdiff_t *Offsets, int32_t NumArgs, int32_t NumTeams, int32_t ThreadLimit, - uint64_t loop_tripcount, - __tgt_async_info *AsyncInfoPtr); + uint64_t loop_tripcount); + +// Asynchronous version of __tgt_rtl_run_target_team_region +int32_t __tgt_rtl_run_target_team_region_async( + int32_t ID, void *Entry, void **Args, ptrdiff_t *Offsets, int32_t NumArgs, + int32_t NumTeams, int32_t ThreadLimit, uint64_t loop_tripcount, + __tgt_async_info *AsyncInfoPtr); // Device synchronization. In case of success, return zero. Otherwise, return an // error code. diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp index c0fb87b8e19d..1147f821b7ae 100644 --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -725,40 +725,41 @@ void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { } int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, - int64_t size, __tgt_async_info *async_info_ptr) { - // The function dataSubmit is always asynchronous. 
Considering some data - // transfer must be synchronous, we assume if async_info_ptr is nullptr, the - // transfer will be synchronous by creating a temporary async info and then - // synchronizing after call dataSubmit; otherwise, it is asynchronous. - if (async_info_ptr) - return dataSubmit(device_id, tgt_ptr, hst_ptr, size, async_info_ptr); - + int64_t size) { __tgt_async_info async_info; - int32_t rc = dataSubmit(device_id, tgt_ptr, hst_ptr, size, &async_info); + int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, size, + &async_info); if (rc != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; return __tgt_rtl_synchronize(device_id, &async_info); } -int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, - int64_t size, - __tgt_async_info *async_info_ptr) { - // The function dataRetrieve is always asynchronous. Considering some data - // transfer must be synchronous, we assume if async_info_ptr is nullptr, the - // transfer will be synchronous by creating a temporary async info and then - // synchronizing after call dataRetrieve; otherwise, it is asynchronous. 
- if (async_info_ptr) - return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, async_info_ptr); +int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, + void *hst_ptr, int64_t size, + __tgt_async_info *async_info_ptr) { + assert(async_info_ptr && "async_info_ptr is nullptr"); + return dataSubmit(device_id, tgt_ptr, hst_ptr, size, async_info_ptr); +} +int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { __tgt_async_info async_info; - int32_t rc = dataRetrieve(device_id, hst_ptr, tgt_ptr, size, &async_info); + int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, size, + &async_info); if (rc != OFFLOAD_SUCCESS) return OFFLOAD_FAIL; return __tgt_rtl_synchronize(device_id, &async_info); } +int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, + void *tgt_ptr, int64_t size, + __tgt_async_info *async_info_ptr) { + assert(async_info_ptr && "async_info_ptr is nullptr"); + return dataRetrieve(device_id, hst_ptr, tgt_ptr, size, async_info_ptr); +} + int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { // Set the context we are using. 
CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); @@ -782,8 +783,22 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, int32_t thread_limit, - uint64_t loop_tripcount, - __tgt_async_info *async_info) { + uint64_t loop_tripcount) { + __tgt_async_info async_info; + int32_t rc = __tgt_rtl_run_target_team_region_async( + device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, + thread_limit, loop_tripcount, &async_info); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(device_id, &async_info); +} + +int32_t __tgt_rtl_run_target_team_region_async( + int32_t device_id, void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, + int32_t thread_limit, uint64_t loop_tripcount, + __tgt_async_info *async_info) { // Set the context we are using. CUresult err = cuCtxSetCurrent(DeviceInfo.Contexts[device_id]); if (err != CUDA_SUCCESS) { @@ -890,21 +905,34 @@ int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, } DP("Launch of entry point at " DPxMOD " successful!\n", - DPxPTR(tgt_entry_ptr)); + DPxPTR(tgt_entry_ptr)); return OFFLOAD_SUCCESS; } int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, void **tgt_args, ptrdiff_t *tgt_offsets, - int32_t arg_num, - __tgt_async_info *async_info) { + int32_t arg_num) { + __tgt_async_info async_info; + int32_t rc = __tgt_rtl_run_target_region_async( + device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(device_id, &async_info); +} + +int32_t __tgt_rtl_run_target_region_async(int32_t device_id, + void *tgt_entry_ptr, void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, + __tgt_async_info *async_info) { // use one team and the default number of threads. 
const int32_t team_num = 1; const int32_t thread_limit = 0; - return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, - tgt_offsets, arg_num, team_num, - thread_limit, 0, async_info); + return __tgt_rtl_run_target_team_region_async( + device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, + thread_limit, 0, async_info); } int32_t __tgt_rtl_synchronize(int32_t device_id, __tgt_async_info *async_info) { diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports index cbbad6d0364d..a4e1a3186daa 100644 --- a/openmp/libomptarget/plugins/exports +++ b/openmp/libomptarget/plugins/exports @@ -7,10 +7,14 @@ VERS1.0 { __tgt_rtl_load_binary; __tgt_rtl_data_alloc; __tgt_rtl_data_submit; + __tgt_rtl_data_submit_async; __tgt_rtl_data_retrieve; + __tgt_rtl_data_retrieve_async; __tgt_rtl_data_delete; __tgt_rtl_run_target_team_region; + __tgt_rtl_run_target_team_region_async; __tgt_rtl_run_target_region; + __tgt_rtl_run_target_region_async; __tgt_rtl_synchronize; local: *; diff --git a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp index 84875f591ac0..8a6e085d3f75 100644 --- a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp +++ b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp @@ -277,13 +277,13 @@ void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { } int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, - int64_t size, __tgt_async_info *) { + int64_t size) { memcpy(tgt_ptr, hst_ptr, size); return OFFLOAD_SUCCESS; } int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, - int64_t size, __tgt_async_info *) { + int64_t size) { memcpy(hst_ptr, tgt_ptr, size); return OFFLOAD_SUCCESS; } @@ -293,11 +293,12 @@ int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { return OFFLOAD_SUCCESS; } -int32_t __tgt_rtl_run_target_team_region( - int32_t device_id, 
void *tgt_entry_ptr, void **tgt_args, - ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, - int32_t thread_limit, uint64_t loop_tripcount /*not used*/, - __tgt_async_info *async_info /*not used*/) { +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t team_num, + int32_t thread_limit, + uint64_t loop_tripcount /*not used*/) { // ignore team num and thread limit. // Use libffi to launch execution. @@ -331,17 +332,10 @@ int32_t __tgt_rtl_run_target_team_region( int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, void **tgt_args, ptrdiff_t *tgt_offsets, - int32_t arg_num, - __tgt_async_info *async_info_ptr) { + int32_t arg_num) { // use one team and one thread. return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, - tgt_offsets, arg_num, 1, 1, 0, - async_info_ptr); -} - -int32_t __tgt_rtl_synchronize(int32_t device_id, - __tgt_async_info *async_info_ptr) { - return OFFLOAD_SUCCESS; + tgt_offsets, arg_num, 1, 1, 0); } #ifdef __cplusplus diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp index 09ddcceff9ea..765dd54fe5ca 100644 --- a/openmp/libomptarget/src/device.cpp +++ b/openmp/libomptarget/src/device.cpp @@ -334,24 +334,33 @@ __tgt_target_table *DeviceTy::load_binary(void *Img) { // Submit data to device int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, __tgt_async_info *AsyncInfoPtr) { - - return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size, - AsyncInfoPtr); + if (!AsyncInfoPtr || !RTL->data_submit_async || !RTL->synchronize) + return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); + else + return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size, + AsyncInfoPtr); } // Retrieve data from device int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size, __tgt_async_info 
*AsyncInfoPtr) { - return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size, - AsyncInfoPtr); + if (!AsyncInfoPtr || !RTL->data_retrieve_async || !RTL->synchronize) + return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); + else + return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size, + AsyncInfoPtr); } // Run region on device int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, - __tgt_async_info *AsyncInfo) { - return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, - TgtVarsSize, AsyncInfo); + __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->run_region_async || !RTL->synchronize) + return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, + TgtVarsSize); + else + return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, + TgtOffsets, TgtVarsSize, AsyncInfoPtr); } // Run team region on device. @@ -359,10 +368,15 @@ int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit, uint64_t LoopTripCount, - __tgt_async_info *AsyncInfo) { - return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, - TgtVarsSize, NumTeams, ThreadLimit, LoopTripCount, - AsyncInfo); + __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->run_team_region_async || !RTL->synchronize) + return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, + TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit, + LoopTripCount); + else + return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, + TgtOffsets, TgtVarsSize, NumTeams, + ThreadLimit, LoopTripCount, AsyncInfoPtr); } /// Check whether a device has an associated RTL and initialize it if it's not diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h index e44adaf70e4e..a3a5767f81ff 100644 --- a/openmp/libomptarget/src/device.h +++ 
b/openmp/libomptarget/src/device.h @@ -174,8 +174,8 @@ struct DeviceTy { int32_t initOnce(); __tgt_target_table *load_binary(void *Img); - // Asynchronous data transfer. When AsyncInfoPtr is nullptr, the transfer will - // be synchronous. + // Data transfer. When AsyncInfoPtr is nullptr, the transfer will be + // synchronous. int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, __tgt_async_info *AsyncInfoPtr); int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size, @@ -183,11 +183,12 @@ struct DeviceTy { int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, - __tgt_async_info *AsyncInfo); + __tgt_async_info *AsyncInfoPtr); int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, int32_t NumTeams, int32_t ThreadLimit, - uint64_t LoopTripCount, __tgt_async_info *AsyncInfo); + uint64_t LoopTripCount, + __tgt_async_info *AsyncInfoPtr); private: // Call to RTL diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index 8ff9d8cab1d9..924bc490b110 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -108,18 +108,18 @@ EXTERN void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, return; } - DeviceTy& Device = Devices[device_id]; + DeviceTy &Device = Devices[device_id]; #ifdef OMPTARGET_DEBUG - for (int i=0; isynchronize(device_id, &AsyncInfo); + if (Device.RTL->synchronize) + return Device.RTL->synchronize(device_id, &AsyncInfo); + + return OFFLOAD_SUCCESS; } diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h index 6e6b39f3fdca..dbc5bafbab5b 100644 --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -20,11 +20,11 @@ extern int target_data_begin(DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - __tgt_async_info *async_info_ptr = 
nullptr); + __tgt_async_info *async_info_ptr); extern int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - __tgt_async_info *async_info_ptr = nullptr); + __tgt_async_info *async_info_ptr); extern int target_data_update(DeviceTy &Device, int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp index ed0be2c2ee53..1439f67e7c64 100644 --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -96,43 +96,49 @@ void RTLsTy::LoadRTLs() { R.RTLName = Name; #endif - if (!(*((void**) &R.is_valid_binary) = dlsym( - dynlib_handle, "__tgt_rtl_is_valid_binary"))) + if (!(*((void **)&R.is_valid_binary) = + dlsym(dynlib_handle, "__tgt_rtl_is_valid_binary"))) continue; - if (!(*((void**) &R.number_of_devices) = dlsym( - dynlib_handle, "__tgt_rtl_number_of_devices"))) + if (!(*((void **)&R.number_of_devices) = + dlsym(dynlib_handle, "__tgt_rtl_number_of_devices"))) continue; - if (!(*((void**) &R.init_device) = dlsym( - dynlib_handle, "__tgt_rtl_init_device"))) + if (!(*((void **)&R.init_device) = + dlsym(dynlib_handle, "__tgt_rtl_init_device"))) continue; - if (!(*((void**) &R.load_binary) = dlsym( - dynlib_handle, "__tgt_rtl_load_binary"))) + if (!(*((void **)&R.load_binary) = + dlsym(dynlib_handle, "__tgt_rtl_load_binary"))) continue; - if (!(*((void**) &R.data_alloc) = dlsym( - dynlib_handle, "__tgt_rtl_data_alloc"))) + if (!(*((void **)&R.data_alloc) = + dlsym(dynlib_handle, "__tgt_rtl_data_alloc"))) continue; - if (!(*((void**) &R.data_submit) = dlsym( - dynlib_handle, "__tgt_rtl_data_submit"))) + if (!(*((void **)&R.data_submit) = + dlsym(dynlib_handle, "__tgt_rtl_data_submit"))) continue; - if (!(*((void**) &R.data_retrieve) = dlsym( - dynlib_handle, "__tgt_rtl_data_retrieve"))) + if (!(*((void **)&R.data_retrieve) = + dlsym(dynlib_handle, "__tgt_rtl_data_retrieve"))) 
continue; - if (!(*((void**) &R.data_delete) = dlsym( - dynlib_handle, "__tgt_rtl_data_delete"))) + if (!(*((void **)&R.data_delete) = + dlsym(dynlib_handle, "__tgt_rtl_data_delete"))) continue; - if (!(*((void**) &R.run_region) = dlsym( - dynlib_handle, "__tgt_rtl_run_target_region"))) + if (!(*((void **)&R.run_region) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_region"))) continue; - if (!(*((void**) &R.run_team_region) = dlsym( - dynlib_handle, "__tgt_rtl_run_target_team_region"))) - continue; - if (!(*((void**) &R.synchronize) = dlsym( - dynlib_handle, "__tgt_rtl_synchronize"))) + if (!(*((void **)&R.run_team_region) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region"))) continue; // Optional functions - *((void**) &R.init_requires) = dlsym( - dynlib_handle, "__tgt_rtl_init_requires"); + *((void **)&R.init_requires) = + dlsym(dynlib_handle, "__tgt_rtl_init_requires"); + *((void **)&R.data_submit_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_submit_async"); + *((void **)&R.data_retrieve_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_retrieve_async"); + *((void **)&R.run_region_async) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_region_async"); + *((void **)&R.run_team_region_async) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region_async"); + *((void **)&R.synchronize) = dlsym(dynlib_handle, "__tgt_rtl_synchronize"); // No devices are supported by this RTL? if (!(R.NumberOfDevices = R.number_of_devices())) { @@ -140,8 +146,8 @@ void RTLsTy::LoadRTLs() { continue; } - DP("Registering RTL %s supporting %d devices!\n", - R.RTLName.c_str(), R.NumberOfDevices); + DP("Registering RTL %s supporting %d devices!\n", R.RTLName.c_str(), + R.NumberOfDevices); // The RTL is valid! Will save the information in the RTLs list. 
AllRTLs.push_back(R); diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h index 846c89b0ed2e..86ecd6724a8d 100644 --- a/openmp/libomptarget/src/rtl.h +++ b/openmp/libomptarget/src/rtl.h @@ -30,16 +30,23 @@ struct RTLInfoTy { typedef int32_t(init_device_ty)(int32_t); typedef __tgt_target_table *(load_binary_ty)(int32_t, void *); typedef void *(data_alloc_ty)(int32_t, int64_t, void *); - typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t, - __tgt_async_info *); - typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t, - __tgt_async_info *); + typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t); + typedef int32_t(data_submit_async_ty)(int32_t, void *, void *, int64_t, + __tgt_async_info *); + typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t); + typedef int32_t(data_retrieve_async_ty)(int32_t, void *, void *, int64_t, + __tgt_async_info *); typedef int32_t(data_delete_ty)(int32_t, void *); - typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, int32_t, - __tgt_async_info *); + typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, + int32_t); + typedef int32_t(run_region_async_ty)(int32_t, void *, void **, ptrdiff_t *, + int32_t, __tgt_async_info *); typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *, - int32_t, int32_t, int32_t, uint64_t, - __tgt_async_info *); + int32_t, int32_t, int32_t, uint64_t); + typedef int32_t(run_team_region_async_ty)(int32_t, void *, void **, + ptrdiff_t *, int32_t, int32_t, + int32_t, uint64_t, + __tgt_async_info *); typedef int64_t(init_requires_ty)(int64_t); typedef int64_t(synchronize_ty)(int64_t, __tgt_async_info *); @@ -62,10 +69,14 @@ struct RTLInfoTy { load_binary_ty *load_binary = nullptr; data_alloc_ty *data_alloc = nullptr; data_submit_ty *data_submit = nullptr; + data_submit_async_ty *data_submit_async = nullptr; data_retrieve_ty *data_retrieve = nullptr; + data_retrieve_async_ty 
*data_retrieve_async = nullptr; data_delete_ty *data_delete = nullptr; run_region_ty *run_region = nullptr; + run_region_async_ty *run_region_async = nullptr; run_team_region_ty *run_team_region = nullptr; + run_team_region_async_ty *run_team_region_async = nullptr; init_requires_ty *init_requires = nullptr; synchronize_ty *synchronize = nullptr; @@ -94,10 +105,14 @@ struct RTLInfoTy { load_binary = r.load_binary; data_alloc = r.data_alloc; data_submit = r.data_submit; + data_submit_async = r.data_submit_async; data_retrieve = r.data_retrieve; + data_retrieve_async = r.data_retrieve_async; data_delete = r.data_delete; run_region = r.run_region; + run_region_async = r.run_region_async; run_team_region = r.run_team_region; + run_team_region_async = r.run_team_region_async; init_requires = r.init_requires; isUsed = r.isUsed; synchronize = r.synchronize;