[libomptarget][nfc] Introduce SHARED, ALIGN macros

Summary:
[libomptarget][nfc] Introduce SHARED, ALIGN macros
Move remaining cuda attributes behind such macros

Reviewers: ABataev, jdoerfert, grokos

Reviewed By: ABataev

Subscribers: openmp-commits, jvesely

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D71076
This commit is contained in:
Jon Chesterfield 2019-12-05 21:47:43 +00:00 committed by JonChesterfield
parent 338588d7cf
commit 4af84d2686
7 changed files with 44 additions and 40 deletions

View File

@ -22,6 +22,8 @@
#define DEVICE __attribute__((device))
#define INLINE inline DEVICE
#define NOINLINE __attribute__((noinline)) DEVICE
#define SHARED __attribute__((shared))
#define ALIGN(N) __attribute__((aligned(N)))
////////////////////////////////////////////////////////////////////////////////
// Kernel options

View File

@ -77,7 +77,7 @@ private:
uint32_t nArgs;
};
extern __device__ __shared__ omptarget_nvptx_SharedArgs
extern DEVICE SHARED omptarget_nvptx_SharedArgs
omptarget_nvptx_globalArgs;
// Data structure to keep in shared memory that traces the current slot, stack,
@ -107,7 +107,7 @@ struct __kmpc_data_sharing_master_slot_static {
void *DataEnd;
char Data[DS_Slot_Size];
};
extern __device__ __shared__ DataSharingStateTy DataSharingState;
extern DEVICE SHARED DataSharingStateTy DataSharingState;
////////////////////////////////////////////////////////////////////////////////
// task ICV and (implicit & explicit) task state
@ -259,9 +259,9 @@ private:
workDescrForActiveParallel; // one, ONLY for the active par
uint64_t lastprivateIterBuffer;
__align__(16)
__kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
__align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
ALIGN(16)
__kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
ALIGN(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
};
////////////////////////////////////////////////////////////////////////////////
@ -326,7 +326,7 @@ private:
/// Memory manager for statically allocated memory.
class omptarget_nvptx_SimpleMemoryManager {
private:
__align__(128) struct MemDataTy {
ALIGN(128) struct MemDataTy {
volatile unsigned keys[OMP_STATE_COUNT];
} MemData[MAX_SM];
@ -345,20 +345,20 @@ public:
// global data tables
////////////////////////////////////////////////////////////////////////////////
extern __device__ omptarget_nvptx_SimpleMemoryManager
extern DEVICE omptarget_nvptx_SimpleMemoryManager
omptarget_nvptx_simpleMemoryManager;
extern __device__ __shared__ uint32_t usedMemIdx;
extern __device__ __shared__ uint32_t usedSlotIdx;
extern __device__ __shared__ uint8_t
extern DEVICE SHARED uint32_t usedMemIdx;
extern DEVICE SHARED uint32_t usedSlotIdx;
extern DEVICE SHARED uint8_t
parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
extern __device__ __shared__ uint16_t threadLimit;
extern __device__ __shared__ uint16_t threadsInTeam;
extern __device__ __shared__ uint16_t nThreads;
extern __device__ __shared__
extern DEVICE SHARED uint16_t threadLimit;
extern DEVICE SHARED uint16_t threadsInTeam;
extern DEVICE SHARED uint16_t nThreads;
extern DEVICE SHARED
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
extern __device__ __shared__ uint32_t execution_param;
extern __device__ __shared__ void *ReductionScratchpadPtr;
extern DEVICE SHARED uint32_t execution_param;
extern DEVICE SHARED void *ReductionScratchpadPtr;
////////////////////////////////////////////////////////////////////////////////
// work function (outlined parallel/simd functions) and arguments.
@ -366,7 +366,7 @@ extern __device__ __shared__ void *ReductionScratchpadPtr;
////////////////////////////////////////////////////////////////////////////////
typedef void *omptarget_nvptx_WorkFn;
extern volatile __device__ __shared__ omptarget_nvptx_WorkFn
extern volatile DEVICE SHARED omptarget_nvptx_WorkFn
omptarget_nvptx_workFn;
////////////////////////////////////////////////////////////////////////////////

View File

@ -17,7 +17,7 @@
// global data tables
////////////////////////////////////////////////////////////////////////////////
extern __device__
extern DEVICE
omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
omptarget_nvptx_device_State[MAX_SM];

View File

@ -97,7 +97,7 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n");
// If the runtime has been elided, used __shared__ memory for master-worker
// If the runtime has been elided, used shared memory for master-worker
// data sharing.
if (!IsOMPRuntimeInitialized)
return (void *)&DataSharingState;
@ -300,7 +300,7 @@ __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
int16_t IsOMPRuntimeInitialized) {
DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n");
// If the runtime has been elided, use __shared__ memory for master-worker
// If the runtime has been elided, use shared memory for master-worker
// data sharing. We're reusing the statically allocated data structure
// that is used for standard data sharing.
if (!IsOMPRuntimeInitialized)

View File

@ -17,27 +17,27 @@
// global device environment
////////////////////////////////////////////////////////////////////////////////
__device__ omptarget_device_environmentTy omptarget_device_environment;
DEVICE omptarget_device_environmentTy omptarget_device_environment;
////////////////////////////////////////////////////////////////////////////////
// global data holding OpenMP state information
////////////////////////////////////////////////////////////////////////////////
__device__
DEVICE
omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
omptarget_nvptx_device_State[MAX_SM];
__device__ omptarget_nvptx_SimpleMemoryManager
DEVICE omptarget_nvptx_SimpleMemoryManager
omptarget_nvptx_simpleMemoryManager;
__device__ __shared__ uint32_t usedMemIdx;
__device__ __shared__ uint32_t usedSlotIdx;
DEVICE SHARED uint32_t usedMemIdx;
DEVICE SHARED uint32_t usedSlotIdx;
__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
__device__ __shared__ uint16_t threadLimit;
__device__ __shared__ uint16_t threadsInTeam;
__device__ __shared__ uint16_t nThreads;
DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
DEVICE SHARED uint16_t threadLimit;
DEVICE SHARED uint16_t threadsInTeam;
DEVICE SHARED uint16_t nThreads;
// Pointer to this team's OpenMP state object
__device__ __shared__
DEVICE SHARED
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
////////////////////////////////////////////////////////////////////////////////
@ -45,24 +45,24 @@ __device__ __shared__
// communicate with the workers. Since it is in shared memory, there is one
// copy of these variables for each kernel, instance, and team.
////////////////////////////////////////////////////////////////////////////////
volatile __device__ __shared__ omptarget_nvptx_WorkFn omptarget_nvptx_workFn;
volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn;
////////////////////////////////////////////////////////////////////////////////
// OpenMP kernel execution parameters
////////////////////////////////////////////////////////////////////////////////
__device__ __shared__ uint32_t execution_param;
DEVICE SHARED uint32_t execution_param;
////////////////////////////////////////////////////////////////////////////////
// Data sharing state
////////////////////////////////////////////////////////////////////////////////
__device__ __shared__ DataSharingStateTy DataSharingState;
DEVICE SHARED DataSharingStateTy DataSharingState;
////////////////////////////////////////////////////////////////////////////////
// Scratchpad for teams reduction.
////////////////////////////////////////////////////////////////////////////////
__device__ __shared__ void *ReductionScratchpadPtr;
DEVICE SHARED void *ReductionScratchpadPtr;
////////////////////////////////////////////////////////////////////////////////
// Data sharing related variables.
////////////////////////////////////////////////////////////////////////////////
__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;

View File

@ -233,7 +233,7 @@ static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
: /*Master thread only*/ 1;
uint32_t TeamId = GetBlockIdInKernel();
uint32_t NumTeams = GetNumberOfBlocksInKernel();
__shared__ volatile bool IsLastTeam;
SHARED volatile bool IsLastTeam;
// Team masters of all teams write to the scratchpad.
if (ThreadId == 0) {
@ -403,8 +403,8 @@ INLINE static uint32_t roundToWarpsize(uint32_t s) {
return (s & ~(unsigned)(WARPSIZE - 1));
}
__device__ static volatile uint32_t IterCnt = 0;
__device__ static volatile uint32_t Cnt = 0;
DEVICE static volatile uint32_t IterCnt = 0;
DEVICE static volatile uint32_t Cnt = 0;
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
kmp_Ident *loc, int32_t global_tid, void *global_buffer,
int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
@ -426,8 +426,8 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
: /*Master thread only*/ 1;
uint32_t TeamId = GetBlockIdInKernel();
uint32_t NumTeams = GetNumberOfBlocksInKernel();
__shared__ unsigned Bound;
__shared__ unsigned ChunkTeamCount;
SHARED unsigned Bound;
SHARED unsigned ChunkTeamCount;
// Block progress for teams greater than the current upper
// limit. We always only allow a number of teams less or equal

View File

@ -18,6 +18,8 @@
#define DEVICE __device__
#define INLINE __forceinline__ DEVICE
#define NOINLINE __noinline__ DEVICE
#define SHARED __shared__
#define ALIGN(N) __align__(N)
////////////////////////////////////////////////////////////////////////////////
// Kernel options