[libomptarget][nfc] Introduce SHARED, ALIGN macros
Summary:
[libomptarget][nfc] Introduce SHARED, ALIGN macros

Move remaining CUDA attributes behind such macros.

Reviewers: ABataev, jdoerfert, grokos

Reviewed By: ABataev

Subscribers: openmp-commits, jvesely

Tags: #openmp

Differential Revision: https://reviews.llvm.org/D71076
commit 4af84d2686
parent 338588d7cf
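For context, the point of the patch is that device-RTL sources spell toolchain-specific qualifiers only through DEVICE, SHARED, and ALIGN(N); each target_impl.h in the diff then maps them to its own attribute spelling (one to __attribute__((device)) / __attribute__((shared)) / __attribute__((aligned(N))), the other to CUDA's __device__ / __shared__ / __align__(N)). A minimal usage sketch follows; the identifiers in it are hypothetical and not taken from the patch:

// Hypothetical declarations (names not from the patch), assuming DEVICE,
// SHARED and ALIGN are defined as in the two macro hunks of this commit.

// A statically allocated, device-visible shared variable, spelled once:
// the CUDA-style header expands this to
//   extern __device__ __shared__ unsigned hypotheticalCounter;
// and the attribute-based header to
//   extern __attribute__((device)) __attribute__((shared)) unsigned hypotheticalCounter;
extern DEVICE SHARED unsigned hypotheticalCounter;

// Alignment-sensitive storage, matching the ALIGN(16)/ALIGN(128) uses below.
struct HypotheticalSlot {
  ALIGN(128) char Data[128];
};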
@@ -22,6 +22,8 @@
 #define DEVICE __attribute__((device))
 #define INLINE inline DEVICE
 #define NOINLINE __attribute__((noinline)) DEVICE
+#define SHARED __attribute__((shared))
+#define ALIGN(N) __attribute__((aligned(N)))
 
 ////////////////////////////////////////////////////////////////////////////////
 // Kernel options

@@ -77,7 +77,7 @@ private:
   uint32_t nArgs;
 };
 
-extern __device__ __shared__ omptarget_nvptx_SharedArgs
+extern DEVICE SHARED omptarget_nvptx_SharedArgs
     omptarget_nvptx_globalArgs;
 
 // Data structure to keep in shared memory that traces the current slot, stack,

@@ -107,7 +107,7 @@ struct __kmpc_data_sharing_master_slot_static {
   void *DataEnd;
   char Data[DS_Slot_Size];
 };
-extern __device__ __shared__ DataSharingStateTy DataSharingState;
+extern DEVICE SHARED DataSharingStateTy DataSharingState;
 
 ////////////////////////////////////////////////////////////////////////////////
 // task ICV and (implicit & explicit) task state

@@ -259,9 +259,9 @@ private:
       workDescrForActiveParallel; // one, ONLY for the active par
   uint64_t lastprivateIterBuffer;
 
-  __align__(16)
-  __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
-  __align__(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
+  ALIGN(16)
+  __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE];
+  ALIGN(16) __kmpc_data_sharing_master_slot_static master_rootS[1];
 };
 
 ////////////////////////////////////////////////////////////////////////////////

@@ -326,7 +326,7 @@ private:
 /// Memory manager for statically allocated memory.
 class omptarget_nvptx_SimpleMemoryManager {
 private:
-  __align__(128) struct MemDataTy {
+  ALIGN(128) struct MemDataTy {
     volatile unsigned keys[OMP_STATE_COUNT];
   } MemData[MAX_SM];
 

@@ -345,20 +345,20 @@ public:
 // global data tables
 ////////////////////////////////////////////////////////////////////////////////
 
-extern __device__ omptarget_nvptx_SimpleMemoryManager
+extern DEVICE omptarget_nvptx_SimpleMemoryManager
     omptarget_nvptx_simpleMemoryManager;
-extern __device__ __shared__ uint32_t usedMemIdx;
-extern __device__ __shared__ uint32_t usedSlotIdx;
-extern __device__ __shared__ uint8_t
+extern DEVICE SHARED uint32_t usedMemIdx;
+extern DEVICE SHARED uint32_t usedSlotIdx;
+extern DEVICE SHARED uint8_t
     parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
-extern __device__ __shared__ uint16_t threadLimit;
-extern __device__ __shared__ uint16_t threadsInTeam;
-extern __device__ __shared__ uint16_t nThreads;
-extern __device__ __shared__
+extern DEVICE SHARED uint16_t threadLimit;
+extern DEVICE SHARED uint16_t threadsInTeam;
+extern DEVICE SHARED uint16_t nThreads;
+extern DEVICE SHARED
     omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
 
-extern __device__ __shared__ uint32_t execution_param;
-extern __device__ __shared__ void *ReductionScratchpadPtr;
+extern DEVICE SHARED uint32_t execution_param;
+extern DEVICE SHARED void *ReductionScratchpadPtr;
 
 ////////////////////////////////////////////////////////////////////////////////
 // work function (outlined parallel/simd functions) and arguments.

@@ -366,7 +366,7 @@ extern __device__ __shared__ void *ReductionScratchpadPtr;
 ////////////////////////////////////////////////////////////////////////////////
 
 typedef void *omptarget_nvptx_WorkFn;
-extern volatile __device__ __shared__ omptarget_nvptx_WorkFn
+extern volatile DEVICE SHARED omptarget_nvptx_WorkFn
     omptarget_nvptx_workFn;
 
 ////////////////////////////////////////////////////////////////////////////////

@@ -17,7 +17,7 @@
 // global data tables
 ////////////////////////////////////////////////////////////////////////////////
 
-extern __device__
+extern DEVICE
     omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
         omptarget_nvptx_device_State[MAX_SM];
 

@@ -97,7 +97,7 @@ EXTERN void *__kmpc_data_sharing_environment_begin(
 
   DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n");
 
-  // If the runtime has been elided, used __shared__ memory for master-worker
+  // If the runtime has been elided, used shared memory for master-worker
   // data sharing.
   if (!IsOMPRuntimeInitialized)
     return (void *)&DataSharingState;

@@ -300,7 +300,7 @@ __kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
                                           int16_t IsOMPRuntimeInitialized) {
   DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n");
 
-  // If the runtime has been elided, use __shared__ memory for master-worker
+  // If the runtime has been elided, use shared memory for master-worker
   // data sharing. We're reusing the statically allocated data structure
   // that is used for standard data sharing.
   if (!IsOMPRuntimeInitialized)

@@ -17,27 +17,27 @@
 // global device environment
 ////////////////////////////////////////////////////////////////////////////////
 
-__device__ omptarget_device_environmentTy omptarget_device_environment;
+DEVICE omptarget_device_environmentTy omptarget_device_environment;
 
 ////////////////////////////////////////////////////////////////////////////////
 // global data holding OpenMP state information
 ////////////////////////////////////////////////////////////////////////////////
 
-__device__
+DEVICE
     omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
         omptarget_nvptx_device_State[MAX_SM];
 
-__device__ omptarget_nvptx_SimpleMemoryManager
+DEVICE omptarget_nvptx_SimpleMemoryManager
     omptarget_nvptx_simpleMemoryManager;
-__device__ __shared__ uint32_t usedMemIdx;
-__device__ __shared__ uint32_t usedSlotIdx;
+DEVICE SHARED uint32_t usedMemIdx;
+DEVICE SHARED uint32_t usedSlotIdx;
 
-__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
-__device__ __shared__ uint16_t threadLimit;
-__device__ __shared__ uint16_t threadsInTeam;
-__device__ __shared__ uint16_t nThreads;
+DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
+DEVICE SHARED uint16_t threadLimit;
+DEVICE SHARED uint16_t threadsInTeam;
+DEVICE SHARED uint16_t nThreads;
 // Pointer to this team's OpenMP state object
-__device__ __shared__
+DEVICE SHARED
     omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
 
 ////////////////////////////////////////////////////////////////////////////////

@@ -45,24 +45,24 @@ __device__ __shared__
 // communicate with the workers. Since it is in shared memory, there is one
 // copy of these variables for each kernel, instance, and team.
 ////////////////////////////////////////////////////////////////////////////////
-volatile __device__ __shared__ omptarget_nvptx_WorkFn omptarget_nvptx_workFn;
+volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn;
 
 ////////////////////////////////////////////////////////////////////////////////
 // OpenMP kernel execution parameters
 ////////////////////////////////////////////////////////////////////////////////
-__device__ __shared__ uint32_t execution_param;
+DEVICE SHARED uint32_t execution_param;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Data sharing state
 ////////////////////////////////////////////////////////////////////////////////
-__device__ __shared__ DataSharingStateTy DataSharingState;
+DEVICE SHARED DataSharingStateTy DataSharingState;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Scratchpad for teams reduction.
 ////////////////////////////////////////////////////////////////////////////////
-__device__ __shared__ void *ReductionScratchpadPtr;
+DEVICE SHARED void *ReductionScratchpadPtr;
 
 ////////////////////////////////////////////////////////////////////////////////
 // Data sharing related variables.
 ////////////////////////////////////////////////////////////////////////////////
-__device__ __shared__ omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;
+DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs;

@@ -233,7 +233,7 @@ static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
                     : /*Master thread only*/ 1;
   uint32_t TeamId = GetBlockIdInKernel();
   uint32_t NumTeams = GetNumberOfBlocksInKernel();
-  __shared__ volatile bool IsLastTeam;
+  SHARED volatile bool IsLastTeam;
 
   // Team masters of all teams write to the scratchpad.
   if (ThreadId == 0) {

@@ -403,8 +403,8 @@ INLINE static uint32_t roundToWarpsize(uint32_t s) {
   return (s & ~(unsigned)(WARPSIZE - 1));
 }
 
-__device__ static volatile uint32_t IterCnt = 0;
-__device__ static volatile uint32_t Cnt = 0;
+DEVICE static volatile uint32_t IterCnt = 0;
+DEVICE static volatile uint32_t Cnt = 0;
 EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
     kmp_Ident *loc, int32_t global_tid, void *global_buffer,
     int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,

@@ -426,8 +426,8 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
                     : /*Master thread only*/ 1;
   uint32_t TeamId = GetBlockIdInKernel();
   uint32_t NumTeams = GetNumberOfBlocksInKernel();
-  __shared__ unsigned Bound;
-  __shared__ unsigned ChunkTeamCount;
+  SHARED unsigned Bound;
+  SHARED unsigned ChunkTeamCount;
 
   // Block progress for teams greater than the current upper
   // limit. We always only allow a number of teams less or equal

@@ -18,6 +18,8 @@
 #define DEVICE __device__
 #define INLINE __forceinline__ DEVICE
 #define NOINLINE __noinline__ DEVICE
+#define SHARED __shared__
+#define ALIGN(N) __align__(N)
 
 ////////////////////////////////////////////////////////////////////////////////
 // Kernel options