[openmp] Use llvm GridValues from devicertl
Add the LLVM include path to the CMake files and set the target_impl enums from the LLVM constants instead of copying the values.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D108391
parent 401a45c61b
commit 842f875c8b
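For orientation: the constants the device RTLs now read come from llvm/Frontend/OpenMP/OMPGridValues.h. The sketch below approximates that interface, covering only the members the diff references (GV_Slot_Size, GV_Warp_Size, GV_Max_WG_Size, warpSlotSize(), maxWarpNumber(), and the AMDGPUGridValues/NVPTXGridValues instances); the field list is deliberately incomplete and the numbers are taken from the literals the patch deletes, so treat it as illustrative rather than a copy of the LLVM header.

// Sketch of the llvm::omp::GV interface this patch consumes; placed in a
// separate namespace to make clear it is an approximation, not the header.
namespace sketch {

struct GV {
  unsigned GV_Slot_Size;   // bytes per data-sharing slot (old DS_Slot_Size)
  unsigned GV_Warp_Size;   // threads per warp/wavefront (old WARPSIZE)
  unsigned GV_Max_WG_Size; // max threads per team (old MAX_THREADS_PER_TEAM)

  // Helpers used by the patch for the derived data-sharing sizes.
  constexpr unsigned warpSlotSize() const {  // old DS_Worker_Warp_Slot_Size
    return GV_Warp_Size * GV_Slot_Size;
  }
  constexpr unsigned maxWarpNumber() const { // old DS_Max_Warp_Number
    return GV_Max_WG_Size / GV_Warp_Size;
  }
};

// Per-target instances; the values mirror the literals removed below.
inline constexpr GV AMDGPUGridValues{/*Slot*/ 256, /*Warp*/ 64, /*MaxWG*/ 1024};
inline constexpr GV NVPTXGridValues{/*Slot*/ 256, /*Warp*/ 32, /*MaxWG*/ 1024};

} // namespace sketch

Each RTL then selects the right instance through a per-architecture getGridValue() (an OpenMP declare variant for amdgcn vs. nvptx), and every size that used to be hard-coded is derived from that single source, as the hunks below show.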
@@ -21,6 +21,12 @@ if (NOT (LIBOMPTARGET_DEP_CUDA_FOUND OR LIBOMPTARGET_BUILD_NVPTX_BCLIB))
   return()
 endif()
 
+if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
+  libomptarget_say("Not building device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
+  return()
+endif()
+
+
 # Check if we can create an LLVM bitcode implementation of the runtime library
 # that could be inlined in the user application. For that we need to find
 # a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
@@ -132,6 +138,10 @@ set(src_files
 set(clang_opt_flags -O1 -mllvm -openmp-opt-disable -DSHARED_SCRATCHPAD_SIZE=2048)
 set(link_opt_flags -O1 -openmp-opt-disable)
 
+# Prepend -I to each list element
+set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
+list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL PREPEND "-I")
+
 # Set flags for LLVM Bitcode compilation.
 set(bc_flags -S -x c++ -std=c++17
              ${clang_opt_flags}
@@ -141,6 +151,7 @@ set(bc_flags -S -x c++ -std=c++17
              -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
              -Xclang -target-feature -Xclang +ptx61
              -I${include_directory}
+             ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_DEVICERTL}
 )
 
 if(${LIBOMPTARGET_DEVICE_DEBUG})
@@ -16,6 +16,8 @@
 
 #pragma omp declare target
 
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
+
 using namespace _OMP;
 
 namespace _OMP {
@@ -26,6 +28,10 @@ namespace impl {
 ///{
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
+constexpr const llvm::omp::GV &getGridValue() {
+  return llvm::omp::AMDGPUGridValues;
+}
+
 uint32_t getGridDim(uint32_t n, uint16_t d) {
   uint32_t q = n / d;
   return q + (n > q * d);
@@ -86,8 +92,6 @@ uint32_t getWarpId() {
   return mapping::getThreadIdInBlock() / mapping::getWarpSize();
 }
 
-uint32_t getWarpSize() { return 64; }
-
 uint32_t getNumberOfWarpsInBlock() {
   return mapping::getBlockSize() / mapping::getWarpSize();
 }
@@ -101,6 +105,10 @@ uint32_t getNumberOfWarpsInBlock() {
 #pragma omp begin declare variant match( \
     device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})
 
+constexpr const llvm::omp::GV &getGridValue() {
+  return llvm::omp::NVPTXGridValues;
+}
+
 LaneMaskTy activemask() {
   unsigned int Mask;
   asm("activemask.b32 %0;" : "=r"(Mask));
@@ -144,8 +152,6 @@ uint32_t getWarpId() {
   return mapping::getThreadIdInBlock() / mapping::getWarpSize();
 }
 
-uint32_t getWarpSize() { return 32; }
-
 uint32_t getNumberOfWarpsInBlock() {
   return (mapping::getBlockSize() + mapping::getWarpSize() - 1) /
          mapping::getWarpSize();
@@ -154,6 +160,8 @@ uint32_t getNumberOfWarpsInBlock() {
 #pragma omp end declare variant
 ///}
 
+uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
+
 } // namespace impl
 } // namespace _OMP
 
@@ -18,6 +18,12 @@ if (NOT LIBOMPTARGET_BUILD_AMDGCN_BCLIB)
   return()
 endif()
 
+if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
+  libomptarget_say("Not building AMDGCN device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
+  return()
+endif()
+
+
 # Copied from nvptx CMakeLists
 if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
   set(aux_triple x86_64-unknown-linux-gnu)
@@ -103,6 +109,10 @@ if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST)
   set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST})
 endif()
 
+# Prepend -I to each list element
+set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
+list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN PREPEND "-I")
+
 macro(add_cuda_bc_library)
   set(cu_cmd ${CLANG_TOOL}
     -xc++
@@ -123,7 +133,8 @@ macro(add_cuda_bc_library)
     ${CUDA_DEBUG}
     -I${CMAKE_CURRENT_SOURCE_DIR}/src
    -I${devicertl_base_directory}/common/include
-    -I${devicertl_base_directory})
+    -I${devicertl_base_directory}
+    ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN})
 
   set(bc1_files)
 
@@ -31,6 +31,12 @@ typedef uint64_t __kmpc_impl_lanemask_t;
 #define NOINLINE __attribute__((noinline))
 #define ALIGN(N) __attribute__((aligned(N)))
 
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
+
+INLINE constexpr const llvm::omp::GV &getGridValue() {
+  return llvm::omp::AMDGPUGridValues;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Kernel options
 ////////////////////////////////////////////////////////////////////////////////
@@ -38,9 +44,8 @@ typedef uint64_t __kmpc_impl_lanemask_t;
 ////////////////////////////////////////////////////////////////////////////////
 // The following def must match the absolute limit hardwired in the host RTL
 // max number of threads per team
-#define MAX_THREADS_PER_TEAM 1024
-
-#define WARPSIZE 64
+enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
+enum { WARPSIZE = getGridValue().GV_Warp_Size };
 
 // Maximum number of omp state objects per SM allocated statically in global
 // memory.
@@ -52,11 +57,11 @@ typedef uint64_t __kmpc_impl_lanemask_t;
 // Data sharing related quantities, need to match what is used in the compiler.
 enum DATA_SHARING_SIZES {
   // The size reserved for data in a shared memory slot.
-  DS_Slot_Size = 256,
+  DS_Slot_Size = getGridValue().GV_Slot_Size,
   // The slot size that should be reserved for a working warp.
-  DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
+  DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),
   // The maximum number of warps in use
-  DS_Max_Warp_Number = 16,
+  DS_Max_Warp_Number = getGridValue().maxWarpNumber(),
 };
 
 enum : __kmpc_impl_lanemask_t {
@@ -19,6 +19,11 @@ if (NOT (LIBOMPTARGET_DEP_CUDA_FOUND OR LIBOMPTARGET_BUILD_NVPTX_BCLIB))
   return()
 endif()
 
+if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
+  libomptarget_say("Not building NVPTX device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
+  return()
+endif()
+
 # Check if we can create an LLVM bitcode implementation of the runtime library
 # that could be inlined in the user application. For that we need to find
 # a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
@@ -151,6 +156,10 @@ set(cuda_src_files
   src/target_impl.cu
 )
 
+# Prepend -I to each list element
+set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
+list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX PREPEND "-I")
+
 # Set flags for LLVM Bitcode compilation.
 set(bc_flags -S -x c++ -O1 -std=c++14
              -mllvm -openmp-opt-disable
@@ -162,7 +171,8 @@ set(bc_flags -S -x c++ -O1 -std=c++14
              -D__CUDACC__
              -I${devicertl_base_directory}
              -I${devicertl_common_directory}/include
-             -I${devicertl_nvptx_directory}/src)
+             -I${devicertl_nvptx_directory}/src
+             ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX})
 
 if(${LIBOMPTARGET_NVPTX_DEBUG})
   list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1 -g)
@@ -24,6 +24,12 @@ typedef uint32_t __kmpc_impl_lanemask_t;
 #define NOINLINE __attribute__((noinline))
 #define ALIGN(N) __attribute__((aligned(N)))
 
+#include "llvm/Frontend/OpenMP/OMPGridValues.h"
+
+INLINE constexpr const llvm::omp::GV &getGridValue() {
+  return llvm::omp::NVPTXGridValues;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // Kernel options
 ////////////////////////////////////////////////////////////////////////////////
@@ -31,9 +37,8 @@ typedef uint32_t __kmpc_impl_lanemask_t;
 ////////////////////////////////////////////////////////////////////////////////
 // The following def must match the absolute limit hardwired in the host RTL
 // max number of threads per team
-#define MAX_THREADS_PER_TEAM 1024
-
-#define WARPSIZE 32
+enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
+enum { WARPSIZE = getGridValue().GV_Warp_Size };
 
 // Maximum number of omp state objects per SM allocated statically in global
 // memory.
@@ -64,11 +69,11 @@ typedef uint32_t __kmpc_impl_lanemask_t;
 // Data sharing related quantities, need to match what is used in the compiler.
 enum DATA_SHARING_SIZES {
   // The size reserved for data in a shared memory slot.
-  DS_Slot_Size = 256,
+  DS_Slot_Size = getGridValue().GV_Slot_Size,
   // The slot size that should be reserved for a working warp.
-  DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size,
+  DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),
   // The maximum number of warps in use
-  DS_Max_Warp_Number = 32,
+  DS_Max_Warp_Number = getGridValue().maxWarpNumber(),
 };
 
 enum : __kmpc_impl_lanemask_t {
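As a sanity check on the "instead of copying the values" part of the commit message, the hypothetical translation unit below (not part of the patch) asserts at compile time that the GridValues-backed constants reproduce the literals removed above; it assumes the real OMPGridValues.h and the old values visible in the deleted lines.

// Hypothetical check, not part of D108391: the constants derived from
// llvm::omp::GV must equal the hard-coded values this patch removes.
#include "llvm/Frontend/OpenMP/OMPGridValues.h"

using llvm::omp::AMDGPUGridValues;
using llvm::omp::NVPTXGridValues;

static_assert(AMDGPUGridValues.GV_Warp_Size == 64, "was: #define WARPSIZE 64");
static_assert(NVPTXGridValues.GV_Warp_Size == 32, "was: #define WARPSIZE 32");
static_assert(AMDGPUGridValues.GV_Max_WG_Size == 1024 &&
                  NVPTXGridValues.GV_Max_WG_Size == 1024,
              "was: #define MAX_THREADS_PER_TEAM 1024");
static_assert(AMDGPUGridValues.GV_Slot_Size == 256 &&
                  NVPTXGridValues.GV_Slot_Size == 256,
              "was: DS_Slot_Size = 256");
static_assert(AMDGPUGridValues.maxWarpNumber() == 16 &&
                  NVPTXGridValues.maxWarpNumber() == 32,
              "was: DS_Max_Warp_Number = 16 (amdgcn) / 32 (nvptx)");
static_assert(AMDGPUGridValues.warpSlotSize() == 64 * 256 &&
                  NVPTXGridValues.warpSlotSize() == 32 * 256,
              "was: DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size");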