forked from mindspore-Ecosystem/mindspore
Add register limit constraint
This commit is contained in:
parent
e7ea93dacd
commit
8c98146c76
|
@ -26,6 +26,11 @@ using std::fstream;
|
|||
using std::string;
|
||||
using std::vector;
|
||||
|
||||
const int MAX_REGISTER_PER_THREAD_BLOCK = 65536;
|
||||
const int REGISTER_UNIT_IN_WARP = 256;
|
||||
const int WARP_SIZE = 32;
|
||||
const int WARP_ALLOC_GRAN = 4;
|
||||
|
||||
GpuKernelManagerPtr GpuKernelMod::kernelmanager_ = std::make_shared<GpuKernelManager>();
|
||||
GpuKernelManager::GpuKernelManager() {}
|
||||
|
||||
|
@ -54,8 +59,20 @@ CUresult GpuKernelManager::GetFunction(const KernelPackPtr &kernel_pack, bool fo
|
|||
thread_info->emplace_back(js["threadIdx.x"]);
|
||||
thread_info->emplace_back(js["threadIdx.y"]);
|
||||
thread_info->emplace_back(js["threadIdx.z"]);
|
||||
|
||||
CUmodule module;
|
||||
CUresult result = cuModuleLoadData(&module, kernel_pack->GetKernel()->contents);
|
||||
CUjit_option options[1];
|
||||
options[0] = CU_JIT_MAX_REGISTERS;
|
||||
void *values[1];
|
||||
int total_threads = thread_info->at(3) * thread_info->at(4) * thread_info->at(5);
|
||||
int total_warps = std::ceil(static_cast<float>(total_threads) / static_cast<float>(WARP_SIZE));
|
||||
int limit_warps = (total_warps + WARP_ALLOC_GRAN - 1) / WARP_ALLOC_GRAN * WARP_ALLOC_GRAN;
|
||||
int total_register_unit_nums = MAX_REGISTER_PER_THREAD_BLOCK / REGISTER_UNIT_IN_WARP;
|
||||
int register_unit_nums_per_warp = total_register_unit_nums / limit_warps;
|
||||
int64_t register_nums = (register_unit_nums_per_warp * REGISTER_UNIT_IN_WARP) / WARP_SIZE;
|
||||
values[0] = reinterpret_cast<void *>(register_nums);
|
||||
|
||||
CUresult result = cuModuleLoadDataEx(&module, kernel_pack->GetKernel()->contents, 1, options, values);
|
||||
if (result != CUDA_SUCCESS) {
|
||||
MS_LOG(ERROR) << "cuModuleLoadData failed.";
|
||||
return result;
|
||||
|
|
|
@ -17,6 +17,9 @@
|
|||
|
||||
CUresult cuModuleLoadData(CUmodule *module, const void *image) { return CUDA_SUCCESS; }
|
||||
|
||||
CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions,
|
||||
CUjit_option *options, void **optionValues) { return CUDA_SUCCESS; }
|
||||
|
||||
CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name) { return CUDA_SUCCESS; }
|
||||
|
||||
CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
|
||||
|
|
|
@ -22,6 +22,31 @@ typedef enum cudaError_enum {
|
|||
CUDA_ERROR_DEINITIALIZED = 2,
|
||||
} CUresult;
|
||||
|
||||
typedef enum CUjit_option_enum
|
||||
{
|
||||
CU_JIT_MAX_REGISTERS = 0,
|
||||
CU_JIT_THREADS_PER_BLOCK,
|
||||
CU_JIT_WALL_TIME,
|
||||
CU_JIT_INFO_LOG_BUFFER,
|
||||
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
|
||||
CU_JIT_ERROR_LOG_BUFFER,
|
||||
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
|
||||
CU_JIT_OPTIMIZATION_LEVEL,
|
||||
CU_JIT_TARGET_FROM_CUCONTEXT,
|
||||
CU_JIT_TARGET,
|
||||
CU_JIT_FALLBACK_STRATEGY,
|
||||
CU_JIT_GENERATE_DEBUG_INFO,
|
||||
CU_JIT_LOG_VERBOSE,
|
||||
CU_JIT_GENERATE_LINE_INFO,
|
||||
CU_JIT_CACHE_MODE,
|
||||
CU_JIT_NEW_SM3X_OPT,
|
||||
CU_JIT_FAST_COMPILE,
|
||||
CU_JIT_GLOBAL_SYMBOL_NAMES,
|
||||
CU_JIT_GLOBAL_SYMBOL_ADDRESSES,
|
||||
CU_JIT_GLOBAL_SYMBOL_COUNT,
|
||||
CU_JIT_NUM_OPTIONS
|
||||
} CUjit_option;
|
||||
|
||||
struct CUctx_st {
|
||||
int arch;
|
||||
};
|
||||
|
@ -40,7 +65,9 @@ typedef struct CUmod_st *CUmodule;
|
|||
typedef struct CUfunc_st *CUfunction;
|
||||
typedef struct CUstream_st *CUstream;
|
||||
|
||||
|
||||
CUresult cuModuleLoadData(CUmodule *module, const void *image);
|
||||
CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
|
||||
CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
|
||||
CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
|
||||
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,
|
||||
|
|
Loading…
Reference in New Issue