Add register limit constraint

This commit is contained in:
lishanni513 2021-06-22 19:34:41 +08:00
parent e7ea93dacd
commit 8c98146c76
3 changed files with 48 additions and 1 deletions

View File

@ -26,6 +26,11 @@ using std::fstream;
using std::string;
using std::vector;
const int MAX_REGISTER_PER_THREAD_BLOCK = 65536;
const int REGISTER_UNIT_IN_WARP = 256;
const int WARP_SIZE = 32;
const int WARP_ALLOC_GRAN = 4;
GpuKernelManagerPtr GpuKernelMod::kernelmanager_ = std::make_shared<GpuKernelManager>();
GpuKernelManager::GpuKernelManager() {}
@ -54,8 +59,20 @@ CUresult GpuKernelManager::GetFunction(const KernelPackPtr &kernel_pack, bool fo
thread_info->emplace_back(js["threadIdx.x"]);
thread_info->emplace_back(js["threadIdx.y"]);
thread_info->emplace_back(js["threadIdx.z"]);
CUmodule module;
CUresult result = cuModuleLoadData(&module, kernel_pack->GetKernel()->contents);
CUjit_option options[1];
options[0] = CU_JIT_MAX_REGISTERS;
void *values[1];
int total_threads = thread_info->at(3) * thread_info->at(4) * thread_info->at(5);
int total_warps = std::ceil(static_cast<float>(total_threads) / static_cast<float>(WARP_SIZE));
int limit_warps = (total_warps + WARP_ALLOC_GRAN - 1) / WARP_ALLOC_GRAN * WARP_ALLOC_GRAN;
int total_register_unit_nums = MAX_REGISTER_PER_THREAD_BLOCK / REGISTER_UNIT_IN_WARP;
int register_unit_nums_per_warp = total_register_unit_nums / limit_warps;
int64_t register_nums = (register_unit_nums_per_warp * REGISTER_UNIT_IN_WARP) / WARP_SIZE;
values[0] = reinterpret_cast<void *>(register_nums);
CUresult result = cuModuleLoadDataEx(&module, kernel_pack->GetKernel()->contents, 1, options, values);
if (result != CUDA_SUCCESS) {
MS_LOG(ERROR) << "cuModuleLoadData failed.";
return result;

View File

@ -17,6 +17,9 @@
CUresult cuModuleLoadData(CUmodule *module, const void *image) { return CUDA_SUCCESS; }
CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions,
CUjit_option *options, void **optionValues) { return CUDA_SUCCESS; }
CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name) { return CUDA_SUCCESS; }
CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,

View File

@ -22,6 +22,31 @@ typedef enum cudaError_enum {
CUDA_ERROR_DEINITIALIZED = 2,
} CUresult;
typedef enum CUjit_option_enum
{
CU_JIT_MAX_REGISTERS = 0,
CU_JIT_THREADS_PER_BLOCK,
CU_JIT_WALL_TIME,
CU_JIT_INFO_LOG_BUFFER,
CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
CU_JIT_ERROR_LOG_BUFFER,
CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES,
CU_JIT_OPTIMIZATION_LEVEL,
CU_JIT_TARGET_FROM_CUCONTEXT,
CU_JIT_TARGET,
CU_JIT_FALLBACK_STRATEGY,
CU_JIT_GENERATE_DEBUG_INFO,
CU_JIT_LOG_VERBOSE,
CU_JIT_GENERATE_LINE_INFO,
CU_JIT_CACHE_MODE,
CU_JIT_NEW_SM3X_OPT,
CU_JIT_FAST_COMPILE,
CU_JIT_GLOBAL_SYMBOL_NAMES,
CU_JIT_GLOBAL_SYMBOL_ADDRESSES,
CU_JIT_GLOBAL_SYMBOL_COUNT,
CU_JIT_NUM_OPTIONS
} CUjit_option;
struct CUctx_st {
int arch;
};
@ -40,7 +65,9 @@ typedef struct CUmod_st *CUmodule;
typedef struct CUfunc_st *CUfunction;
typedef struct CUstream_st *CUstream;
CUresult cuModuleLoadData(CUmodule *module, const void *image);
CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues);
CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name);
CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ,
unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,