diff --git a/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_mod.cc b/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_mod.cc index 0af8ee6c820..3cdb095ab41 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_mod.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/akg/gpu/akg_gpu_kernel_mod.cc @@ -26,6 +26,11 @@ using std::fstream; using std::string; using std::vector; +const int MAX_REGISTER_PER_THREAD_BLOCK = 65536; +const int REGISTER_UNIT_IN_WARP = 256; +const int WARP_SIZE = 32; +const int WARP_ALLOC_GRAN = 4; + GpuKernelManagerPtr GpuKernelMod::kernelmanager_ = std::make_shared(); GpuKernelManager::GpuKernelManager() {} @@ -54,8 +59,20 @@ CUresult GpuKernelManager::GetFunction(const KernelPackPtr &kernel_pack, bool fo thread_info->emplace_back(js["threadIdx.x"]); thread_info->emplace_back(js["threadIdx.y"]); thread_info->emplace_back(js["threadIdx.z"]); + CUmodule module; - CUresult result = cuModuleLoadData(&module, kernel_pack->GetKernel()->contents); + CUjit_option options[1]; + options[0] = CU_JIT_MAX_REGISTERS; + void *values[1]; + int total_threads = thread_info->at(3) * thread_info->at(4) * thread_info->at(5); + int total_warps = std::ceil(static_cast(total_threads) / static_cast(WARP_SIZE)); + int limit_warps = (total_warps + WARP_ALLOC_GRAN - 1) / WARP_ALLOC_GRAN * WARP_ALLOC_GRAN; + int total_register_unit_nums = MAX_REGISTER_PER_THREAD_BLOCK / REGISTER_UNIT_IN_WARP; + int register_unit_nums_per_warp = total_register_unit_nums / limit_warps; + int64_t register_nums = (register_unit_nums_per_warp * REGISTER_UNIT_IN_WARP) / WARP_SIZE; + values[0] = reinterpret_cast(register_nums); + + CUresult result = cuModuleLoadDataEx(&module, kernel_pack->GetKernel()->contents, 1, options, values); if (result != CUDA_SUCCESS) { MS_LOG(ERROR) << "cuModuleLoadData failed."; return result; diff --git a/tests/ut/cpp/stub/runtime/cuda.cc b/tests/ut/cpp/stub/runtime/cuda.cc index 5747746cc49..a9595ec6e76 100644 --- a/tests/ut/cpp/stub/runtime/cuda.cc +++ b/tests/ut/cpp/stub/runtime/cuda.cc @@ -17,6 +17,9 @@ CUresult cuModuleLoadData(CUmodule *module, const void *image) { return CUDA_SUCCESS; } +CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, + CUjit_option *options, void **optionValues) { return CUDA_SUCCESS; } + CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name) { return CUDA_SUCCESS; } CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, diff --git a/tests/ut/cpp/stub/runtime/cuda.h b/tests/ut/cpp/stub/runtime/cuda.h index ca30a355d6b..dc4d326acc6 100644 --- a/tests/ut/cpp/stub/runtime/cuda.h +++ b/tests/ut/cpp/stub/runtime/cuda.h @@ -22,6 +22,31 @@ typedef enum cudaError_enum { CUDA_ERROR_DEINITIALIZED = 2, } CUresult; +typedef enum CUjit_option_enum +{ + CU_JIT_MAX_REGISTERS = 0, + CU_JIT_THREADS_PER_BLOCK, + CU_JIT_WALL_TIME, + CU_JIT_INFO_LOG_BUFFER, + CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, + CU_JIT_ERROR_LOG_BUFFER, + CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES, + CU_JIT_OPTIMIZATION_LEVEL, + CU_JIT_TARGET_FROM_CUCONTEXT, + CU_JIT_TARGET, + CU_JIT_FALLBACK_STRATEGY, + CU_JIT_GENERATE_DEBUG_INFO, + CU_JIT_LOG_VERBOSE, + CU_JIT_GENERATE_LINE_INFO, + CU_JIT_CACHE_MODE, + CU_JIT_NEW_SM3X_OPT, + CU_JIT_FAST_COMPILE, + CU_JIT_GLOBAL_SYMBOL_NAMES, + CU_JIT_GLOBAL_SYMBOL_ADDRESSES, + CU_JIT_GLOBAL_SYMBOL_COUNT, + CU_JIT_NUM_OPTIONS +} CUjit_option; + struct CUctx_st { int arch; }; @@ -40,7 +65,9 @@ typedef struct CUmod_st *CUmodule; typedef struct CUfunc_st *CUfunction; typedef struct CUstream_st *CUstream; + CUresult cuModuleLoadData(CUmodule *module, const void *image); +CUresult cuModuleLoadDataEx(CUmodule *module, const void *image, unsigned int numOptions, CUjit_option *options, void **optionValues); CUresult cuModuleGetFunction(CUfunction *hfunc, CUmodule hmod, const char *name); CUresult cuLaunchKernel(CUfunction f, unsigned int gridDimX, unsigned int gridDimY, unsigned int gridDimZ, unsigned int blockDimX, unsigned int blockDimY, unsigned int blockDimZ,