diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/eigh_c_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/eigh_c_gpu_kernel.h
index 4980bcf876f..4b9828ab177 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/eigh_c_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/eigh_c_gpu_kernel.h
@@ -86,34 +86,30 @@ class EighcGpuKernel : public GpuKernel {
   bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
               const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
     // matrix A, input or output(eigenvector)
-    auto inout_A_addr = GetDeviceAddress<T>(inputs, 0);
+    auto inout_A_addr = GetDeviceAddress<T>(inputs, kDim0);
     if (lower_) {
       uplo_ = CUBLAS_FILL_MODE_LOWER;
     } else {
       uplo_ = CUBLAS_FILL_MODE_UPPER;
     }
     size_t lda_ = m_;
-    int *devInfo = nullptr;
-    cudaMalloc(reinterpret_cast<void **>(&devInfo), sizeof(int));
-    T *d_work = nullptr;
-    auto output_w_addr = GetDeviceAddress<T>(outputs, 0);
+    auto output_w_addr = GetDeviceAddress<T>(outputs, kDim0);
     // output eigenvector
-    auto output_v_addr = GetDeviceAddress<T>(outputs, 1);
+    auto output_v_addr = GetDeviceAddress<T>(outputs, kDim1);
+    int *devInfo = GetDeviceAddress<int>(workspace, kDim0);
     // temp output eigenvalues real scalar
-    auto w_w_addr = GetDeviceAddress<D>(workspace, 0);
-    auto w_w_c_addr = GetDeviceAddress<T>(workspace, 1);
+    auto w_w_addr = GetDeviceAddress<D>(workspace, kDim1);
+    auto w_w_c_addr = GetDeviceAddress<T>(workspace, kDim2);
     // temp eigenvector before transpose
-    auto w_v_addr = GetDeviceAddress<T>(workspace, 2);
+    auto w_v_addr = GetDeviceAddress<T>(workspace, kDim3);
     CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
                                cudaMemcpyAsync(output_v_addr, inout_A_addr, m_ * m_ * sizeof(T),
                                                cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "copy input matrix failed");
     size_t input_shape[kShape2dDims] = {m_, m_};
     size_t input_axis[kShape2dDims] = {1, 0};
-    size_t *dev_input_shape = nullptr;
-    cudaMalloc(reinterpret_cast<void **>(&dev_input_shape), kShape2dDims * sizeof(size_t));
-    size_t *dev_input_axis = nullptr;
-    cudaMalloc(reinterpret_cast<void **>(&dev_input_axis), kShape2dDims * sizeof(size_t));
+    size_t *dev_input_shape = GetDeviceAddress<size_t>(workspace, kDim4);
+    size_t *dev_input_axis = GetDeviceAddress<size_t>(workspace, kDim5);
     CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
                                cudaMemcpyAsync(dev_input_shape, input_shape, kShape2dDims * sizeof(size_t),
                                                cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
@@ -126,18 +122,22 @@ class EighcGpuKernel : public GpuKernel {
                     reinterpret_cast<cudaStream_t>(stream_ptr));
     int lwork = 0;
+    void *d_work = nullptr;
     if constexpr (std::is_same_v<T, Complex<float>>) {
       cusolverDnCheevd_bufferSize(cusolver_handle_, jobz_, uplo_, m_, reinterpret_cast<cuComplex *>(output_v_addr),
                                   lda_, w_w_addr, &lwork);
-      CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaMalloc(reinterpret_cast<void **>(&d_work), sizeof(T) * lwork),
-                                 "cal eigenvalues workspace failed");
-      cusolverDnCheevd(cusolver_handle_, jobz_, uplo_, m_, reinterpret_cast<cuComplex *>(w_v_addr), lda_, w_w_addr,
-                       reinterpret_cast<cuComplex *>(d_work), lwork, devInfo);
     } else {
       cusolverDnZheevd_bufferSize(cusolver_handle_, jobz_, uplo_, m_,
                                   reinterpret_cast<cuDoubleComplex *>(output_v_addr), lda_, w_w_addr, &lwork);
-      CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaMalloc(reinterpret_cast<void **>(&d_work), sizeof(T) * lwork),
-                                 "cal eigenvalues workspace failed");
+    }
+    d_work = device::gpu::GPUMemoryAllocator::GetInstance().AllocTensorMem(sizeof(T) * lwork);
+    if (!d_work) {
+      MS_LOG(EXCEPTION) << "GPU memory alloca failed.";
+    }
+    if constexpr (std::is_same_v<T, Complex<float>>) {
+      cusolverDnCheevd(cusolver_handle_, jobz_, uplo_, m_, reinterpret_cast<cuComplex *>(w_v_addr), lda_, w_w_addr,
+                       reinterpret_cast<cuComplex *>(d_work), lwork, devInfo);
+    } else {
       cusolverDnZheevd(cusolver_handle_, jobz_, uplo_, m_, reinterpret_cast<cuDoubleComplex *>(w_v_addr), lda_,
                        w_w_addr, reinterpret_cast<cuDoubleComplex *>(d_work), lwork, devInfo);
     }
@@ -145,28 +145,17 @@ class EighcGpuKernel : public GpuKernel {
                                cudaMemcpyAsync(w_w_c_addr, w_w_addr, m_ * sizeof(D), cudaMemcpyDeviceToDevice,
                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "copy eigenvalue from workspace to host failed");
+    // convert real scalar to complex
     RealToComplex(m_, reinterpret_cast<D *>(w_w_c_addr), reinterpret_cast<D *>(output_w_addr),
                   reinterpret_cast<cudaStream_t>(stream_ptr));
     CalTranspose(m_ * m_, w_v_addr, dev_input_shape, dev_input_axis, kShape2dDims, output_v_addr,
                  reinterpret_cast<cudaStream_t>(stream_ptr));
-    if (dev_input_shape) {
-      cudaFree(dev_input_shape);
-    }
-    if (dev_input_axis) {
-      cudaFree(dev_input_axis);
-    }
-    // convert real scalar to complex
-    if (d_work) {
-      cudaFree(d_work);
-    }
+    device::gpu::GPUMemoryAllocator::GetInstance().FreeTensorMem(d_work);
     int info_gpu = 0;
     CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
                                cudaMemcpyAsync(&info_gpu, devInfo, sizeof(int), cudaMemcpyDeviceToHost,
                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "copy eigenvalues to outpu failed");
-    if (devInfo) {
-      cudaFree(devInfo);
-    }
     if (info_gpu != 0) {
       MS_LOG_EXCEPTION << kernel_name_ << " launch gpu kernel fail for dtype:" << dtype_;
     }
@@ -180,11 +169,16 @@ class EighcGpuKernel : public GpuKernel {
     // eigenvalues, cuda output original real scalar, should covert to complex
     output_size_list_.push_back(m_ * sizeof(T));
     output_size_list_.push_back(m_ * m_ * sizeof(T));
+    // result
+    workspace_size_list_.push_back(sizeof(int));
     // for temp original eigenvalue real scalar
     workspace_size_list_.push_back(m_ * sizeof(D));
     // for temp pre-transpose complex mitrx
     workspace_size_list_.push_back(m_ * sizeof(T));
     workspace_size_list_.push_back(m_ * m_ * sizeof(T));
+    // transpose scalar workspace
+    workspace_size_list_.push_back(kShape2dDims * sizeof(size_t));
+    workspace_size_list_.push_back(kShape2dDims * sizeof(size_t));
   }
 
   size_t m_{1};
diff --git a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/eigh_gpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/eigh_gpu_kernel.h
index 0ef80e65625..831c8de1942 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/gpu/math/eigh_gpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/gpu/math/eigh_gpu_kernel.h
@@ -83,7 +83,8 @@ class EighGpuKernel : public GpuKernel {
     }
     auto output_addr = GetDeviceAddress<T>(outputs, kDim0);    // output eigenvalues
     auto output_v_addr = GetDeviceAddress<T>(outputs, kDim1);  // output eigenvalues
-    auto w_v_addr = GetDeviceAddress<T>(workspace, kDim0);  // temp eigenvector before transpose
+    int *devInfo = GetDeviceAddress<int>(workspace, kDim0);
+    auto w_v_addr = GetDeviceAddress<T>(workspace, kDim1);  // temp eigenvector before transpose
     CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
                                cudaMemcpyAsync(w_v_addr, inout_A_addr, m_ * m_ * sizeof(T), cudaMemcpyDeviceToDevice,
                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
@@ -95,38 +96,31 @@ class EighGpuKernel : public GpuKernel {
     } else if constexpr (std::is_same_v<T, double>) {
       cusolverDnDsyevd_bufferSize(cusolver_handle_, jobz_, uplo_, m_, inout_A_addr, lda_, output_addr, &lwork);
     }
-    int *devInfo = nullptr;
-    cudaMalloc(reinterpret_cast<void **>(&devInfo), sizeof(int));
-    T *d_work = nullptr;
-    cudaMalloc(reinterpret_cast<void **>(&d_work), sizeof(T) * lwork);
+
+    void *d_work = device::gpu::GPUMemoryAllocator::GetInstance().AllocTensorMem(sizeof(T) * lwork);
     if constexpr (std::is_same_v<T, float>) {
-      cusolverDnSsyevd(cusolver_handle_, jobz_, uplo_, m_, w_v_addr, lda_, output_addr, d_work, lwork, devInfo);
+      cusolverDnSsyevd(cusolver_handle_, jobz_, uplo_, m_, w_v_addr, lda_, output_addr, reinterpret_cast<T *>(d_work),
+                       lwork, devInfo);
     } else if constexpr (std::is_same_v<T, double>) {
-      cusolverDnDsyevd(cusolver_handle_, jobz_, uplo_, m_, w_v_addr, lda_, output_addr, d_work, lwork, devInfo);
+      cusolverDnDsyevd(cusolver_handle_, jobz_, uplo_, m_, w_v_addr, lda_, output_addr, reinterpret_cast<T *>(d_work),
+                       lwork, devInfo);
     }
     size_t input_shape[kShape2dDims] = {m_, m_};
     size_t input_axis[kShape2dDims] = {1, 0};
-    size_t *dev_input_shape = nullptr;
-    cudaMalloc(reinterpret_cast<void **>(&dev_input_shape), kShape2dDims * sizeof(size_t));
-    size_t *dev_input_axis = nullptr;
-    cudaMalloc(reinterpret_cast<void **>(&dev_input_axis), kShape2dDims * sizeof(size_t));
+    size_t *dev_input_shape = GetDeviceAddress<size_t>(workspace, kDim2);
+    size_t *dev_input_axis = GetDeviceAddress<size_t>(workspace, kDim3);
     cudaMemcpyAsync(dev_input_shape, input_shape, kShape2dDims * sizeof(size_t), cudaMemcpyHostToDevice,
                     reinterpret_cast<cudaStream_t>(stream_ptr));
     cudaMemcpyAsync(dev_input_axis, input_axis, kShape2dDims * sizeof(size_t), cudaMemcpyHostToDevice,
                     reinterpret_cast<cudaStream_t>(stream_ptr));
     CalTranspose(m_ * m_, w_v_addr, dev_input_shape, dev_input_axis, kShape2dDims, output_v_addr,
                  reinterpret_cast<cudaStream_t>(stream_ptr));
-    if (d_work) {
-      cudaFree(d_work);
-    }
+    device::gpu::GPUMemoryAllocator::GetInstance().FreeTensorMem(d_work);
     int info_gpu = 0;
     CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
                                cudaMemcpyAsync(&info_gpu, devInfo, sizeof(int), cudaMemcpyDeviceToHost,
                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "copy to device result failed");
-    if (devInfo) {
-      cudaFree(devInfo);
-    }
     if (info_gpu != 0) {
       MS_LOG_EXCEPTION << kernel_name_ << " launch gpu kernel fail for dtype:" << dtype_;
     }
@@ -141,7 +135,12 @@ class EighGpuKernel : public GpuKernel {
     output_size_list_.push_back(m_ * sizeof(T));
     // eigenvector
    output_size_list_.push_back(m_ * m_ * sizeof(T));
+    // result
+    workspace_size_list_.push_back(sizeof(int));
     workspace_size_list_.push_back(m_ * m_ * sizeof(T));
+    // transpose scalar workspace
+    workspace_size_list_.push_back(kShape2dDims * sizeof(size_t));
+    workspace_size_list_.push_back(kShape2dDims * sizeof(size_t));
   }
 
   size_t m_{1};
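Note on the pattern above: buffers whose sizes are known at kernel init time (the 4-byte devInfo flag and the two kShape2dDims-element transpose shape/axis descriptors) are now declared through workspace_size_list_ in InitSizeLists and fetched in Launch with GetDeviceAddress, while the cuSOLVER scratch buffer, whose size lwork is only known after the *_bufferSize query at launch time, comes from GPUMemoryAllocator::AllocTensorMem and is returned with FreeTensorMem instead of raw cudaMalloc/cudaFree. Below is a minimal standalone sketch of the underlying cuSOLVER flow both kernels wrap (query lwork, allocate scratch, run syevd, read devInfo back); it is illustrative only, not MindSpore code, it uses plain cudaMalloc for brevity, and the 4x4 test matrix and names are invented for the example.

// Standalone illustration (not MindSpore code): the query -> allocate -> solve -> check
// sequence used by the eigh kernels, written directly against cuSOLVER for float input.
// Build: nvcc eigh_sketch.cu -lcusolver
#include <cstdio>
#include <vector>
#include <cuda_runtime.h>
#include <cusolverDn.h>

int main() {
  const int m = 4;
  // Small symmetric test matrix (column-major; symmetry makes the layout irrelevant here).
  std::vector<float> host_a = {4, 1, 0, 0,
                               1, 3, 1, 0,
                               0, 1, 2, 1,
                               0, 0, 1, 1};

  cusolverDnHandle_t handle = nullptr;
  cusolverDnCreate(&handle);

  float *d_a = nullptr, *d_w = nullptr;
  int *dev_info = nullptr;
  cudaMalloc(&d_a, m * m * sizeof(float));
  cudaMalloc(&d_w, m * sizeof(float));
  cudaMalloc(&dev_info, sizeof(int));
  cudaMemcpy(d_a, host_a.data(), m * m * sizeof(float), cudaMemcpyHostToDevice);

  // 1) Ask cuSOLVER how much scratch space syevd needs for this problem size.
  int lwork = 0;
  cusolverDnSsyevd_bufferSize(handle, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_LOWER, m, d_a, m, d_w, &lwork);

  // 2) Allocate exactly lwork elements (the kernels above take this block from
  //    GPUMemoryAllocator::AllocTensorMem instead of cudaMalloc).
  float *d_work = nullptr;
  cudaMalloc(&d_work, lwork * sizeof(float));

  // 3) Compute eigenvalues (d_w) and eigenvectors (overwriting d_a), then check devInfo.
  cusolverDnSsyevd(handle, CUSOLVER_EIG_MODE_VECTOR, CUBLAS_FILL_MODE_LOWER, m, d_a, m, d_w, d_work, lwork, dev_info);
  int info = 0;
  cudaMemcpy(&info, dev_info, sizeof(int), cudaMemcpyDeviceToHost);

  std::vector<float> eigenvalues(m);
  cudaMemcpy(eigenvalues.data(), d_w, m * sizeof(float), cudaMemcpyDeviceToHost);
  printf("devInfo = %d (0 means converged)\n", info);
  for (int i = 0; i < m; ++i) {
    printf("w[%d] = %f\n", i, eigenvalues[i]);
  }

  cudaFree(d_work);
  cudaFree(dev_info);
  cudaFree(d_w);
  cudaFree(d_a);
  cusolverDnDestroy(handle);
  return 0;
}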
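The complex kernel additionally expands the real eigenvalues that cuSOLVER produces into the complex output dtype (the "convert real scalar to complex" step handled by RealToComplex). The sketch below is an illustrative stand-in for that operation, not the MindSpore implementation: it writes each real value into the real slot of an interleaved complex buffer and zeroes the imaginary slot.

// Illustrative stand-in only: shows the operation a RealToComplex-style helper performs,
// i.e. expanding n real eigenvalues into n complex values with zero imaginary parts.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void RealToComplexSketch(size_t n, const float *real_in, float *complex_out) {
  for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x) {
    complex_out[2 * i] = real_in[i];  // real part
    complex_out[2 * i + 1] = 0.0f;    // imaginary part is always zero
  }
}

int main() {
  const size_t n = 4;
  float host_real[n] = {1.0f, 2.0f, 3.0f, 4.0f};
  float *d_real = nullptr, *d_complex = nullptr;
  cudaMalloc(&d_real, n * sizeof(float));
  cudaMalloc(&d_complex, 2 * n * sizeof(float));
  cudaMemcpy(d_real, host_real, n * sizeof(float), cudaMemcpyHostToDevice);

  RealToComplexSketch<<<1, 128>>>(n, d_real, d_complex);

  float host_complex[2 * n] = {};
  cudaMemcpy(host_complex, d_complex, 2 * n * sizeof(float), cudaMemcpyDeviceToHost);
  for (size_t i = 0; i < n; ++i) {
    printf("(%g, %g)\n", host_complex[2 * i], host_complex[2 * i + 1]);
  }
  cudaFree(d_real);
  cudaFree(d_complex);
  return 0;
}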