forked from mindspore-Ecosystem/mindspore
!26672 Use GPU mem Allocator and workspace instead of self allocator
Merge pull request !26672 from wuwenbing/master
commit 953acc0335
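The pattern of the change: instead of each Launch() call doing its own cudaMalloc/cudaFree for devInfo, the transpose shape/axis buffers, and the cusolver work area, the fixed-size scratch buffers are now registered once in workspace_size_list_ (the framework then hands them to Launch() through the workspace vector), and only the size-dependent cusolver work buffer is taken from the pooled GPU allocator. Below is a minimal standalone sketch of that allocation half; MockAllocator is a hypothetical stand-in for the MindSpore-internal device::gpu::GPUMemoryAllocator, and the lwork value is illustrative.

#include <cstddef>
#include <cstdlib>
#include <iostream>

// MockAllocator is a hypothetical stand-in for device::gpu::GPUMemoryAllocator;
// the real class draws from a GPU memory pool instead of the host heap.
struct MockAllocator {
  static MockAllocator &GetInstance() {
    static MockAllocator inst;
    return inst;
  }
  void *AllocTensorMem(std::size_t size) { return std::malloc(size); }
  void FreeTensorMem(void *ptr) { std::free(ptr); }
};

int main() {
  std::size_t lwork = 1024;  // illustrative; the kernels get this from cusolverDn*_bufferSize
  void *d_work = MockAllocator::GetInstance().AllocTensorMem(sizeof(float) * lwork);
  if (!d_work) {
    std::cerr << "GPU memory alloc failed." << std::endl;
    return 1;
  }
  // ... the eigensolver would run here with d_work as its scratch area ...
  MockAllocator::GetInstance().FreeTensorMem(d_work);
  return 0;
}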
@@ -86,34 +86,30 @@ class EighcGpuKernel : public GpuKernel {
   bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
               const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
     // matrix A, input or output(eigenvector)
-    auto inout_A_addr = GetDeviceAddress<T>(inputs, 0);
+    auto inout_A_addr = GetDeviceAddress<T>(inputs, kDim0);
     if (lower_) {
       uplo_ = CUBLAS_FILL_MODE_LOWER;
     } else {
       uplo_ = CUBLAS_FILL_MODE_UPPER;
     }
     size_t lda_ = m_;
-    int *devInfo = nullptr;
-    cudaMalloc(reinterpret_cast<void **>(&devInfo), sizeof(int));
-    T *d_work = nullptr;
-    auto output_w_addr = GetDeviceAddress<T>(outputs, 0);
+    auto output_w_addr = GetDeviceAddress<T>(outputs, kDim0);
     // output eigenvector
-    auto output_v_addr = GetDeviceAddress<T>(outputs, 1);
+    auto output_v_addr = GetDeviceAddress<T>(outputs, kDim1);
+    int *devInfo = GetDeviceAddress<int>(workspace, kDim0);
     // temp output eigenvalues real scalar
-    auto w_w_addr = GetDeviceAddress<D>(workspace, 0);
-    auto w_w_c_addr = GetDeviceAddress<T>(workspace, 1);
+    auto w_w_addr = GetDeviceAddress<D>(workspace, kDim1);
+    auto w_w_c_addr = GetDeviceAddress<T>(workspace, kDim2);
     // temp eigenvector before transpose
-    auto w_v_addr = GetDeviceAddress<T>(workspace, 2);
+    auto w_v_addr = GetDeviceAddress<T>(workspace, kDim3);
     CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
                                cudaMemcpyAsync(output_v_addr, inout_A_addr, m_ * m_ * sizeof(T),
                                                cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "copy input matrix failed");
     size_t input_shape[kShape2dDims] = {m_, m_};
     size_t input_axis[kShape2dDims] = {1, 0};
-    size_t *dev_input_shape = nullptr;
-    cudaMalloc(reinterpret_cast<void **>(&dev_input_shape), kShape2dDims * sizeof(size_t));
-    size_t *dev_input_axis = nullptr;
-    cudaMalloc(reinterpret_cast<void **>(&dev_input_axis), kShape2dDims * sizeof(size_t));
+    size_t *dev_input_shape = GetDeviceAddress<size_t>(workspace, kDim4);
+    size_t *dev_input_axis = GetDeviceAddress<size_t>(workspace, kDim5);
     CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
                                cudaMemcpyAsync(dev_input_shape, input_shape, kShape2dDims * sizeof(size_t),
                                                cudaMemcpyHostToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
@@ -126,18 +122,22 @@ class EighcGpuKernel : public GpuKernel {
                                                reinterpret_cast<cudaStream_t>(stream_ptr));
 
     int lwork = 0;
+    void *d_work = nullptr;
     if constexpr (std::is_same_v<T, Complex<float>>) {
       cusolverDnCheevd_bufferSize(cusolver_handle_, jobz_, uplo_, m_, reinterpret_cast<cuComplex *>(output_v_addr),
                                   lda_, w_w_addr, &lwork);
-      CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaMalloc(reinterpret_cast<void **>(&d_work), sizeof(T) * lwork),
-                                 "cal eigenvalues workspace failed");
-      cusolverDnCheevd(cusolver_handle_, jobz_, uplo_, m_, reinterpret_cast<cuComplex *>(w_v_addr), lda_, w_w_addr,
-                       reinterpret_cast<cuComplex *>(d_work), lwork, devInfo);
     } else {
       cusolverDnZheevd_bufferSize(cusolver_handle_, jobz_, uplo_, m_,
                                   reinterpret_cast<cuDoubleComplex *>(output_v_addr), lda_, w_w_addr, &lwork);
-      CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaMalloc(reinterpret_cast<void **>(&d_work), sizeof(T) * lwork),
-                                 "cal eigenvalues workspace failed");
+    }
+    d_work = device::gpu::GPUMemoryAllocator::GetInstance().AllocTensorMem(sizeof(T) * lwork);
+    if (!d_work) {
+      MS_LOG(EXCEPTION) << "GPU memory alloca failed.";
+    }
+    if constexpr (std::is_same_v<T, Complex<float>>) {
+      cusolverDnCheevd(cusolver_handle_, jobz_, uplo_, m_, reinterpret_cast<cuComplex *>(w_v_addr), lda_, w_w_addr,
+                       reinterpret_cast<cuComplex *>(d_work), lwork, devInfo);
+    } else {
       cusolverDnZheevd(cusolver_handle_, jobz_, uplo_, m_, reinterpret_cast<cuDoubleComplex *>(w_v_addr), lda_,
                        w_w_addr, reinterpret_cast<cuDoubleComplex *>(d_work), lwork, devInfo);
     }
@@ -145,28 +145,17 @@ class EighcGpuKernel : public GpuKernel {
                                cudaMemcpyAsync(w_w_c_addr, w_w_addr, m_ * sizeof(D), cudaMemcpyDeviceToDevice,
                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "copy eigenvalue from workspace to host failed");
+    // convert real scalar to complex
     RealToComplex(m_, reinterpret_cast<D *>(w_w_c_addr), reinterpret_cast<D *>(output_w_addr),
                   reinterpret_cast<cudaStream_t>(stream_ptr));
     CalTranspose(m_ * m_, w_v_addr, dev_input_shape, dev_input_axis, kShape2dDims, output_v_addr,
                  reinterpret_cast<cudaStream_t>(stream_ptr));
-    if (dev_input_shape) {
-      cudaFree(dev_input_shape);
-    }
-    if (dev_input_axis) {
-      cudaFree(dev_input_axis);
-    }
-    // convert real scalar to complex
-    if (d_work) {
-      cudaFree(d_work);
-    }
+    device::gpu::GPUMemoryAllocator::GetInstance().FreeTensorMem(d_work);
     int info_gpu = 0;
     CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
                                cudaMemcpyAsync(&info_gpu, devInfo, sizeof(int), cudaMemcpyDeviceToHost,
                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "copy eigenvalues to outpu failed");
-    if (devInfo) {
-      cudaFree(devInfo);
-    }
     if (info_gpu != 0) {
       MS_LOG_EXCEPTION << kernel_name_ << " launch gpu kernel fail for dtype:" << dtype_;
     }
@@ -180,11 +169,16 @@ class EighcGpuKernel : public GpuKernel {
     // eigenvalues, cuda output original real scalar, should covert to complex<ft32/64>
     output_size_list_.push_back(m_ * sizeof(T));
     output_size_list_.push_back(m_ * m_ * sizeof(T));
+    // result
+    workspace_size_list_.push_back(sizeof(int));
     // for temp original eigenvalue real scalar
     workspace_size_list_.push_back(m_ * sizeof(D));
     // for temp pre-transpose complex mitrx
     workspace_size_list_.push_back(m_ * sizeof(T));
     workspace_size_list_.push_back(m_ * m_ * sizeof(T));
+    // transpose scalar workspace
+    workspace_size_list_.push_back(kShape2dDims * sizeof(size_t));
+    workspace_size_list_.push_back(kShape2dDims * sizeof(size_t));
   }
 
   size_t m_{1};
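The same rewrite is applied below to the real-valued EighGpuKernel; there AllocTensorMem returns void *, hence the reinterpret_cast<T *> at the cusolver call sites. One caveat the diff keeps manual: FreeTensorMem only runs if no checked call throws between allocation and release. A hypothetical RAII guard, not part of this commit, could release the pooled buffer on every exit path; in the sketch below, host malloc/free stand in for GPUMemoryAllocator::{AllocTensorMem, FreeTensorMem}.

#include <cstddef>
#include <cstdlib>

// Stand-ins for GPUMemoryAllocator::{AllocTensorMem, FreeTensorMem}.
void *AllocTensorMem(std::size_t size) { return std::malloc(size); }
void FreeTensorMem(void *ptr) { std::free(ptr); }

// Hypothetical guard: frees the pooled buffer on every exit path,
// including an exception thrown before a manual free would be reached.
class PooledMemGuard {
 public:
  explicit PooledMemGuard(std::size_t size) : ptr_(AllocTensorMem(size)) {}
  ~PooledMemGuard() {
    if (ptr_ != nullptr) {
      FreeTensorMem(ptr_);
    }
  }
  PooledMemGuard(const PooledMemGuard &) = delete;
  PooledMemGuard &operator=(const PooledMemGuard &) = delete;
  void *get() const { return ptr_; }

 private:
  void *ptr_;
};

int main() {
  PooledMemGuard d_work(4096);     // released automatically when main returns
  return d_work.get() == nullptr;  // nonzero exit if the pool were exhausted
}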
@@ -83,7 +83,8 @@ class EighGpuKernel : public GpuKernel {
     }
     auto output_addr = GetDeviceAddress<T>(outputs, kDim0);    // output eigenvalues
     auto output_v_addr = GetDeviceAddress<T>(outputs, kDim1);  // output eigenvalues
-    auto w_v_addr = GetDeviceAddress<T>(workspace, kDim0);  // temp eigenvector before transpose
+    int *devInfo = GetDeviceAddress<int>(workspace, kDim0);
+    auto w_v_addr = GetDeviceAddress<T>(workspace, kDim1);  // temp eigenvector before transpose
     CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
                                cudaMemcpyAsync(w_v_addr, inout_A_addr, m_ * m_ * sizeof(T), cudaMemcpyDeviceToDevice,
                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
@@ -95,38 +96,31 @@ class EighGpuKernel : public GpuKernel {
     } else if constexpr (std::is_same_v<T, double>) {
       cusolverDnDsyevd_bufferSize(cusolver_handle_, jobz_, uplo_, m_, inout_A_addr, lda_, output_addr, &lwork);
     }
-    int *devInfo = nullptr;
-    cudaMalloc(reinterpret_cast<void **>(&devInfo), sizeof(int));
-    T *d_work = nullptr;
-    cudaMalloc(reinterpret_cast<void **>(&d_work), sizeof(T) * lwork);
+    void *d_work = device::gpu::GPUMemoryAllocator::GetInstance().AllocTensorMem(sizeof(T) * lwork);
     if constexpr (std::is_same_v<T, float>) {
-      cusolverDnSsyevd(cusolver_handle_, jobz_, uplo_, m_, w_v_addr, lda_, output_addr, d_work, lwork, devInfo);
+      cusolverDnSsyevd(cusolver_handle_, jobz_, uplo_, m_, w_v_addr, lda_, output_addr, reinterpret_cast<T *>(d_work),
+                       lwork, devInfo);
     } else if constexpr (std::is_same_v<T, double>) {
-      cusolverDnDsyevd(cusolver_handle_, jobz_, uplo_, m_, w_v_addr, lda_, output_addr, d_work, lwork, devInfo);
+      cusolverDnDsyevd(cusolver_handle_, jobz_, uplo_, m_, w_v_addr, lda_, output_addr, reinterpret_cast<T *>(d_work),
+                       lwork, devInfo);
     }
     size_t input_shape[kShape2dDims] = {m_, m_};
     size_t input_axis[kShape2dDims] = {1, 0};
-    size_t *dev_input_shape = nullptr;
-    cudaMalloc(reinterpret_cast<void **>(&dev_input_shape), kShape2dDims * sizeof(size_t));
-    size_t *dev_input_axis = nullptr;
-    cudaMalloc(reinterpret_cast<void **>(&dev_input_axis), kShape2dDims * sizeof(size_t));
+    size_t *dev_input_shape = GetDeviceAddress<size_t>(workspace, kDim2);
+    size_t *dev_input_axis = GetDeviceAddress<size_t>(workspace, kDim3);
     cudaMemcpyAsync(dev_input_shape, input_shape, kShape2dDims * sizeof(size_t), cudaMemcpyHostToDevice,
                     reinterpret_cast<cudaStream_t>(stream_ptr));
     cudaMemcpyAsync(dev_input_axis, input_axis, kShape2dDims * sizeof(size_t), cudaMemcpyHostToDevice,
                     reinterpret_cast<cudaStream_t>(stream_ptr));
     CalTranspose(m_ * m_, w_v_addr, dev_input_shape, dev_input_axis, kShape2dDims, output_v_addr,
                  reinterpret_cast<cudaStream_t>(stream_ptr));
-    if (d_work) {
-      cudaFree(d_work);
-    }
+    device::gpu::GPUMemoryAllocator::GetInstance().FreeTensorMem(d_work);
     int info_gpu = 0;
     CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
                                cudaMemcpyAsync(&info_gpu, devInfo, sizeof(int), cudaMemcpyDeviceToHost,
                                                reinterpret_cast<cudaStream_t>(stream_ptr)),
                                "copy to device result failed");
-    if (devInfo) {
-      cudaFree(devInfo);
-    }
     if (info_gpu != 0) {
       MS_LOG_EXCEPTION << kernel_name_ << " launch gpu kernel fail for dtype:" << dtype_;
     }
@@ -141,7 +135,12 @@ class EighGpuKernel : public GpuKernel {
     output_size_list_.push_back(m_ * sizeof(T));
     // eigenvector
     output_size_list_.push_back(m_ * m_ * sizeof(T));
+    // result
+    workspace_size_list_.push_back(sizeof(int));
     workspace_size_list_.push_back(m_ * m_ * sizeof(T));
+    // transpose scalar workspace
+    workspace_size_list_.push_back(kShape2dDims * sizeof(size_t));
+    workspace_size_list_.push_back(kShape2dDims * sizeof(size_t));
   }
 
   size_t m_{1};
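Both kernels rely on the same contract: the i-th entry pushed into workspace_size_list_ in InitSizeLists() becomes the buffer returned by GetDeviceAddress<T>(workspace, i) in Launch(), which is why inserting devInfo at kDim0 shifts every later workspace index up by one. A minimal sketch of that contract follows; Address/AddressPtr are simplified stand-ins for the framework's types, and host malloc stands in for device allocation.

#include <cstddef>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <vector>

// Simplified stand-in for the framework's AddressPtr.
struct Address {
  void *addr;
  std::size_t size;
};
using AddressPtr = std::shared_ptr<Address>;

template <typename T>
T *GetDeviceAddress(const std::vector<AddressPtr> &list, std::size_t index) {
  return reinterpret_cast<T *>(list[index]->addr);
}

int main() {
  const std::size_t m = 4;
  const std::size_t kShape2dDims = 2;
  // Mirrors EighcGpuKernel's InitSizeLists() after this commit (D = double shown):
  std::vector<std::size_t> workspace_size_list = {
      sizeof(int),                          // kDim0: devInfo
      m * sizeof(double),                   // kDim1: w_w_addr, real eigenvalues
      m * 2 * sizeof(double),               // kDim2: w_w_c_addr, eigenvalues widened to complex
      m * m * 2 * sizeof(double),           // kDim3: w_v_addr, eigenvectors before transpose
      kShape2dDims * sizeof(std::size_t),   // kDim4: dev_input_shape
      kShape2dDims * sizeof(std::size_t)};  // kDim5: dev_input_axis
  // The framework performs this allocation; host malloc stands in here.
  std::vector<AddressPtr> workspace;
  for (std::size_t sz : workspace_size_list) {
    workspace.push_back(std::make_shared<Address>(Address{std::malloc(sz), sz}));
  }
  int *devInfo = GetDeviceAddress<int>(workspace, 0);  // kDim0, as in the diff
  *devInfo = 0;
  std::cout << "devInfo buffer holds " << workspace[0]->size << " bytes" << std::endl;
  for (auto &w : workspace) {
    std::free(w->addr);
  }
  return 0;
}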