From 39da4c5f6b4e4fd571756fefd360447b5eae374d Mon Sep 17 00:00:00 2001 From: zhanzhan1 Date: Thu, 2 Mar 2023 11:56:11 +0800 Subject: [PATCH] fix reducesum input change --- .../gpu/kernel/arrays/array_reduce_gpu_kernel.cc | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/array_reduce_gpu_kernel.cc b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/array_reduce_gpu_kernel.cc index bcf823a5101..c86d4be941e 100644 --- a/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/array_reduce_gpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/gpu/kernel/arrays/array_reduce_gpu_kernel.cc @@ -209,6 +209,8 @@ bool ArrayReduceGpuKernelMod::Init(const BaseOperatorPtr &base_operator, const s void ArrayReduceGpuKernelMod::InitCudnnResource() { CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnGetTensorSizeInBytes(inputA_descriptor_, &input_size_), "cudnnGetTensorSizeInBytes failed."); + CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE(cudnnGetTensorSizeInBytes(outputC_descriptor_, &output_size_), + "cudnnGetTensorSizeInBytes failed."); CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE( cudnnGetReductionWorkspaceSize(cudnn_handle_, reduce_tensor_descriptor_, inputA_descriptor_, outputC_descriptor_, @@ -351,16 +353,16 @@ void ArrayReduceGpuKernelMod::LaunchIntKernel(const std::vector &inp const std::vector &outputs, void *stream_ptr) { S *input_addr = GetDeviceAddress(inputs, 0); S *output_addr = GetDeviceAddress(outputs, 0); - + S *workspace_addr = GetPossiblyNullDeviceAddress(workspace, 0); T alpha = static_cast(1.0f); T beta = static_cast(0.0f); - S *workspace_addr = GetPossiblyNullDeviceAddress(workspace, 0); - T *casted_input = GetDeviceAddress(inputs, 0); - T *output_before_cast = GetDeviceAddress(outputs, 0); + T *casted_input = reinterpret_cast(device::gpu::GPUMemoryAllocator::GetInstance().AllocTensorMem(input_size_)); + T *output_before_cast = + reinterpret_cast(device::gpu::GPUMemoryAllocator::GetInstance().AllocTensorMem(output_size_)); const int input_num = input_size_ / sizeof(T); - const int output_num = output_size_list_[kIndex0] / sizeof(S); + const int output_num = output_size_ / sizeof(S); Cast(input_num, input_addr, casted_input, reinterpret_cast(stream_ptr), GET_CTX_DEVICE_ID); CHECK_CUDNN_RET_WITH_EXCEPT_NOTRACE( @@ -368,6 +370,8 @@ void ArrayReduceGpuKernelMod::LaunchIntKernel(const std::vector &inp inputA_descriptor_, casted_input, &beta, outputC_descriptor_, output_before_cast), "cudnnReduceTensor failed."); Cast(output_num, output_before_cast, output_addr, reinterpret_cast(stream_ptr), GET_CTX_DEVICE_ID); + device::gpu::GPUMemoryAllocator::GetInstance().FreeTensorMem(casted_input); + device::gpu::GPUMemoryAllocator::GetInstance().FreeTensorMem(output_before_cast); return; }