!40685 fix col2im complex128 bug on master

Merge pull request !40685 from baihuawei/col2im0822
This commit is contained in:
i-robot 2022-08-22 09:28:52 +00:00 committed by Gitee
commit 100757c3fc
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
5 changed files with 94 additions and 91 deletions

View File

@@ -26,34 +26,35 @@ template <typename T>
using Complex = mindspore::utils::Complex<T>;
template <typename T, typename S>
__global__ void Col2ImKernel(const T *input, T *output, const size_t num_kernels, const size_t per_batch_size,
const size_t per_channel_size, const size_t per_col_batch_size, const size_t out_height,
const size_t out_width, const size_t in_height, const size_t in_width,
const size_t kernel_height, const size_t kernel_width, const size_t pad_height,
const size_t pad_width, const size_t stride_height, const size_t stride_width,
const size_t dilation_height, const size_t dilation_width) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_kernels; i += blockDim.x * gridDim.x) {
__global__ void Col2ImKernel(const T *input, T *output, const uint32_t num_kernels, const uint32_t per_batch_size,
const uint32_t per_channel_size, const uint32_t per_col_batch_size,
const uint32_t out_height, const uint32_t out_width, const uint32_t in_height,
const uint32_t in_width, const uint32_t kernel_height, const uint32_t kernel_width,
const uint32_t pad_height, const uint32_t pad_width, const uint32_t stride_height,
const uint32_t stride_width, const uint32_t dilation_height,
const uint32_t dilation_width) {
for (uint32_t i = blockIdx.x * blockDim.x + threadIdx.x; i < num_kernels; i += blockDim.x * gridDim.x) {
S val = static_cast<S>(0);
size_t w_id = i % out_width + pad_width;
size_t h_id = i % per_batch_size / out_width % out_height + pad_height;
size_t c_id = i % per_batch_size / per_channel_size;
size_t n_col_offset = i / per_batch_size * per_col_batch_size;
size_t kernel_expand_h = (kernel_height - 1) * dilation_height + 1;
size_t kernel_expand_w = (kernel_width - 1) * dilation_width + 1;
uint32_t w_id = i % out_width + pad_width;
uint32_t h_id = i % per_batch_size / out_width % out_height + pad_height;
uint32_t c_id = i % per_batch_size / per_channel_size;
uint32_t n_col_offset = i / per_batch_size * per_col_batch_size;
uint32_t kernel_expand_h = (kernel_height - 1) * dilation_height + 1;
uint32_t kernel_expand_w = (kernel_width - 1) * dilation_width + 1;
// range coordinates
size_t out_height_start = h_id < kernel_expand_h ? 0 : (h_id - kernel_expand_h) / stride_height + 1;
size_t out_width_start = w_id < kernel_expand_w ? 0 : (w_id - kernel_expand_w) / stride_width + 1;
size_t out_height_end = min(h_id / stride_height + 1, in_height);
size_t out_width_end = min(w_id / stride_width + 1, in_width);
uint32_t out_height_start = h_id < kernel_expand_h ? 0 : (h_id - kernel_expand_h) / stride_height + 1;
uint32_t out_width_start = w_id < kernel_expand_w ? 0 : (w_id - kernel_expand_w) / stride_width + 1;
uint32_t out_height_end = min(h_id / stride_height + 1, in_height);
uint32_t out_width_end = min(w_id / stride_width + 1, in_width);
for (size_t height = out_height_start; height < out_height_end; ++height) {
for (size_t width = out_width_start; width < out_width_end; ++width) {
size_t kernel_h = (h_id - height * stride_height);
size_t kernel_w = (w_id - width * stride_width);
for (uint32_t height = out_height_start; height < out_height_end; ++height) {
for (uint32_t width = out_width_start; width < out_width_end; ++width) {
uint32_t kernel_h = (h_id - height * stride_height);
uint32_t kernel_w = (w_id - width * stride_width);
if (kernel_h % dilation_height == 0 && kernel_w % dilation_width == 0) {
kernel_h /= dilation_height;
kernel_w /= dilation_width;
size_t data_index =
uint32_t data_index =
n_col_offset +
(((c_id * kernel_height + kernel_h) * kernel_width + kernel_w) * in_height + height) * in_width + width;
val += (S)input[data_index];
@@ -65,15 +66,15 @@ __global__ void Col2ImKernel(const T *input, T *output, const size_t num_kernels
}
template <typename T, typename S>
void Col2Im(const T *input, const size_t batch_size, const size_t channels, const size_t out_height,
const size_t out_width, const size_t in_height, const size_t in_width, const size_t kernel_height,
const size_t kernel_width, const size_t pad_height, const size_t pad_width, const size_t stride_height,
const size_t stride_width, const size_t dilation_height, const size_t dilation_width, T *output,
cudaStream_t cuda_stream) {
size_t per_channel_size = out_height * out_width;
size_t per_batch_size = channels * per_channel_size;
size_t num_kernels = batch_size * per_batch_size;
size_t per_col_batch_size = channels * in_height * in_width * kernel_width * kernel_height;
void Col2Im(const T *input, const uint32_t batch_size, const uint32_t channels, const uint32_t out_height,
const uint32_t out_width, const uint32_t in_height, const uint32_t in_width, const uint32_t kernel_height,
const uint32_t kernel_width, const uint32_t pad_height, const uint32_t pad_width,
const uint32_t stride_height, const uint32_t stride_width, const uint32_t dilation_height,
const uint32_t dilation_width, T *output, cudaStream_t cuda_stream) {
uint32_t per_channel_size = out_height * out_width;
uint32_t per_batch_size = channels * per_channel_size;
uint32_t num_kernels = batch_size * per_batch_size;
uint32_t per_col_batch_size = channels * in_height * in_width * kernel_width * kernel_height;
Col2ImKernel<T, S><<<GET_BLOCKS(num_kernels), GET_THREADS, 0, cuda_stream>>>(
input, output, num_kernels, per_batch_size, per_channel_size, per_col_batch_size, out_height, out_width, in_height,
in_width, kernel_height, kernel_width, pad_height, pad_width, stride_height, stride_width, dilation_height,
@@ -82,33 +83,38 @@ void Col2Im(const T *input, const size_t batch_size, const size_t channels, cons
}
template CUDA_LIB_EXPORT void Col2Im<float, float>(
const float *input, const size_t batch_size, const size_t channels, const size_t out_height, const size_t out_width,
const size_t in_height, const size_t in_width, const size_t kernel_height, const size_t kernel_width,
const size_t pad_height, const size_t pad_width, const size_t stride_height, const size_t stride_width,
const size_t dilation_height, const size_t dilation_width, float *output, cudaStream_t cuda_stream);
const float *input, const uint32_t batch_size, const uint32_t channels, const uint32_t out_height,
const uint32_t out_width, const uint32_t in_height, const uint32_t in_width, const uint32_t kernel_height,
const uint32_t kernel_width, const uint32_t pad_height, const uint32_t pad_width, const uint32_t stride_height,
const uint32_t stride_width, const uint32_t dilation_height, const uint32_t dilation_width, float *output,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void Col2Im<half, float>(
const half *input, const size_t batch_size, const size_t channels, const size_t out_height, const size_t out_width,
const size_t in_height, const size_t in_width, const size_t kernel_height, const size_t kernel_width,
const size_t pad_height, const size_t pad_width, const size_t stride_height, const size_t stride_width,
const size_t dilation_height, const size_t dilation_width, half *output, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void Col2Im<half, float>(const half *input, const uint32_t batch_size, const uint32_t channels,
const uint32_t out_height, const uint32_t out_width,
const uint32_t in_height, const uint32_t in_width,
const uint32_t kernel_height, const uint32_t kernel_width,
const uint32_t pad_height, const uint32_t pad_width,
const uint32_t stride_height, const uint32_t stride_width,
const uint32_t dilation_height, const uint32_t dilation_width,
half *output, cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void Col2Im<double, double>(
const double *input, const size_t batch_size, const size_t channels, const size_t out_height, const size_t out_width,
const size_t in_height, const size_t in_width, const size_t kernel_height, const size_t kernel_width,
const size_t pad_height, const size_t pad_width, const size_t stride_height, const size_t stride_width,
const size_t dilation_height, const size_t dilation_width, double *output, cudaStream_t cuda_stream);
const double *input, const uint32_t batch_size, const uint32_t channels, const uint32_t out_height,
const uint32_t out_width, const uint32_t in_height, const uint32_t in_width, const uint32_t kernel_height,
const uint32_t kernel_width, const uint32_t pad_height, const uint32_t pad_width, const uint32_t stride_height,
const uint32_t stride_width, const uint32_t dilation_height, const uint32_t dilation_width, double *output,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void Col2Im<Complex<float>, Complex<float>>(
const Complex<float> *input, const size_t batch_size, const size_t channels, const size_t out_height,
const size_t out_width, const size_t in_height, const size_t in_width, const size_t kernel_height,
const size_t kernel_width, const size_t pad_height, const size_t pad_width, const size_t stride_height,
const size_t stride_width, const size_t dilation_height, const size_t dilation_width, Complex<float> *output,
const Complex<float> *input, const uint32_t batch_size, const uint32_t channels, const uint32_t out_height,
const uint32_t out_width, const uint32_t in_height, const uint32_t in_width, const uint32_t kernel_height,
const uint32_t kernel_width, const uint32_t pad_height, const uint32_t pad_width, const uint32_t stride_height,
const uint32_t stride_width, const uint32_t dilation_height, const uint32_t dilation_width, Complex<float> *output,
cudaStream_t cuda_stream);
template CUDA_LIB_EXPORT void Col2Im<Complex<double>, Complex<double>>(
const Complex<double> *input, const size_t batch_size, const size_t channels, const size_t out_height,
const size_t out_width, const size_t in_height, const size_t in_width, const size_t kernel_height,
const size_t kernel_width, const size_t pad_height, const size_t pad_width, const size_t stride_height,
const size_t stride_width, const size_t dilation_height, const size_t dilation_width, Complex<double> *output,
const Complex<double> *input, const uint32_t batch_size, const uint32_t channels, const uint32_t out_height,
const uint32_t out_width, const uint32_t in_height, const uint32_t in_width, const uint32_t kernel_height,
const uint32_t kernel_width, const uint32_t pad_height, const uint32_t pad_width, const uint32_t stride_height,
const uint32_t stride_width, const uint32_t dilation_height, const uint32_t dilation_width, Complex<double> *output,
cudaStream_t cuda_stream);

View File

@@ -19,10 +19,10 @@
#include "plugin/device/gpu/kernel/cuda_impl/cuda_ops/cuda_common.h"
template <typename T, typename S>
CUDA_LIB_EXPORT void Col2Im(const T *input, const size_t batch_size, const size_t channels, const size_t out_height,
const size_t out_width, const size_t in_height, const size_t in_width,
const size_t kernel_height, const size_t kernel_width, const size_t pad_height,
const size_t pad_width, const size_t stride_height, const size_t stride_width,
const size_t dilation_height, const size_t dilation_width, T *output,
cudaStream_t cuda_stream);
CUDA_LIB_EXPORT void Col2Im(const T *input, const uint32_t batch_size, const uint32_t channels,
const uint32_t out_height, const uint32_t out_width, const uint32_t in_height,
const uint32_t in_width, const uint32_t kernel_height, const uint32_t kernel_width,
const uint32_t pad_height, const uint32_t pad_width, const uint32_t stride_height,
const uint32_t stride_width, const uint32_t dilation_height, const uint32_t dilation_width,
T *output, cudaStream_t cuda_stream);
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_GPU_KERNEL_CUDA_IMPL_CUDA_OPS_COL2IM_IMPL_CUH_

View File

@@ -90,27 +90,27 @@ int Col2ImFwdGpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const st
}
auto input_shape = inputs[kIndex0]->GetShapeVector();
auto output_shape = outputs[kIndex0]->GetShapeVector();
batch_size_ = input_shape[kIndex0];
channels_ = input_shape[kIndex1];
out_height_ = output_shape[kIndex2];
out_width_ = output_shape[kIndex3];
batch_size_ = static_cast<uint32_t>(input_shape[kIndex0]);
channels_ = static_cast<uint32_t>(input_shape[kIndex1]);
out_height_ = static_cast<uint32_t>(output_shape[kIndex2]);
out_width_ = static_cast<uint32_t>(output_shape[kIndex3]);
auto kernel_size = GetValue<std::vector<int64_t>>(base_operator->GetAttr("kernel_size"));
auto dilation = GetValue<std::vector<int64_t>>(base_operator->GetAttr("dilation"));
auto padding = GetValue<std::vector<int64_t>>(base_operator->GetAttr("padding"));
auto stride = GetValue<std::vector<int64_t>>(base_operator->GetAttr("stride"));
pad_height_ = padding[kIndex0];
pad_width_ = padding[kIndex1];
kernel_height_ = kernel_size[kIndex0];
kernel_width_ = kernel_size[kIndex1];
stride_height_ = stride[kIndex0];
stride_width_ = stride[kIndex1];
dilation_height_ = dilation[kIndex0];
dilation_width_ = dilation[kIndex1];
in_height_ =
pad_height_ = static_cast<uint32_t>(padding[kIndex0]);
pad_width_ = static_cast<uint32_t>(padding[kIndex1]);
kernel_height_ = static_cast<uint32_t>(kernel_size[kIndex0]);
kernel_width_ = static_cast<uint32_t>(kernel_size[kIndex1]);
stride_height_ = static_cast<uint32_t>(stride[kIndex0]);
stride_width_ = static_cast<uint32_t>(stride[kIndex1]);
dilation_height_ = static_cast<uint32_t>(dilation[kIndex0]);
dilation_width_ = static_cast<uint32_t>(dilation[kIndex1]);
in_height_ = static_cast<uint32_t>(
(out_height_ + kPaddingDirection * pad_height_ - (dilation_height_ * (kernel_height_ - 1) + 1)) / stride_height_ +
1;
in_width_ =
(out_width_ + kPaddingDirection * pad_width_ - (dilation_width_ * (kernel_width_ - 1) + 1)) / stride_width_ + 1;
1);
in_width_ = static_cast<uint32_t>(
(out_width_ + kPaddingDirection * pad_width_ - (dilation_width_ * (kernel_width_ - 1) + 1)) / stride_width_ + 1);
return KRET_OK;
}

View File

@@ -61,20 +61,20 @@ class Col2ImFwdGpuKernelMod : public NativeGpuKernelMod {
const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &)>;
private:
size_t batch_size_{0};
size_t channels_{0};
size_t out_height_{0};
size_t out_width_{0};
size_t in_height_{0};
size_t in_width_{0};
size_t pad_width_{0};
size_t pad_height_{0};
size_t kernel_height_{0};
size_t kernel_width_{0};
size_t stride_height_{0};
size_t stride_width_{0};
size_t dilation_height_{0};
size_t dilation_width_{0};
uint32_t batch_size_{0};
uint32_t channels_{0};
uint32_t out_height_{0};
uint32_t out_width_{0};
uint32_t in_height_{0};
uint32_t in_width_{0};
uint32_t pad_width_{0};
uint32_t pad_height_{0};
uint32_t kernel_height_{0};
uint32_t kernel_width_{0};
uint32_t stride_height_{0};
uint32_t stride_width_{0};
uint32_t dilation_height_{0};
uint32_t dilation_width_{0};
bool is_null_input_{false};
void *cuda_stream_{nullptr};
Col2ImFunc kernel_func_{};

View File

@@ -3971,9 +3971,6 @@ def col2im(input_x, output_size, kernel_size, dilation, padding_value, stride):
``GPU``
Examples:
>>> import numpy as np
>>> import mindspore.ops as ops
>>> from mindspore import Tensor
>>> x = Tensor(input_data=np.random.rand(16, 16, 4, 25), dtype=mstype.float32)
>>> output_size = Tensor(input_data=[8, 8], dtype=mstype.int32)
>>> output = ops.col2im(x, output_size, [2, 2], [2, 2], [2, 2], [2, 2])