!26658 Support CPU pad 1D

Merge pull request !26658 from wanyiming/mod_pad
2021-11-25 06:20:31 +00:00 · 2021-11-25 06:20:31 +00:00 · 39c147dd37
parent 87b02a8bf4 d559b686fe
commit 39c147dd37
2 changed files with 55 additions and 61 deletions
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/pad_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/pad_cpu_kernel.cc
@ -22,41 +22,50 @@ namespace kernel {
 namespace {
 constexpr size_t kPadInputsNum = 1;
 constexpr size_t kPadOutputsNum = 1;
+constexpr size_t kPadElemSize = 2;  // required length of each entry of the "paddings" attribute (two amounts per dim)
 }  // namespace

 void PadCPUKernel::InitKernel(const CNodePtr &kernel_node) {
+  // Caches what LaunchKernel reads: the input shape and rank, the per-dimension padding pairs flattened into one
+  // vector, the input/output element counts, and the row-major strides of the padded output.
+  // Raises through MS_LOG(EXCEPTION) when the "paddings" attribute does not have one entry per input dimension,
+  // when an entry does not hold exactly two values, when a value is negative or does not fit in int32_t, when the
+  // input rank is 0, or when the input and output ranks differ.
  MS_EXCEPTION_IF_NULL(kernel_node);
  kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
-  paddings_ = AnfAlgo::GetNodeAttr<std::vector<std::vector<int64_t>>>(kernel_node, "paddings");
+  // Deliberately not named `paddings_`: that would declare a local that shadows the class member of the same name.
+  const std::vector<std::vector<int64_t>> paddings =
+    AnfAlgo::GetNodeAttr<std::vector<std::vector<int64_t>>>(kernel_node, "paddings");
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
-  std::vector<size_t> input_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+  input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
+  std::vector<size_t> output_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);

-  shape_size_ = input_shape.size();
-  if (shape_size_ == 4) {  // shape adjustment from 2d/3d to 4d
-  } else if (shape_size_ == 3) {
-    (void)input_shape.insert(input_shape.begin(), 1);  // batch padding
-    shape_size_ = 4;
-  } else if (shape_size_ == 2) {
-    (void)input_shape.insert(input_shape.begin(), 2, 1);  // channel padding
-    shape_size_ = 4;
+  input_rank_ = input_shape_.size();
+  if (paddings.size() != input_rank_) {
+    MS_LOG(EXCEPTION) << "PadCpuFwdKernel: paddings' size must be equal to the rank of the input.";
  }

-  for (size_t i = 0; i < shape_size_; ++i) {
-    tensor_size_ *= input_shape[i];
-    input_shape_.push_back(input_shape[i]);
+  // Start from a clean state so that a repeated InitKernel call neither appends to nor multiplies into old values.
+  flattened_paddings_.clear();
+  input_size_ = 1;
+  output_size_ = 1;
+  // A padding amount is usable only if it is non-negative and survives the narrowing to int32_t unchanged;
+  // a negative amount would wrap around once it is added to the size_t indices used by LaunchKernel.
+  const auto is_valid_pad = [](int64_t pad) {
+    return pad >= 0 && static_cast<int64_t>(static_cast<int32_t>(pad)) == pad;
+  };
+  for (size_t i = 0; i < paddings.size(); i++) {
+    if (paddings[i].size() != kPadElemSize) {
+      MS_LOG(EXCEPTION) << "PadCpuFwdKernel: each element in paddings must have size 2.";
+    }
+    if (!is_valid_pad(paddings[i][0]) || !is_valid_pad(paddings[i][1])) {
+      MS_LOG(EXCEPTION) << "PadCpuFwdKernel: each value in paddings must be non-negative and fit in int32, "
+                        << "but got: " << paddings[i][0] << " and " << paddings[i][1];
+    }
+    flattened_paddings_.push_back(static_cast<int32_t>(paddings[i][0]));
+    flattened_paddings_.push_back(static_cast<int32_t>(paddings[i][1]));
  }

-  if (paddings_.size() == 4) {  // shape adjustment from 2d/3d to 4d
-  } else if (paddings_.size() == 3) {
-    (void)paddings_.insert(paddings_.begin(), 1, {0, 0});  // batch padding
-  } else if (paddings_.size() == 2) {
-    (void)paddings_.insert(paddings_.begin(), 2, {0, 0});  // channel padding
+  // Element counts: the output extent of a dimension is the input extent plus both of its padding amounts.
+  for (size_t i = 0; i < input_rank_; i++) {
+    input_size_ *= input_shape_[i];
+    output_size_ *=
+      (input_shape_[i] + flattened_paddings_[kPadElemSize * i] + flattened_paddings_[(kPadElemSize * i) + 1]);
  }

-  for (size_t i = 0; i < shape_size_; i++) {
-    size_t temp = input_shape[i] + LongToSize((paddings_[i][0] + paddings_[i][1]));  // compute new dim size
-    output_size_ *= temp;
-    output_shape_.push_back(temp);  // correct new dimension size
+  if (input_rank_ < 1) {
+    MS_LOG(EXCEPTION) << "For 'PadCpuKernel', the rank of input should be greater than or equal to 1, "
+                      << "but got the rank of input: " << input_rank_;
+  }
+  if (output_shape.size() != input_rank_) {
+    MS_LOG(EXCEPTION) << "For 'PadCpuKernel', the rank of input should be equal to the rank of output, "
+                      << "but got the rank of input: " << input_rank_
+                      << ", the rank of output: " << output_shape.size();
+  }
+  // Row-major strides of the padded output, filled from the innermost dimension outwards. The counter stays
+  // unsigned and stops at 1, so no size_t -> int32_t conversion is needed to terminate the loop.
+  strides_.resize(input_rank_);
+  strides_[input_rank_ - 1] = 1;
+  for (size_t i = input_rank_ - 1; i > 0; i--) {
+    strides_[i - 1] = output_shape[i] * strides_[i];
  }
 }

@ -79,43 +88,27 @@ bool PadCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const s
 }

 template <typename T>
-void PadCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const {
+bool PadCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const {
  const auto *inputs_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto *outputs_addr = reinterpret_cast<T *>(outputs[0]->addr);
-
-  const int pad_left = paddings_[3][0];
-  const int pad_top = paddings_[2][0];
-  const int pad_channel_before = paddings_[1][0];
-  const int pad_channel_after = paddings_[1][1];
-  const T pad_value = T(0);
-
-  const int channels_orig = input_shape_[1];
-  const int old_height = input_shape_[2];
-  const int old_width = input_shape_[3];
-  const int padded_height = output_shape_[2];
-  const int padded_width = output_shape_[3];
-
-  for (size_t pos = 0; pos < output_size_; ++pos) {
-    int block_num = (SizeToInt(pos) / padded_width) / padded_height;
-    const int padded_w = SizeToInt(pos) % padded_width;                    // x coordinate referred to by cur 'pos'
-    const int padded_h = (SizeToInt(pos) / padded_width) % padded_height;  // y coordinate referred to by cur 'pos'
-
-    int channels_new = channels_orig + pad_channel_after + pad_channel_before;  // new number of channels from padding
-    int channel_num = block_num % channels_new;                                 // current channel
-    int batch_item = block_num / channels_new;                                  // current item in batch
-
-    if (padded_h - pad_top < 0 || padded_w - pad_left < 0 || padded_h - pad_top >= old_height ||
-        padded_w - pad_left >= old_width || channel_num <= pad_channel_before - 1 ||
-        channel_num > channels_orig + pad_channel_before - 1) {
-      outputs_addr[pos] = pad_value;
-    } else {
-      // on a block/x,y position that isn't padding, copy data from the correct block/x,y pos the input
-      // calculate from number of blocks of padding (due to channel padding) inserted prior
-      int equiv_block_num = block_num - (batch_item * (pad_channel_before + pad_channel_after)) - pad_channel_before;
-      outputs_addr[pos] =
-        inputs_addr[(equiv_block_num * old_height + padded_h - pad_top) * old_width + padded_w - pad_left];
+  if (memset_s(outputs_addr, outputs[0]->size, 0, outputs[0]->size) != EOK) {  // zero the whole output buffer first
+    MS_LOG(EXCEPTION) << "Output buffer memset failed.";  // NOTE(review): presumably throws, so the return is dead
+    return false;
  }
+
+  for (size_t gt_id = 0; gt_id < input_size_; ++gt_id) {  // visit every input element by its flat index
+    size_t linear_index = gt_id;  // remaining part of the flat input index still to be decomposed
+    size_t padded_linear_index = 0;  // flat index of the same element inside the padded output
+    for (size_t i = input_rank_; i >= 1; i--) {  // peel off coordinates from the last dimension to the first
+      size_t unravel_dimension = input_shape_[i - 1];
+      size_t unraveled_index = linear_index % unravel_dimension;  // coordinate along dimension i - 1
+      padded_linear_index += ((unraveled_index + flattened_paddings_[kPadElemSize * (i - 1)]) * strides_[i - 1]);  // shift by the first padding amount of this dimension, scale by its stride
+      linear_index -= unraveled_index;  // drop the remainder so the division below is exact
+      linear_index /= unravel_dimension;
    }
+    outputs_addr[padded_linear_index] = inputs_addr[gt_id];  // positions never written here keep the 0 from memset_s
+  }
+  return true;
 }
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/pad_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/pad_cpu_kernel.h
@ -37,15 +37,16 @@ class PadCPUKernel : public CPUKernel {

 private:
  template <typename T>
-  void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const;
+  bool LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const;

  TypeId dtype_{kTypeUnknown};
-  uint64_t tensor_size_{1};
-  size_t shape_size_{1};
-  uint64_t output_size_{1};
  std::vector<std::vector<int64_t>> paddings_;
+  size_t input_rank_{0};                     // number of input dimensions; 0 until InitKernel has run
+  std::vector<int32_t> flattened_paddings_;  // two padding amounts per dimension, stored back to back
  std::vector<size_t> input_shape_;
-  std::vector<size_t> output_shape_;
+  std::vector<size_t> strides_;              // per-dimension strides of the padded output
+  size_t input_size_{1};                     // product of the input dimensions
+  size_t output_size_{1};                    // product of the padded output dimensions
 };

 MS_REG_CPU_KERNEL(Pad, KernelAttr(), PadCPUKernel);