From c0a7a23fa36e05e26d6ca92020db58067bf6f61c Mon Sep 17 00:00:00 2001
From: yangchun
Date: Thu, 1 Apr 2021 21:41:49 +0800
Subject: [PATCH] Add multi-thread support to Reduce/Transpose/TensorAdd and fix a Reduce bug

---
 .../backend/kernel_compiler/cpu/cpu_kernel.cc | 115 ++++++++++++++
 .../backend/kernel_compiler/cpu/cpu_kernel.h  |  42 +++++
 .../kernel_compiler/cpu/reduce_cpu_kernel.cc  | 148 ++++++------------
 .../kernel_compiler/cpu/reduce_cpu_kernel.h   |   3 -
 .../cpu/tensoradd_cpu_kernel.cc               | 124 ++------------
 .../cpu/tensoradd_cpu_kernel.h                |   6 -
 .../cpu/transpose_cpu_kernel.cc               |  88 ++---------
 .../cpu/transpose_cpu_kernel.h                |   3 +-
 8 files changed, 234 insertions(+), 295 deletions(-)

diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc
index 38acb68929..5d5d8b9de2 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.cc
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 #include "backend/kernel_compiler/cpu/cpu_kernel.h"
+#include <algorithm>
+#include <utility>
 #include "common/thread_pool.h"
 
 namespace mindspore {
@@ -119,5 +121,118 @@ std::vector<size_t> CPUKernelUtils::FlatShapeByAxis(const std::vector<size_t> &s
   return flat_shape;
 }
 
+BroadcastIterator::BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
+                                     std::vector<size_t> output_shape)
+    : input_shape_a_(std::move(input_shape_a)),
+      input_shape_b_(std::move(input_shape_b)),
+      output_shape_(std::move(output_shape)) {
+  output_dimension_ = SizeToInt(output_shape_.size());  // Assign dimension to int for iterator
+  BroadcastShape();
+  // Allocate strides memory
+  input_strides_a_.resize(output_dimension_);
+  input_strides_b_.resize(output_dimension_);
+  input_back_strides_a_.resize(output_dimension_);
+  input_back_strides_b_.resize(output_dimension_);
+  coordinates_.resize(output_dimension_);
+  InitStrides();
+}
+
+void BroadcastIterator::SetPos(size_t pos) {
+  for (int i = output_dimension_ - 1; i >= 0 && pos != 0; --i) {
+    coordinates_[i] = pos % output_shape_[i];
+    input_pos_[0] += coordinates_[i] * input_strides_a_[i];
+    input_pos_[1] += coordinates_[i] * input_strides_b_[i];
+    pos /= output_shape_[i];
+  }
+}
+
+void BroadcastIterator::GenNextPos() {
+  // Calculate output next coordinate
+  for (int i = output_dimension_ - 1; i >= 0; --i) {
+    if (coordinates_[i] + 1 == output_shape_[i]) {
+      coordinates_[i] = 0;
+      input_pos_[0] -= input_back_strides_a_[i];
+      input_pos_[1] -= input_back_strides_b_[i];
+    } else {
+      ++coordinates_[i];
+      input_pos_[0] += input_strides_a_[i];
+      input_pos_[1] += input_strides_b_[i];
+      break;
+    }
+  }
+}
+
+void BroadcastIterator::BroadcastShape() {
+  int input_dimension_a = input_shape_a_.size();
+  if (input_dimension_a < output_dimension_) {
+    input_shape_a_.insert(input_shape_a_.begin(), output_dimension_ - input_dimension_a, 1);
+  }
+
+  int input_dimension_b = input_shape_b_.size();
+  if (input_dimension_b < output_dimension_) {
+    input_shape_b_.insert(input_shape_b_.begin(), output_dimension_ - input_dimension_b, 1);
+  }
+}
+
+void BroadcastIterator::InitStrides() {
+  input_strides_a_[output_dimension_ - 1] = 1;
+  input_strides_b_[output_dimension_ - 1] = 1;
+  for (int i = output_dimension_ - 2; i >= 0; --i) {
+    input_strides_a_[i] = input_shape_a_[i + 1] * input_strides_a_[i + 1];
+    input_strides_b_[i] = input_shape_b_[i + 1] * input_strides_b_[i + 1];
+    input_back_strides_a_[i + 1] = (input_shape_a_[i + 1] - 1) * input_strides_a_[i + 1];
+    input_back_strides_b_[i + 1] = (input_shape_b_[i + 1] - 1) * input_strides_b_[i + 1];
+  }
+
+  // Update strides for broadcast
+  // While the axis value is 1, the stride is 0
+  std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(), input_strides_a_.begin(),
+                 [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
+  std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(), input_strides_b_.begin(),
+                 [](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
+}
+
+TransposeIterator::TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes,
+                                     const std::vector<size_t> &input_shape)
+    : shape_(std::move(output_shape)), axes_(std::move(axes)) {
+  // Calculate strides
+  dimension_ = shape_.size();
+  std::vector<size_t> strides(dimension_, 1);
+  for (int i = dimension_ - 2; i >= 0; --i) {
+    strides[i] = input_shape[i + 1] * strides[i + 1];
+  }
+
+  // Swap shape and strides and calculate back strides
+  strides_.resize(dimension_);
+  back_strides_.resize(dimension_);
+  for (int i = dimension_ - 1; i >= 0; --i) {
+    strides_[i] = strides[axes_[i]];
+    back_strides_[i] = (shape_[i] - 1) * strides_[i];
+  }
+
+  // Calculate coordinate by pos
+  coordinates_.resize(dimension_);
+}
+
+void TransposeIterator::SetPos(size_t pos) {
+  for (int i = dimension_ - 1; i >= 0 && pos != 0; --i) {
+    coordinates_[i] = pos % shape_[i];
+    pos_ += coordinates_[i] * strides_[i];
+    pos /= shape_[i];
+  }
+}
+
+void TransposeIterator::GenNextPos() {
+  for (int i = dimension_ - 1; i >= 0; --i) {
+    if (coordinates_[i] + 1 == shape_[i]) {
+      coordinates_[i] = 0;
+      pos_ -= back_strides_[i];
+    } else {
+      coordinates_[i]++;
+      pos_ += strides_[i];
+      break;
+    }
+  }
+}
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h
index 3b337781d7..d1763cdb0a 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/cpu_kernel.h
@@ -145,6 +145,48 @@ class CPUKernelUtils {
   static void ParallelFor(const CTask &task, size_t count);
   static std::vector<size_t> FlatShapeByAxis(const std::vector<size_t> &shape, int axis);
 };
+
+class BroadcastIterator {
+ public:
+  BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
+                    std::vector<size_t> output_shape);
+  inline size_t GetInputPosA() const { return input_pos_[0]; }
+  inline size_t GetInputPosB() const { return input_pos_[1]; }
+  void SetPos(size_t pos);
+  void GenNextPos();
+
+ private:
+  void BroadcastShape();
+  void InitStrides();
+
+  std::vector<size_t> coordinates_;
+  std::vector<size_t> input_shape_a_;
+  std::vector<size_t> input_shape_b_;
+  std::vector<size_t> output_shape_;
+  std::vector<size_t> input_strides_a_;
+  std::vector<size_t> input_strides_b_;
+  std::vector<size_t> input_back_strides_a_;
+  std::vector<size_t> input_back_strides_b_;
+  std::array<size_t, 2> input_pos_{0};
+  int output_dimension_{0};
+};
+
+class TransposeIterator {
+ public:
+  TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes, const std::vector<size_t> &input_shape);
+  inline size_t GetPos() const { return pos_; }
+  void SetPos(size_t pos);
+  void GenNextPos();
+
+ private:
+  int dimension_{0};
+  std::vector<size_t> coordinates_;
+  std::vector<size_t> shape_;
+  std::vector<size_t> strides_;
+  std::vector<size_t> back_strides_;
+  std::vector<size_t> axes_;
+  size_t pos_{0};
+};
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc
index d24c15662c..2665c0d2ee 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.cc
@@ -18,13 +18,10 @@
 #include
 #include
 #include
-#include <unordered_set>
+#include <utility>
 
 namespace mindspore {
 namespace kernel {
-namespace {
-const size_t kMaxDim = 10;
-}  // namespace
 template <typename T>
 void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
   MS_EXCEPTION_IF_NULL(kernel_node);
@@ -37,10 +34,14 @@ void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
   } else {
     MS_LOG(EXCEPTION) << "Attribute is invalid";
   }
+
   int dimension = input_shape_.size();
   std::transform(axis_.begin(), axis_.end(), axis_.begin(),
                  [dimension](const auto &a) { return a < 0 ? dimension + a : a; });
   sort(axis_.begin(), axis_.end());
+  // Delete the duplicate axis.
+  auto last = std::unique(axis_.begin(), axis_.end());
+  axis_.erase(last, axis_.end());
   auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
   if (kernel_name == "ReduceMax") {
     reduce_type_ = 1;
@@ -55,10 +56,8 @@ void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
     reduce_type_ = 4;
     reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; };
   } else {
-      MS_LOG(EXCEPTION) << "unsupported reduce type: " << reduce_type_;
+    MS_LOG(EXCEPTION) << "unsupported reduce type: " << reduce_type_;
   }
-
-  CheckParameter();
 }
 
 template <typename T>
@@ -68,7 +67,7 @@ bool ReduceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
   size_t input_size = inputs[0]->size / sizeof(T);
   auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
   auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
-  if (axis_.empty()) {
+  if (axis_.empty() || input_shape_.empty() || input_shape_.size() == 1) {
     // Get one ret
     *output_addr = input_addr[0];
     for (size_t i = 1; i < input_size; ++i) {
@@ -78,107 +77,50 @@ bool ReduceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
       *output_addr /= input_size;
     }
   } else {
-    // transpose->calculate strides->calculate ret
-    std::vector<size_t> out_shape;
-    std::vector<size_t> strides;
-    std::vector<size_t> back_strides;
-    size_t stride;
-    CalculateTransposeInfo(&out_shape, &strides, &back_strides, &stride);
+    // Calculate transpose axes and stride
     int dimension = input_shape_.size();
-    std::vector<size_t> coordinates(dimension);
-    auto get_next_pos = [&coordinates, &out_shape, &strides, &back_strides, &dimension](size_t &curr_pos) {
-      for (int i = dimension - 1; i >= 0; --i) {
-        if (coordinates[i] + 1 == out_shape[i]) {
-          coordinates[i] = 0;
-          curr_pos -= back_strides[i];
-        } else {
-          coordinates[i]++;
-          curr_pos += strides[i];
-          break;
+    size_t stride = 1;
+    std::vector<size_t> axes(input_shape_.size());
+    size_t j = 0;
+    size_t k = 0;
+    for (int i = 0; i < dimension; ++i) {
+      if (j == axis_.size() || i != axis_[j]) {
+        axes[k] = i;
+        ++k;
+      } else {
+        stride *= input_shape_[i];
+        ++j;
+      }
+    }
+    for (auto &it : axis_) {
+      axes[k] = it;
+      ++k;
+    }
+    // Calculate transpose shape
+    std::vector<size_t> transpose_shape(input_shape_.size());
+    for (int i = 0; i < dimension; ++i) {
+      transpose_shape[i] = input_shape_[axes[i]];
+    }
+    size_t output_size = outputs[0]->size / sizeof(T);
+    TransposeIterator base_iter(std::move(transpose_shape), std::move(axes), input_shape_);
+    auto task = [this, &base_iter, input_addr, output_addr, stride](size_t start, size_t end) {
+      auto iter = base_iter;
+      iter.SetPos(start * stride);
+      for (size_t i = start; i < end; ++i) {
+        output_addr[i] = input_addr[iter.GetPos()];
+        iter.GenNextPos();
+        for (size_t j = 1; j < stride; ++j) {
+          reduce_func_(input_addr, iter.GetPos(), &output_addr[i]);
+          iter.GenNextPos();
+        }
+        if (reduce_type_ == 4) {  // 4 is reduce mean
+          output_addr[i] /= stride;
         }
       }
     };
-    size_t output_size = outputs[0]->size / sizeof(T);
-    size_t pos = 0;
-    for (size_t i = 0; i < output_size; ++i) {
-      if (i != 0) {
-        get_next_pos(pos);
-      }
-      output_addr[i] = input_addr[pos];
-      for (size_t j = 1; j < stride; ++j) {
-        get_next_pos(pos);
-        reduce_func_(input_addr, pos, &output_addr[i]);
-      }
-      if (reduce_type_ == 4) {  // 4 is reduce mean
-        output_addr[i] /= stride;
-      }
-    }
+    CPUKernelUtils::ParallelFor(task, output_size);
   }
   return true;
 }
-
-template <typename T>
-void ReduceCPUKernel<T>::CalculateTransposeInfo(std::vector<size_t> *new_shape, std::vector<size_t> *strides,
-                                                std::vector<size_t> *back_strides, size_t *stride) const {
-  int dimension = input_shape_.size();
-  std::vector<size_t> input_strides(dimension);
-  input_strides[dimension - 1] = 1;
-  for (int i = dimension - 2; i >= 0; --i) {
-    input_strides[i] = input_shape_[i + 1] * input_strides[i + 1];
-  }
-
-  // Calculate transpose axes and stride
-  std::vector<size_t> axes(dimension);
-  int j = 0;
-  int k = 0;
-  *stride = 1;
-  for (int i = 0; i < dimension; ++i) {
-    if (i != axis_[j]) {
-      axes[k] = i;
-      ++k;
-    } else {
-      *stride *= input_shape_[i];
-      ++j;
-    }
-  }
-  for (auto &it : axis_) {
-    axes[k] = it;
-    ++k;
-  }
-
-  // Calculate strides, new_shape, back strides
-  strides->resize(dimension);
-  new_shape->resize(dimension);
-  back_strides->resize(dimension);
-  for (int i = dimension - 1; i >= 0; --i) {
-    (*strides)[i] = input_strides[axes[i]];
-    (*new_shape)[i] = input_shape_[axes[i]];
-    (*back_strides)[i] = ((*new_shape)[i] - 1) * (*strides)[i];
-  }
-}
-
-template <typename T>
-void ReduceCPUKernel<T>::CheckParameter() const {
-  if (input_shape_.empty() || input_shape_.size() > kMaxDim) {
-    MS_LOG(EXCEPTION) << "Invalid input tensor of dimension: " << input_shape_.size();
-  }
-
-  if (axis_.empty()) {
-    MS_LOG(INFO) << "axis is empty";
-    return;
-  }
-
-  std::unordered_set<int64_t> checker(axis_.begin(), axis_.end());
-  if (checker.size() != axis_.size()) {
-    MS_LOG(EXCEPTION) << "Duplicate value in axis";
-  }
-
-  int maxDimension = input_shape_.size();
-  for (auto &axis : axis_) {
-    if (axis >= maxDimension) {
-      MS_LOG(EXCEPTION) << "Invalid value in axis: " << axis;
-    }
-  }
-}
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.h
index acbb3ad070..f6d1e5353d 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/reduce_cpu_kernel.h
@@ -34,9 +34,6 @@ class ReduceCPUKernel : public CPUKernel {
               const std::vector<AddressPtr> &outputs) override;
 
  private:
-  void CheckParameter() const;
-  void CalculateTransposeInfo(std::vector<size_t> *new_shape, std::vector<size_t> *strides,
-                              std::vector<size_t> *back_strides, size_t *stride) const;
   std::vector<size_t> input_shape_;
   std::vector<int64_t> axis_;
   int reduce_type_{0};
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.cc
index 33525de99c..7bc2b17a2b 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.cc
@@ -14,71 +14,11 @@
  * limitations under the License.
  */
 #include "backend/kernel_compiler/cpu/tensoradd_cpu_kernel.h"
+#include
 #include
 
 namespace mindspore {
 namespace kernel {
-namespace {
-struct Iterator {
-  std::vector<size_t> coordinates_;
-  std::vector<size_t> input_shape_a_;
-  std::vector<size_t> input_shape_b_;
-  std::vector<size_t> output_shape_;
-  std::vector<size_t> input_strides_a_;
-  std::vector<size_t> input_strides_b_;
-  int output_dimension_pos_{0};
-  size_t pos_{0};
-  Iterator(const std::vector<size_t> &input_shape_a, const std::vector<size_t> &input_shape_b,
-           const std::vector<size_t> &output_shape, const std::vector<size_t> &input_strides_a,
-           const std::vector<size_t> &input_strides_b, size_t pos)
-      : input_shape_a_(input_shape_a),
-        input_shape_b_(input_shape_b),
-        output_shape_(output_shape),
-        input_strides_a_(input_strides_a),
-        input_strides_b_(input_strides_b),
-        pos_{pos} {
-    output_dimension_pos_ = output_shape.size() - 1;
-    // Calculate coordinate with pos
-    coordinates_.resize(output_dimension_pos_ + 1);
-    int tmp = pos_;
-    for (int i = output_dimension_pos_; i >= 0 && tmp != 0; --i) {
-      coordinates_[i] = tmp % output_shape_[i];
-      tmp /= output_shape_[i];
-    }
-  }
-
-  void UpdateCoordinates() {
-    // Calculate output next coordinate
-    for (int i = output_dimension_pos_; i >= 0; --i) {
-      if (coordinates_[i] + 1 == output_shape_[i]) {
-        coordinates_[i] = 0;
-      } else {
-        ++coordinates_[i];
-        break;
-      }
-    }
-  }
-
-  void GenPoints(std::array<size_t, 2> *position) {
-    auto &idx = *position;
-    idx = {0, 0};
-    for (int k = 0; k < output_dimension_pos_; ++k) {
-      if (input_shape_a_[k] > 1) {
-        idx[0] += coordinates_[k] * input_strides_a_[k];
-      }
-      if (input_shape_b_[k] > 1) {
-        idx[1] += coordinates_[k] * input_strides_b_[k];
-      }
-    }
-    if (input_shape_a_[output_dimension_pos_] > 1) {
-      idx[0] += coordinates_[output_dimension_pos_];
-    }
-    if (input_shape_b_[output_dimension_pos_] > 1) {
-      idx[1] += coordinates_[output_dimension_pos_];
-    }
-  }
-};
-}  // namespace
 
 void TensorAddCPUKernel::InitKernel(const CNodePtr &kernel_node) {
   MS_EXCEPTION_IF_NULL(kernel_node);
@@ -96,55 +36,25 @@ bool TensorAddCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
   auto output_addr = reinterpret_cast<float *>(outputs[0]->addr);
   auto output_size = outputs[0]->size / sizeof(float);
   if (input_shape_a_ == input_shape_b_) {
-    NormalProcess(input_addr_a, input_addr_b, output_addr, output_size);
+    auto task = [output_addr, input_addr_a, input_addr_b](size_t start, size_t end) {
+      for (size_t i = start; i < end; ++i) {
+        output_addr[i] = input_addr_a[i] + input_addr_b[i];
+      }
+    };
+    CPUKernelUtils::ParallelFor(task, output_size);
   } else {  // Broadcast
-    BroadcastProcess(input_addr_a, input_addr_b, output_addr, output_size);
+    BroadcastIterator base_iter(input_shape_a_, input_shape_b_, output_shape_);
+    auto task = [&base_iter, output_addr, input_addr_a, input_addr_b](size_t start, size_t end) {
+      auto iter = base_iter;
+      iter.SetPos(start);
+      for (size_t i = start; i < end; ++i) {
+        output_addr[i] = input_addr_a[iter.GetInputPosA()] + input_addr_b[iter.GetInputPosB()];
+        iter.GenNextPos();
+      }
+    };
+    CPUKernelUtils::ParallelFor(task, output_size);
   }
   return true;
 }
-
-void TensorAddCPUKernel::NormalProcess(const float *input_a, const float *input_b, float *output, size_t size) {
-  auto task = [output, input_a, input_b](size_t start, size_t end) {
-    for (size_t i = start; i < end; ++i) {
-      output[i] = input_a[i] + input_b[i];
-    }
-  };
-  CPUKernelUtils::ParallelFor(task, size);
-}
-
-void TensorAddCPUKernel::BroadcastProcess(const float *input_a, const float *input_b, float *output, size_t size) {
-  // Broadcast shape
-  int dimension = output_shape_.size();
-  int input_dimension_a = input_shape_a_.size();
-  if (input_dimension_a < dimension) {
-    input_shape_a_.insert(input_shape_a_.begin(), dimension - input_dimension_a, 1);
-  }
-  int input_dimension_b = input_shape_b_.size();
-  if (input_dimension_b < dimension) {
-    input_shape_b_.insert(input_shape_b_.begin(), dimension - input_dimension_b, 1);
-  }
-
-  // Calculate strides
-  CalculateStrides(input_shape_a_, &input_strides_a_);
-  CalculateStrides(input_shape_b_, &input_strides_b_);
-
-  auto task = [this, input_a, input_b, output](size_t start, size_t end) {
-    Iterator iter(input_shape_a_, input_shape_b_, output_shape_, input_strides_a_, input_strides_b_, start);
-    std::array<size_t, 2> position{0};
-    for (size_t i = start; i < end; ++i) {
-      iter.GenPoints(&position);
-      output[i] = input_a[position[0]] + input_b[position[1]];
-      iter.UpdateCoordinates();
-    }
-  };
-  CPUKernelUtils::ParallelFor(task, size);
-}
-
-void TensorAddCPUKernel::CalculateStrides(const std::vector<size_t> &shape, std::vector<size_t> *strides) {
-  strides->resize(shape.size(), 1);
-  for (int i = shape.size() - 2; i >= 0; --i) {
-    (*strides)[i] = shape[i + 1] * (*strides)[i + 1];
-  }
-}
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.h
index 3d65ea6f67..cefd2d8757 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/tensoradd_cpu_kernel.h
@@ -34,15 +34,9 @@ class TensorAddCPUKernel : public CPUKernel {
              const std::vector<AddressPtr> &outputs) override;
 
  private:
-  static void NormalProcess(const float *input_a, const float *input_b, float *output, size_t size);
-  void BroadcastProcess(const float *input_a, const float *input_b, float *output, size_t size);
-  static void CalculateStrides(const std::vector<size_t> &, std::vector<size_t> *);
   std::vector<size_t> input_shape_a_;
   std::vector<size_t> input_shape_b_;
-  // Define follow var for Broadcast
   std::vector<size_t> output_shape_;
-  std::vector<size_t> input_strides_a_;
-  std::vector<size_t> input_strides_b_;
 };
 
 MS_REG_CPU_KERNEL(
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.cc
index 8428038392..f205e5abae 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.cc
@@ -17,21 +17,16 @@
 #include "backend/kernel_compiler/cpu/transpose_cpu_kernel.h"
 #include
 #include
-#include <unordered_set>
 #include "runtime/device/cpu/cpu_device_address.h"
 
 namespace mindspore {
 namespace kernel {
-namespace {
-const size_t kMaxDim = 10;
-}
-
 void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) {
   MS_EXCEPTION_IF_NULL(kernel_node);
   input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
   output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
-  axes_ = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "perm");
-  CheckParameter();
+  auto tmp = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "perm");
+  axes_ = {tmp.begin(), tmp.end()};
   dtype_ = AnfAlgo::GetPrevNodeOutputDeviceDataType(kernel_node, 0);
   if (dtype_ == kTypeUnknown) {
     dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
@@ -63,77 +58,22 @@ bool TransposeCPUFwdKernel::Launch(const std::vector<kernel::AddressPtr> &inputs
   return true;
 }
 
-void TransposeCPUFwdKernel::CheckParameter() const {
-  if (input_shape_.size() > kMaxDim) {
-    MS_LOG(EXCEPTION) << "Input tensor is " << input_shape_.size() << ", out of bound max dimension 10";
-  }
-
-  if (input_shape_.empty()) {
-    MS_LOG(EXCEPTION) << "Input tensor is empty";
-  }
-
-  if (input_shape_.size() != axes_.size()) {
-    MS_LOG(EXCEPTION) << "Input perm size is not equal with input shape";
-  }
-
-  // Input axes include the same axis
-  std::unordered_set<int64_t> unique_axes{axes_.begin(), axes_.end()};
-  if (unique_axes.size() != axes_.size()) {
-    MS_LOG(EXCEPTION) << "Input perm is illegal, it has the same axis";
-  }
-
-  // Input axes not in ture range(input_shape_.size())
-  int64_t shape_size = input_shape_.size();
-  for (auto &axis : axes_) {
-    if (axis < 0 || axis >= shape_size) {
-      MS_LOG(EXCEPTION) << "Input perm axis is out of bound input shape size";
-    }
-  }
-}
-
 template <typename T>
 void TransposeCPUFwdKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
                                          const std::vector<AddressPtr> &outputs) {
-  int dimension = input_shape_.size();
-  // Calculate input tensor strides
-  std::array<size_t, kMaxDim> input_strides{0};
-  input_strides[dimension - 1] = 1;
-  for (int i = dimension - 2; i >= 0; --i) {
-    input_strides[i] = input_shape_[i + 1] * input_strides[i + 1];
-  }
-
-  // Calculate output strides and back strides
-  std::array<size_t, kMaxDim> strides{0};
-  std::array<size_t, kMaxDim> back_strides{0};
-  for (int i = dimension - 1; i >= 0; --i) {
-    strides[i] = input_strides[axes_[i]];
-    back_strides[i] = (output_shape_[i] - 1) * strides[i];
-  }
-
-  std::array<size_t, kMaxDim> coordinates{0};
-  auto get_next_pos = [&coordinates, &strides, &back_strides, &dimension, this](int curr_pos) {
-    for (int i = dimension - 1; i >= 0; --i) {
-      if (coordinates[i] + 1 == output_shape_[i]) {
-        coordinates[i] = 0;
-        curr_pos -= back_strides[i];
-      } else {
-        coordinates[i]++;
-        curr_pos += strides[i];
-        break;
-      }
-    }
-    return curr_pos;
-  };
-
-  auto input = reinterpret_cast<T *>(inputs[0]->addr);
-  auto output = reinterpret_cast<T *>(outputs[0]->addr);
+  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
+  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
   size_t size = IntToSize(inputs[0]->size / sizeof(T));
-  output[0] = input[0];
-  int pos = 0;
-  for (size_t i = 1; i < size; ++i) {
-    pos = get_next_pos(pos);
-    output[i] = input[pos];
-  }
+  TransposeIterator base_iter(output_shape_, axes_, input_shape_);
+  auto task = [&base_iter, input_addr, output_addr](size_t start, size_t end) {
+    auto iter = base_iter;
+    iter.SetPos(start);
+    for (size_t i = start; i < end; ++i) {
+      output_addr[i] = input_addr[iter.GetPos()];
+      iter.GenNextPos();
+    }
+  };
+  CPUKernelUtils::ParallelFor(task, size);
 }
 }  // namespace kernel
 }  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.h
index 6656db53c5..cc20ef553c 100644
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/transpose_cpu_kernel.h
@@ -34,13 +34,12 @@ class TransposeCPUFwdKernel : public CPUKernel {
              const std::vector<AddressPtr> &outputs) override;
 
  private:
-  void CheckParameter() const;
   template <typename T>
   void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
   std::vector<size_t> input_shape_;
   std::vector<size_t> output_shape_;
-  std::vector<int64_t> axes_;
+  std::vector<size_t> axes_;
   TypeId dtype_{kTypeUnknown};
   using TypeKernel =
     std::function<void(TransposeCPUFwdKernel *, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &)>;