forked from OSSInnovation/mindspore
!14380 Reduce/Transpose/TensorAdd add multi thread Support and Fix reduce bug
From: @yang_chun Reviewed-by: @c_34,@wuxuejian Signed-off-by: @wuxuejian
This commit is contained in:
commit
a691eeb645
|
@ -14,6 +14,8 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||||
|
#include <algorithm>
|
||||||
|
#include <utility>
|
||||||
#include "common/thread_pool.h"
|
#include "common/thread_pool.h"
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
|
@ -119,5 +121,118 @@ std::vector<size_t> CPUKernelUtils::FlatShapeByAxis(const std::vector<size_t> &s
|
||||||
return flat_shape;
|
return flat_shape;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Builds an iterator over the flat output of a two-input broadcast operation.
//   input_shape_a / input_shape_b: shapes of the two operands (taken by value and moved in).
//   output_shape:                  shape of the broadcast result.
// After construction the iterator sits on output element 0, with both input offsets at 0.
BroadcastIterator::BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                                     std::vector<size_t> output_shape)
    : input_shape_a_(std::move(input_shape_a)),
      input_shape_b_(std::move(input_shape_b)),
      output_shape_(std::move(output_shape)) {
  // The rank is kept as int because the traversal loops count down past zero.
  const size_t output_rank = output_shape_.size();
  output_dimension_ = SizeToInt(output_rank);

  // Left-pad both input shapes with 1s so every shape has the output rank.
  BroadcastShape();

  // Size every per-axis table to the output rank before the strides are filled in.
  for (auto *per_axis_table : {&input_strides_a_, &input_strides_b_, &input_back_strides_a_, &input_back_strides_b_,
                               &coordinates_}) {
    per_axis_table->resize(output_dimension_);
  }

  InitStrides();
}
|
||||||
|
|
||||||
|
void BroadcastIterator::SetPos(size_t pos) {
|
||||||
|
for (int i = output_dimension_ - 1; i >= 0 && pos != 0; --i) {
|
||||||
|
coordinates_[i] = pos % output_shape_[i];
|
||||||
|
input_pos_[0] += coordinates_[i] * input_strides_a_[i];
|
||||||
|
input_pos_[1] += coordinates_[i] * input_strides_b_[i];
|
||||||
|
pos /= output_shape_[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void BroadcastIterator::GenNextPos() {
|
||||||
|
// Calculate output next coordinate
|
||||||
|
for (int i = output_dimension_ - 1; i >= 0; --i) {
|
||||||
|
if (coordinates_[i] + 1 == output_shape_[i]) {
|
||||||
|
coordinates_[i] = 0;
|
||||||
|
input_pos_[0] -= input_back_strides_a_[i];
|
||||||
|
input_pos_[1] -= input_back_strides_b_[i];
|
||||||
|
} else {
|
||||||
|
++coordinates_[i];
|
||||||
|
input_pos_[0] += input_strides_a_[i];
|
||||||
|
input_pos_[1] += input_strides_b_[i];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void BroadcastIterator::BroadcastShape() {
|
||||||
|
int input_dimension_a = input_shape_a_.size();
|
||||||
|
if (input_dimension_a < output_dimension_) {
|
||||||
|
input_shape_a_.insert(input_shape_a_.begin(), output_dimension_ - input_dimension_a, 1);
|
||||||
|
}
|
||||||
|
|
||||||
|
int input_dimension_b = input_shape_b_.size();
|
||||||
|
if (input_dimension_b < output_dimension_) {
|
||||||
|
input_shape_b_.insert(input_shape_b_.begin(), output_dimension_ - input_dimension_b, 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void BroadcastIterator::InitStrides() {
|
||||||
|
input_strides_a_[output_dimension_ - 1] = 1;
|
||||||
|
input_strides_b_[output_dimension_ - 1] = 1;
|
||||||
|
for (int i = output_dimension_ - 2; i >= 0; --i) {
|
||||||
|
input_strides_a_[i] = input_shape_a_[i + 1] * input_strides_a_[i + 1];
|
||||||
|
input_strides_b_[i] = input_shape_b_[i + 1] * input_strides_b_[i + 1];
|
||||||
|
input_back_strides_a_[i + 1] = (input_shape_a_[i + 1] - 1) * input_strides_a_[i + 1];
|
||||||
|
input_back_strides_b_[i + 1] = (input_shape_b_[i + 1] - 1) * input_strides_b_[i + 1];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update strides for broadcast
|
||||||
|
// While the axis value is 1, the stride is 0
|
||||||
|
std::transform(input_strides_a_.begin(), input_strides_a_.end(), input_shape_a_.begin(), input_strides_a_.begin(),
|
||||||
|
[](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
|
||||||
|
std::transform(input_strides_b_.begin(), input_strides_b_.end(), input_shape_b_.begin(), input_strides_b_.begin(),
|
||||||
|
[](const auto &a, const auto &b) { return b == 1 ? 0 : a; });
|
||||||
|
}
|
||||||
|
|
||||||
|
// Builds an iterator that walks a transposed output in row-major order and reports, for each
// output element, the flat offset of the element to read from the input tensor.
//   output_shape: shape of the transposed tensor (taken by value and moved in).
//   axes:         for output axis i, axes[i] is the input axis it is read from.
//   input_shape:  shape of the tensor being read.
// NOTE(review): nothing is validated here; this presumably relies on callers passing a valid
// permutation in `axes` and an `input_shape` with at least as many axes as `output_shape` - confirm.
TransposeIterator::TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes,
                                     const std::vector<size_t> &input_shape)
    : shape_(std::move(output_shape)), axes_(std::move(axes)) {
  const size_t rank = shape_.size();
  dimension_ = static_cast<int>(rank);

  // Row-major strides of the input tensor. These are held in size_t, the same type as the
  // members they are copied into; a 32-bit temporary would silently truncate the strides of
  // inputs holding more than 2^32 elements.
  std::vector<size_t> input_strides(rank, 1);
  for (int i = dimension_ - 2; i >= 0; --i) {
    input_strides[i] = input_shape[i + 1] * input_strides[i + 1];
  }

  // Permute the input strides into output-axis order and derive the wrap-around amounts.
  strides_.resize(rank);
  back_strides_.resize(rank);
  for (int i = 0; i < dimension_; ++i) {
    strides_[i] = input_strides[axes_[i]];
    back_strides_[i] = (shape_[i] - 1) * strides_[i];
  }

  // The iterator starts at the origin: every coordinate is 0 and pos_ keeps its initial 0.
  coordinates_.resize(rank);
}
|
||||||
|
|
||||||
|
void TransposeIterator::SetPos(size_t pos) {
|
||||||
|
for (int i = dimension_ - 1; i >= 0 && pos != 0; --i) {
|
||||||
|
coordinates_[i] = pos % shape_[i];
|
||||||
|
pos_ += coordinates_[i] * strides_[i];
|
||||||
|
pos /= shape_[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void TransposeIterator::GenNextPos() {
|
||||||
|
for (int i = dimension_ - 1; i >= 0; --i) {
|
||||||
|
if (coordinates_[i] + 1 == shape_[i]) {
|
||||||
|
coordinates_[i] = 0;
|
||||||
|
pos_ -= back_strides_[i];
|
||||||
|
} else {
|
||||||
|
coordinates_[i]++;
|
||||||
|
pos_ += strides_[i];
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
|
@ -145,6 +145,48 @@ class CPUKernelUtils {
|
||||||
static void ParallelFor(const CTask &task, size_t count);
|
static void ParallelFor(const CTask &task, size_t count);
|
||||||
static std::vector<size_t> FlatShapeByAxis(const std::vector<size_t> &shape, int axis);
|
static std::vector<size_t> FlatShapeByAxis(const std::vector<size_t> &shape, int axis);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Maps flat output indices of a two-operand broadcast to flat offsets in each operand.
//
// Typical use, as in the element-wise kernels of this file: construct one iterator, copy it
// per work range, call SetPos(start) on the copy, then alternate GetInputPosA/B() and
// GenNextPos() for every output element of the range.
class BroadcastIterator {
 public:
  // Shapes are taken by value and moved into the iterator.
  BroadcastIterator(std::vector<size_t> input_shape_a, std::vector<size_t> input_shape_b,
                    std::vector<size_t> output_shape);

  // Flat offset into the first operand for the current output element.
  size_t GetInputPosA() const { return input_pos_[0]; }
  // Flat offset into the second operand for the current output element.
  size_t GetInputPosB() const { return input_pos_[1]; }

  // Jumps to the given flat (row-major) output index.
  void SetPos(size_t pos);
  // Steps to the next output element.
  void GenNextPos();

 private:
  // Left-pads the input shapes with 1s up to the output rank.
  void BroadcastShape();
  // Computes the stride and back-stride tables.
  void InitStrides();

  std::vector<size_t> coordinates_;            // current output coordinate, one entry per axis
  std::vector<size_t> input_shape_a_;          // first operand shape, padded to the output rank
  std::vector<size_t> input_shape_b_;          // second operand shape, padded to the output rank
  std::vector<size_t> output_shape_;           // shape of the broadcast result
  std::vector<size_t> input_strides_a_;        // per-axis step in operand A (0 on axes of extent 1)
  std::vector<size_t> input_strides_b_;        // per-axis step in operand B (0 on axes of extent 1)
  std::vector<size_t> input_back_strides_a_;   // amount removed from A's offset when an axis wraps
  std::vector<size_t> input_back_strides_b_;   // amount removed from B's offset when an axis wraps
  std::array<size_t, 2> input_pos_{0};         // current offsets: [0] operand A, [1] operand B
  int output_dimension_{0};                    // output rank, kept as int for count-down loops
};
|
||||||
|
|
||||||
|
// Maps flat output indices of a transpose to flat offsets in the input tensor.
//
// Typical use, as in the kernels of this file: construct one iterator, copy it per work
// range, call SetPos(start) on the copy, then alternate GetPos() and GenNextPos() for every
// output element of the range.
class TransposeIterator {
 public:
  // output_shape and axes are taken by value and moved in; input_shape is only read.
  TransposeIterator(std::vector<size_t> output_shape, std::vector<size_t> axes, const std::vector<size_t> &input_shape);

  // Flat offset into the input tensor for the current output element.
  size_t GetPos() const { return pos_; }

  // Jumps to the given flat (row-major) output index.
  void SetPos(size_t pos);
  // Steps to the next output element.
  void GenNextPos();

 private:
  int dimension_{0};                   // output rank, kept as int for count-down loops
  std::vector<size_t> coordinates_;    // current output coordinate, one entry per axis
  std::vector<size_t> shape_;          // output (transposed) shape
  std::vector<size_t> strides_;        // input stride to apply per output axis
  std::vector<size_t> back_strides_;   // amount removed from pos_ when an axis wraps
  std::vector<size_t> axes_;           // axes_[i] is the input axis read by output axis i
  size_t pos_{0};                      // current flat offset into the input
};
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
||||||
|
|
|
@ -18,13 +18,10 @@
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <unordered_set>
|
#include <utility>
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace kernel {
|
namespace kernel {
|
||||||
namespace {
|
|
||||||
const size_t kMaxDim = 10;
|
|
||||||
} // namespace
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
||||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||||
|
@ -37,10 +34,14 @@ void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
||||||
} else {
|
} else {
|
||||||
MS_LOG(EXCEPTION) << "Attribute is invalid";
|
MS_LOG(EXCEPTION) << "Attribute is invalid";
|
||||||
}
|
}
|
||||||
|
|
||||||
int dimension = input_shape_.size();
|
int dimension = input_shape_.size();
|
||||||
std::transform(axis_.begin(), axis_.end(), axis_.begin(),
|
std::transform(axis_.begin(), axis_.end(), axis_.begin(),
|
||||||
[dimension](const auto &a) { return a < 0 ? dimension + a : a; });
|
[dimension](const auto &a) { return a < 0 ? dimension + a : a; });
|
||||||
sort(axis_.begin(), axis_.end());
|
sort(axis_.begin(), axis_.end());
|
||||||
|
// Delete the duplicate axis.
|
||||||
|
auto last = std::unique(axis_.begin(), axis_.end());
|
||||||
|
axis_.erase(last, axis_.end());
|
||||||
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
auto kernel_name = AnfAlgo::GetCNodeName(kernel_node);
|
||||||
if (kernel_name == "ReduceMax") {
|
if (kernel_name == "ReduceMax") {
|
||||||
reduce_type_ = 1;
|
reduce_type_ = 1;
|
||||||
|
@ -55,10 +56,8 @@ void ReduceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
||||||
reduce_type_ = 4;
|
reduce_type_ = 4;
|
||||||
reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; };
|
reduce_func_ = [](const T *input, size_t pos, T *out) { *out += input[pos]; };
|
||||||
} else {
|
} else {
|
||||||
MS_LOG(EXCEPTION) << "unsupported reduce type: " << reduce_type_;
|
MS_LOG(EXCEPTION) << "unsupported reduce type: " << reduce_type_;
|
||||||
}
|
}
|
||||||
|
|
||||||
CheckParameter();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -68,7 +67,7 @@ bool ReduceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
||||||
size_t input_size = inputs[0]->size / sizeof(T);
|
size_t input_size = inputs[0]->size / sizeof(T);
|
||||||
auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
|
auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
|
||||||
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
||||||
if (axis_.empty()) {
|
if (axis_.empty() || input_shape_.empty() || input_shape_.size() == 1) {
|
||||||
// Get one ret
|
// Get one ret
|
||||||
*output_addr = input_addr[0];
|
*output_addr = input_addr[0];
|
||||||
for (size_t i = 1; i < input_size; ++i) {
|
for (size_t i = 1; i < input_size; ++i) {
|
||||||
|
@ -78,107 +77,50 @@ bool ReduceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
||||||
*output_addr /= input_size;
|
*output_addr /= input_size;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// transpose->calculate strides->calculate ret
|
// Calculate transpose axes and stride
|
||||||
std::vector<size_t> out_shape;
|
|
||||||
std::vector<size_t> strides;
|
|
||||||
std::vector<size_t> back_strides;
|
|
||||||
size_t stride;
|
|
||||||
CalculateTransposeInfo(&out_shape, &strides, &back_strides, &stride);
|
|
||||||
int dimension = input_shape_.size();
|
int dimension = input_shape_.size();
|
||||||
std::vector<size_t> coordinates(dimension);
|
size_t stride = 1;
|
||||||
auto get_next_pos = [&coordinates, &out_shape, &strides, &back_strides, &dimension](size_t &curr_pos) {
|
std::vector<size_t> axes(input_shape_.size());
|
||||||
for (int i = dimension - 1; i >= 0; --i) {
|
size_t j = 0;
|
||||||
if (coordinates[i] + 1 == out_shape[i]) {
|
size_t k = 0;
|
||||||
coordinates[i] = 0;
|
for (int i = 0; i < dimension; ++i) {
|
||||||
curr_pos -= back_strides[i];
|
if (j == axis_.size() || i != axis_[j]) {
|
||||||
} else {
|
axes[k] = i;
|
||||||
coordinates[i]++;
|
++k;
|
||||||
curr_pos += strides[i];
|
} else {
|
||||||
break;
|
stride *= input_shape_[i];
|
||||||
|
++j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (auto &it : axis_) {
|
||||||
|
axes[k] = it;
|
||||||
|
++k;
|
||||||
|
}
|
||||||
|
// Calculate transpose shape
|
||||||
|
std::vector<size_t> transpose_shape(input_shape_.size());
|
||||||
|
for (int i = 0; i < dimension; ++i) {
|
||||||
|
transpose_shape[i] = input_shape_[axes[i]];
|
||||||
|
}
|
||||||
|
size_t output_size = outputs[0]->size / sizeof(T);
|
||||||
|
TransposeIterator base_iter(std::move(transpose_shape), std::move(axes), input_shape_);
|
||||||
|
auto task = [this, &base_iter, input_addr, output_addr, stride](size_t start, size_t end) {
|
||||||
|
auto iter = base_iter;
|
||||||
|
iter.SetPos(start * stride);
|
||||||
|
for (size_t i = start; i < end; ++i) {
|
||||||
|
output_addr[i] = input_addr[iter.GetPos()];
|
||||||
|
iter.GenNextPos();
|
||||||
|
for (size_t j = 1; j < stride; ++j) {
|
||||||
|
reduce_func_(input_addr, iter.GetPos(), &output_addr[i]);
|
||||||
|
iter.GenNextPos();
|
||||||
|
}
|
||||||
|
if (reduce_type_ == 4) { // 4 is reduce mean
|
||||||
|
output_addr[i] /= stride;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
size_t output_size = outputs[0]->size / sizeof(T);
|
CPUKernelUtils::ParallelFor(task, output_size);
|
||||||
size_t pos = 0;
|
|
||||||
for (size_t i = 0; i < output_size; ++i) {
|
|
||||||
if (i != 0) {
|
|
||||||
get_next_pos(pos);
|
|
||||||
}
|
|
||||||
output_addr[i] = input_addr[pos];
|
|
||||||
for (size_t j = 1; j < stride; ++j) {
|
|
||||||
get_next_pos(pos);
|
|
||||||
reduce_func_(input_addr, pos, &output_addr[i]);
|
|
||||||
}
|
|
||||||
if (reduce_type_ == 4) { // 4 is reduce mean
|
|
||||||
output_addr[i] /= stride;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
void ReduceCPUKernel<T>::CalculateTransposeInfo(std::vector<size_t> *new_shape, std::vector<size_t> *strides,
|
|
||||||
std::vector<size_t> *back_strides, size_t *stride) const {
|
|
||||||
int dimension = input_shape_.size();
|
|
||||||
std::vector<size_t> input_strides(dimension);
|
|
||||||
input_strides[dimension - 1] = 1;
|
|
||||||
for (int i = dimension - 2; i >= 0; --i) {
|
|
||||||
input_strides[i] = input_shape_[i + 1] * input_strides[i + 1];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Calculate transpose axes and stride
|
|
||||||
std::vector<size_t> axes(dimension);
|
|
||||||
int j = 0;
|
|
||||||
int k = 0;
|
|
||||||
*stride = 1;
|
|
||||||
for (int i = 0; i < dimension; ++i) {
|
|
||||||
if (i != axis_[j]) {
|
|
||||||
axes[k] = i;
|
|
||||||
++k;
|
|
||||||
} else {
|
|
||||||
*stride *= input_shape_[i];
|
|
||||||
++j;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
for (auto &it : axis_) {
|
|
||||||
axes[k] = it;
|
|
||||||
++k;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Calculate strides, new_shape, back strides
|
|
||||||
strides->resize(dimension);
|
|
||||||
new_shape->resize(dimension);
|
|
||||||
back_strides->resize(dimension);
|
|
||||||
for (int i = dimension - 1; i >= 0; --i) {
|
|
||||||
(*strides)[i] = input_strides[axes[i]];
|
|
||||||
(*new_shape)[i] = input_shape_[axes[i]];
|
|
||||||
(*back_strides)[i] = ((*new_shape)[i] - 1) * (*strides)[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
|
||||||
void ReduceCPUKernel<T>::CheckParameter() const {
|
|
||||||
if (input_shape_.empty() || input_shape_.size() > kMaxDim) {
|
|
||||||
MS_LOG(EXCEPTION) << "Invalid input tensor of dimension: " << input_shape_.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (axis_.empty()) {
|
|
||||||
MS_LOG(INFO) << "axis is empty";
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
std::unordered_set<int> checker(axis_.begin(), axis_.end());
|
|
||||||
if (checker.size() != axis_.size()) {
|
|
||||||
MS_LOG(EXCEPTION) << "Duplicate value in axis";
|
|
||||||
}
|
|
||||||
|
|
||||||
int maxDimension = input_shape_.size();
|
|
||||||
for (auto &axis : axis_) {
|
|
||||||
if (axis >= maxDimension) {
|
|
||||||
MS_LOG(EXCEPTION) << "Invalid value in axis: " << axis;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
|
@ -34,9 +34,6 @@ class ReduceCPUKernel : public CPUKernel {
|
||||||
const std::vector<AddressPtr> &outputs) override;
|
const std::vector<AddressPtr> &outputs) override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void CheckParameter() const;
|
|
||||||
void CalculateTransposeInfo(std::vector<size_t> *new_shape, std::vector<size_t> *strides,
|
|
||||||
std::vector<size_t> *back_strides, size_t *stride) const;
|
|
||||||
std::vector<size_t> input_shape_;
|
std::vector<size_t> input_shape_;
|
||||||
std::vector<int64_t> axis_;
|
std::vector<int64_t> axis_;
|
||||||
int reduce_type_{0};
|
int reduce_type_{0};
|
||||||
|
|
|
@ -14,71 +14,11 @@
|
||||||
* limitations under the License.
|
* limitations under the License.
|
||||||
*/
|
*/
|
||||||
#include "backend/kernel_compiler/cpu/tensoradd_cpu_kernel.h"
|
#include "backend/kernel_compiler/cpu/tensoradd_cpu_kernel.h"
|
||||||
|
#include <functional>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace kernel {
|
namespace kernel {
|
||||||
namespace {
|
|
||||||
struct Iterator {
|
|
||||||
std::vector<size_t> coordinates_;
|
|
||||||
std::vector<size_t> input_shape_a_;
|
|
||||||
std::vector<size_t> input_shape_b_;
|
|
||||||
std::vector<size_t> output_shape_;
|
|
||||||
std::vector<size_t> input_strides_a_;
|
|
||||||
std::vector<size_t> input_strides_b_;
|
|
||||||
int output_dimension_pos_{0};
|
|
||||||
size_t pos_{0};
|
|
||||||
Iterator(const std::vector<size_t> &input_shape_a, const std::vector<size_t> &input_shape_b,
|
|
||||||
const std::vector<size_t> &output_shape, const std::vector<size_t> &input_strides_a,
|
|
||||||
const std::vector<size_t> &input_strides_b, size_t pos)
|
|
||||||
: input_shape_a_(input_shape_a),
|
|
||||||
input_shape_b_(input_shape_b),
|
|
||||||
output_shape_(output_shape),
|
|
||||||
input_strides_a_(input_strides_a),
|
|
||||||
input_strides_b_(input_strides_b),
|
|
||||||
pos_{pos} {
|
|
||||||
output_dimension_pos_ = output_shape.size() - 1;
|
|
||||||
// Calculate coordinate with pos
|
|
||||||
coordinates_.resize(output_dimension_pos_ + 1);
|
|
||||||
int tmp = pos_;
|
|
||||||
for (int i = output_dimension_pos_; i >= 0 && tmp != 0; --i) {
|
|
||||||
coordinates_[i] = tmp % output_shape_[i];
|
|
||||||
tmp /= output_shape_[i];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void UpdateCoordinates() {
|
|
||||||
// Calculate output next coordinate
|
|
||||||
for (int i = output_dimension_pos_; i >= 0; --i) {
|
|
||||||
if (coordinates_[i] + 1 == output_shape_[i]) {
|
|
||||||
coordinates_[i] = 0;
|
|
||||||
} else {
|
|
||||||
++coordinates_[i];
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void GenPoints(std::array<size_t, 2> *position) {
|
|
||||||
auto &idx = *position;
|
|
||||||
idx = {0, 0};
|
|
||||||
for (int k = 0; k < output_dimension_pos_; ++k) {
|
|
||||||
if (input_shape_a_[k] > 1) {
|
|
||||||
idx[0] += coordinates_[k] * input_strides_a_[k];
|
|
||||||
}
|
|
||||||
if (input_shape_b_[k] > 1) {
|
|
||||||
idx[1] += coordinates_[k] * input_strides_b_[k];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (input_shape_a_[output_dimension_pos_] > 1) {
|
|
||||||
idx[0] += coordinates_[output_dimension_pos_];
|
|
||||||
}
|
|
||||||
if (input_shape_b_[output_dimension_pos_] > 1) {
|
|
||||||
idx[1] += coordinates_[output_dimension_pos_];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
};
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
void TensorAddCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
void TensorAddCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||||
|
@ -96,55 +36,25 @@ bool TensorAddCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
||||||
auto output_addr = reinterpret_cast<float *>(outputs[0]->addr);
|
auto output_addr = reinterpret_cast<float *>(outputs[0]->addr);
|
||||||
auto output_size = outputs[0]->size / sizeof(float);
|
auto output_size = outputs[0]->size / sizeof(float);
|
||||||
if (input_shape_a_ == input_shape_b_) {
|
if (input_shape_a_ == input_shape_b_) {
|
||||||
NormalProcess(input_addr_a, input_addr_b, output_addr, output_size);
|
auto task = [output_addr, input_addr_a, input_addr_b](size_t start, size_t end) {
|
||||||
|
for (size_t i = start; i < end; ++i) {
|
||||||
|
output_addr[i] = input_addr_a[i] + input_addr_b[i];
|
||||||
|
}
|
||||||
|
};
|
||||||
|
CPUKernelUtils::ParallelFor(task, output_size);
|
||||||
} else { // Broadcast
|
} else { // Broadcast
|
||||||
BroadcastProcess(input_addr_a, input_addr_b, output_addr, output_size);
|
BroadcastIterator base_iter(input_shape_a_, input_shape_b_, output_shape_);
|
||||||
|
auto task = [&base_iter, output_addr, input_addr_a, input_addr_b](size_t start, size_t end) {
|
||||||
|
auto iter = base_iter;
|
||||||
|
iter.SetPos(start);
|
||||||
|
for (size_t i = start; i < end; ++i) {
|
||||||
|
output_addr[i] = input_addr_a[iter.GetInputPosA()] + input_addr_b[iter.GetInputPosB()];
|
||||||
|
iter.GenNextPos();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
CPUKernelUtils::ParallelFor(task, output_size);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void TensorAddCPUKernel::NormalProcess(const float *input_a, const float *input_b, float *output, size_t size) {
|
|
||||||
auto task = [output, input_a, input_b](size_t start, size_t end) {
|
|
||||||
for (size_t i = start; i < end; ++i) {
|
|
||||||
output[i] = input_a[i] + input_b[i];
|
|
||||||
}
|
|
||||||
};
|
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
|
||||||
}
|
|
||||||
|
|
||||||
void TensorAddCPUKernel::BroadcastProcess(const float *input_a, const float *input_b, float *output, size_t size) {
|
|
||||||
// Broadcast shape
|
|
||||||
int dimension = output_shape_.size();
|
|
||||||
int input_dimension_a = input_shape_a_.size();
|
|
||||||
if (input_dimension_a < dimension) {
|
|
||||||
input_shape_a_.insert(input_shape_a_.begin(), dimension - input_dimension_a, 1);
|
|
||||||
}
|
|
||||||
int input_dimension_b = input_shape_b_.size();
|
|
||||||
if (input_dimension_b < dimension) {
|
|
||||||
input_shape_b_.insert(input_shape_b_.begin(), dimension - input_dimension_b, 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Calculate strides
|
|
||||||
CalculateStrides(input_shape_a_, &input_strides_a_);
|
|
||||||
CalculateStrides(input_shape_b_, &input_strides_b_);
|
|
||||||
|
|
||||||
auto task = [this, input_a, input_b, output](size_t start, size_t end) {
|
|
||||||
Iterator iter(input_shape_a_, input_shape_b_, output_shape_, input_strides_a_, input_strides_b_, start);
|
|
||||||
std::array<size_t, 2> position{0};
|
|
||||||
for (size_t i = start; i < end; ++i) {
|
|
||||||
iter.GenPoints(&position);
|
|
||||||
output[i] = input_a[position[0]] + input_b[position[1]];
|
|
||||||
iter.UpdateCoordinates();
|
|
||||||
}
|
|
||||||
};
|
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
|
||||||
}
|
|
||||||
|
|
||||||
void TensorAddCPUKernel::CalculateStrides(const std::vector<size_t> &shape, std::vector<size_t> *strides) {
|
|
||||||
strides->resize(shape.size(), 1);
|
|
||||||
for (int i = shape.size() - 2; i >= 0; --i) {
|
|
||||||
(*strides)[i] = shape[i + 1] * (*strides)[i + 1];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
|
@ -34,15 +34,9 @@ class TensorAddCPUKernel : public CPUKernel {
|
||||||
const std::vector<AddressPtr> &outputs) override;
|
const std::vector<AddressPtr> &outputs) override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
static void NormalProcess(const float *input_a, const float *input_b, float *output, size_t size);
|
|
||||||
void BroadcastProcess(const float *input_a, const float *input_b, float *output, size_t size);
|
|
||||||
static void CalculateStrides(const std::vector<size_t> &, std::vector<size_t> *);
|
|
||||||
std::vector<size_t> input_shape_a_;
|
std::vector<size_t> input_shape_a_;
|
||||||
std::vector<size_t> input_shape_b_;
|
std::vector<size_t> input_shape_b_;
|
||||||
// Define follow var for Broadcast
|
|
||||||
std::vector<size_t> output_shape_;
|
std::vector<size_t> output_shape_;
|
||||||
std::vector<size_t> input_strides_a_;
|
|
||||||
std::vector<size_t> input_strides_b_;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
MS_REG_CPU_KERNEL(
|
MS_REG_CPU_KERNEL(
|
||||||
|
|
|
@ -17,21 +17,16 @@
|
||||||
#include "backend/kernel_compiler/cpu/transpose_cpu_kernel.h"
|
#include "backend/kernel_compiler/cpu/transpose_cpu_kernel.h"
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <unordered_set>
|
|
||||||
#include "runtime/device/cpu/cpu_device_address.h"
|
#include "runtime/device/cpu/cpu_device_address.h"
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace kernel {
|
namespace kernel {
|
||||||
namespace {
|
|
||||||
const size_t kMaxDim = 10;
|
|
||||||
}
|
|
||||||
|
|
||||||
void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) {
|
void TransposeCPUFwdKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||||
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||||
output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
|
output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
|
||||||
axes_ = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "perm");
|
auto tmp = AnfAlgo::GetNodeAttr<std::vector<int64_t>>(kernel_node, "perm");
|
||||||
CheckParameter();
|
axes_ = {tmp.begin(), tmp.end()};
|
||||||
dtype_ = AnfAlgo ::GetPrevNodeOutputDeviceDataType(kernel_node, 0);
|
dtype_ = AnfAlgo ::GetPrevNodeOutputDeviceDataType(kernel_node, 0);
|
||||||
if (dtype_ == kTypeUnknown) {
|
if (dtype_ == kTypeUnknown) {
|
||||||
dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
|
dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
|
||||||
|
@ -63,77 +58,22 @@ bool TransposeCPUFwdKernel::Launch(const std::vector<kernel::AddressPtr> &inputs
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void TransposeCPUFwdKernel::CheckParameter() const {
|
|
||||||
if (input_shape_.size() > kMaxDim) {
|
|
||||||
MS_LOG(EXCEPTION) << "Input tensor is " << input_shape_.size() << ", out of bound max dimension 10";
|
|
||||||
}
|
|
||||||
|
|
||||||
if (input_shape_.empty()) {
|
|
||||||
MS_LOG(EXCEPTION) << "Input tensor is empty";
|
|
||||||
}
|
|
||||||
|
|
||||||
if (input_shape_.size() != axes_.size()) {
|
|
||||||
MS_LOG(EXCEPTION) << "Input perm size is not equal with input shape";
|
|
||||||
}
|
|
||||||
|
|
||||||
// Input axes include the same axis
|
|
||||||
std::unordered_set<int64_t> unique_axes{axes_.begin(), axes_.end()};
|
|
||||||
if (unique_axes.size() != axes_.size()) {
|
|
||||||
MS_LOG(EXCEPTION) << "Input perm is illegal, it has the same axis";
|
|
||||||
}
|
|
||||||
|
|
||||||
// Input axes not in ture range(input_shape_.size())
|
|
||||||
int64_t shape_size = input_shape_.size();
|
|
||||||
for (auto &axis : axes_) {
|
|
||||||
if (axis < 0 || axis >= shape_size) {
|
|
||||||
MS_LOG(EXCEPTION) << "Input perm axis is out of bound input shape size";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void TransposeCPUFwdKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
|
void TransposeCPUFwdKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
|
||||||
const std::vector<AddressPtr> &outputs) {
|
const std::vector<AddressPtr> &outputs) {
|
||||||
int dimension = input_shape_.size();
|
auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
|
||||||
// Calculate input tensor strides
|
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
||||||
std::array<uint32_t, kMaxDim> input_strides{0};
|
|
||||||
input_strides[dimension - 1] = 1;
|
|
||||||
for (int i = dimension - 2; i >= 0; --i) {
|
|
||||||
input_strides[i] = input_shape_[i + 1] * input_strides[i + 1];
|
|
||||||
}
|
|
||||||
|
|
||||||
// Calculate output strides and back strides
|
|
||||||
std::array<uint32_t, kMaxDim> strides{0};
|
|
||||||
std::array<uint32_t, kMaxDim> back_strides{0};
|
|
||||||
for (int i = dimension - 1; i >= 0; --i) {
|
|
||||||
strides[i] = input_strides[axes_[i]];
|
|
||||||
back_strides[i] = (output_shape_[i] - 1) * strides[i];
|
|
||||||
}
|
|
||||||
|
|
||||||
std::array<uint32_t, kMaxDim> coordinates{0};
|
|
||||||
auto get_next_pos = [&coordinates, &strides, &back_strides, &dimension, this](int curr_pos) {
|
|
||||||
for (int i = dimension - 1; i >= 0; --i) {
|
|
||||||
if (coordinates[i] + 1 == output_shape_[i]) {
|
|
||||||
coordinates[i] = 0;
|
|
||||||
curr_pos -= back_strides[i];
|
|
||||||
} else {
|
|
||||||
coordinates[i]++;
|
|
||||||
curr_pos += strides[i];
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return curr_pos;
|
|
||||||
};
|
|
||||||
|
|
||||||
auto input = reinterpret_cast<T *>(inputs[0]->addr);
|
|
||||||
auto output = reinterpret_cast<T *>(outputs[0]->addr);
|
|
||||||
size_t size = IntToSize(inputs[0]->size / sizeof(T));
|
size_t size = IntToSize(inputs[0]->size / sizeof(T));
|
||||||
output[0] = input[0];
|
TransposeIterator base_iter(output_shape_, axes_, input_shape_);
|
||||||
int pos = 0;
|
auto task = [&base_iter, input_addr, output_addr](size_t start, size_t end) {
|
||||||
for (size_t i = 1; i < size; ++i) {
|
auto iter = base_iter;
|
||||||
pos = get_next_pos(pos);
|
iter.SetPos(start);
|
||||||
output[i] = input[pos];
|
for (size_t i = start; i < end; ++i) {
|
||||||
}
|
output_addr[i] = input_addr[iter.GetPos()];
|
||||||
|
iter.GenNextPos();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
CPUKernelUtils::ParallelFor(task, size);
|
||||||
}
|
}
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
|
@ -34,13 +34,12 @@ class TransposeCPUFwdKernel : public CPUKernel {
|
||||||
const std::vector<AddressPtr> &outputs) override;
|
const std::vector<AddressPtr> &outputs) override;
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void CheckParameter() const;
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||||
|
|
||||||
std::vector<size_t> input_shape_;
|
std::vector<size_t> input_shape_;
|
||||||
std::vector<size_t> output_shape_;
|
std::vector<size_t> output_shape_;
|
||||||
std::vector<int64_t> axes_;
|
std::vector<size_t> axes_;
|
||||||
TypeId dtype_{kTypeUnknown};
|
TypeId dtype_{kTypeUnknown};
|
||||||
using TypeKernel =
|
using TypeKernel =
|
||||||
std::function<void(TransposeCPUFwdKernel *, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &)>;
|
std::function<void(TransposeCPUFwdKernel *, const std::vector<AddressPtr> &, const std::vector<AddressPtr> &)>;
|
||||||
|
|
Loading…
Reference in New Issue