acc arithmetic op

fangzehua 2022-10-27 16:18:58 +08:00
parent 527906dbd0
commit ec43c8b63b
9 changed files with 271 additions and 339 deletions

View File

@@ -139,7 +139,7 @@ mindspore.set_context
- **enable_compile_cache** (bool) - Whether to load or save the graph compiled by the frontend. When `enable_compile_cache` is set to True, a hardware-independent compile cache is generated during the first execution and exported as a MINDIR file. When the network is executed again, the compile cache is loaded if `enable_compile_cache` is still True and the network script has not changed. Note that only limited automatic detection of Python script changes is currently supported, so there is a correctness risk. Default: False. This is an experimental feature that may be changed or removed.
- **compile_cache_path** (str) - The path for saving the frontend graph compile cache. Default: ".". If the directory does not exist, it is created automatically. The cache is saved under `compile_cache_path/rank_${rank_id}/`, where `rank_id` is the ID of the current device in the cluster.
- **inter_op_parallel_num** (int) - Controls the number of operators executed in parallel. Default: 0, which means the framework chooses the value.
- **runtime_num_threads** (int) - The number of threads in the thread pool used by runtime actors and CPU operator kernels, which must be greater than 0. Default: 30. If you run multiple processes at the same time, set this value smaller to avoid thread contention.
- **runtime_num_threads** (int) - The number of threads in the thread pool used by runtime actors and CPU operator kernels, which must be greater than or equal to 0. Default: 30. If you run multiple processes at the same time, set this value smaller to avoid thread contention.
- **disable_format_transform** (bool) - Whether to disable the automatic NCHW-to-NHWC format transform. When the training performance of an fp16 network is worse than that of fp32, `disable_format_transform` can be set to True to try to improve performance. Default: False.
- **support_binary** (bool) - Whether to support running .pyc or .so files in graph mode. To do so, set `support_binary` to True and run the .py file once so that the interface source code is saved into the interface definition .py file (which must therefore be writable); the .py file can then be compiled into a .pyc or .so file and run in graph mode.
- **memory_optimize_level** (str) - The memory optimize level. Default: "O0". Its value must be one of ['O0', 'O1'].
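
For reference, a minimal usage sketch of the options above (the values shown are illustrative, not recommendations):

import mindspore as ms

ms.set_context(
    enable_compile_cache=True,      # export/reuse the frontend compile cache (MINDIR)
    compile_cache_path="./cache",   # cache lands in ./cache/rank_${rank_id}/
    inter_op_parallel_num=0,        # 0 lets the framework choose
    runtime_num_threads=8,          # keep small when many processes share the host
    memory_optimize_level="O0",
)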

View File

@@ -28,6 +28,7 @@
#include "plugin/device/cpu/kernel/nnacl/fp32/mul_fp32.h"
#include "plugin/device/cpu/kernel/nnacl/fp32/power_fp32.h"
#include "plugin/device/cpu/kernel/nnacl/fp32/sub_fp32.h"
#include "plugin/device/cpu/kernel/nnacl/fp32/add_fp32.h"
namespace mindspore {
namespace kernel {
@@ -130,6 +131,7 @@ class ArithmeticCpuTypeFunc : public CpuKernelFunc {
CPUKernelUtils::GetElementNumEveryDim(input_shape2_, &input_element_num2_);
output_element_num_.clear();
CPUKernelUtils::GetElementNumEveryDim(output_shape_, &output_element_num_);
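// The shapes may have changed in this Resize, so invalidate the cached broadcast index tables.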
is_init_broadcast_ = false;
return KRET_OK;
}
@@ -180,6 +182,20 @@ class ArithmeticCpuTypeFunc : public CpuKernelFunc {
}
private:
void InitBroadCast() {
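// Walk the broadcast iterator once and cache, for every output element, the flat offsets into both
// inputs; each launch can then gather through these tables instead of stepping its own iterator.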
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
base_iter.SetPos(0);
input_index1_.clear();
input_index2_.clear();
input_index1_.resize(output_size_);
input_index2_.resize(output_size_);
for (size_t i = 0; i < output_size_; i++) {
input_index1_[i] = base_iter.GetInputPosA();
input_index2_[i] = base_iter.GetInputPosB();
base_iter.GenNextPos();
}
is_init_broadcast_ = true;
}
void InitComputeFunc() {
if (kernel_name_ == kAssignAdd || kernel_name_ == kAssignSub) {
return;
@@ -229,10 +245,13 @@ class ArithmeticCpuTypeFunc : public CpuKernelFunc {
ShapeVector input_shape1_;
ShapeVector input_shape2_;
std::vector<size_t> input_index1_;
std::vector<size_t> input_index2_;
std::vector<size_t> input_element_num1_;
std::vector<size_t> input_element_num2_;
ShapeVector output_shape_;
std::vector<size_t> output_element_num_;
bool is_init_broadcast_{false};
using TypeComputeFunc = std::function<void(ArithmeticCpuTypeFunc *, const T *in_x, const T *in_y, T *out)>;
TypeComputeFunc compute_func_{nullptr};
@@ -268,13 +287,32 @@ void ArithmeticCpuTypeFunc<T>::AssignSub(T *input1, const T *input2, T *out) {
template <typename T>
void ArithmeticCpuTypeFunc<T>::Add(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if constexpr (std::is_same_v<T, float>) {
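// Float fast paths: identical shapes take the vectorized nnacl ElementAdd, and a single-element
// input paired with a tensor takes ElementOptAdd; only general broadcasting falls through below.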
if (input_shape1_ == input_shape2_) {
auto task = [input1, input2, out](size_t start, size_t end) {
(void)ElementAdd(input1 + start, input2 + start, out + start, end - start);
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
return;
}
if (op_para_.in_elements_num0_ == 1 || op_para_.in_elements_num1_ == 1) {
auto task = [this, input1, input2, out](size_t start, size_t end) {
if (op_para_.in_elements_num0_ == 1) {
(void)ElementOptAdd(input1, input2 + start, out + start, end - start, &op_para_);
} else {
(void)ElementOptAdd(input1 + start, input2, out + start, end - start, &op_para_);
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
return;
}
}
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
out[i] = static_cast<T>(input1[iter.GetInputPosA()] + input2[iter.GetInputPosB()]);
iter.GenNextPos();
out[i] = static_cast<T>(input1[input_index1_[i]] + input2[input_index2_[i]]);
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
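
The same pattern — broadcast offsets computed once, then reused by every elementwise task — repeats in the functions below. A minimal NumPy sketch of the idea (the helper name and shapes are illustrative, not part of the kernel):

import numpy as np

def build_broadcast_indices(in_shape, out_shape):
    """Flat input offset for every flat output position, computed once."""
    rank = len(out_shape)
    padded = [1] * (rank - len(in_shape)) + list(in_shape)
    coords = np.indices(out_shape).reshape(rank, -1)  # output coordinates
    coords = coords % np.array(padded)[:, None]       # size-1 dims always map to 0
    strides = np.array([int(np.prod(padded[i + 1:])) for i in range(rank)])
    return (strides[:, None] * coords).sum(axis=0)

a = np.arange(6, dtype=np.float32).reshape(2, 3)
b = np.arange(3, dtype=np.float32).reshape(1, 3)
ia = build_broadcast_indices(a.shape, (2, 3))  # plays the role of input_index1_
ib = build_broadcast_indices(b.shape, (2, 3))  # plays the role of input_index2_
out = a.ravel()[ia] + b.ravel()[ib]            # out[i] = a[ia[i]] + b[ib[i]]
assert np.allclose(out.reshape(2, 3), a + b)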
@@ -282,13 +320,12 @@ void ArithmeticCpuTypeFunc<T>::Add(const T *input1, const T *input2, T *out) {
template <typename T>
void ArithmeticCpuTypeFunc<T>::AddV2(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
out[i] = static_cast<T>(input1[iter.GetInputPosA()] + input2[iter.GetInputPosB()]);
iter.GenNextPos();
out[i] = static_cast<T>(input1[input_index1_[i]] + input2[input_index2_[i]]);
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
@@ -316,14 +353,12 @@ void ArithmeticCpuTypeFunc<T>::Sub(const T *input1, const T *input2, T *out) {
return;
}
}
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; i++) {
out[i] = static_cast<T>(input1[iter.GetInputPosA()] - input2[iter.GetInputPosB()]);
iter.GenNextPos();
out[i] = static_cast<T>(input1[input_index1_[i]] - input2[input_index2_[i]]);
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
@@ -351,17 +386,16 @@ void ArithmeticCpuTypeFunc<T>::Mul(const T *input1, const T *input2, T *out) {
return;
}
}
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
if constexpr (std::is_same_v<T, bool>) {
out[i] = static_cast<T>(input1[iter.GetInputPosA()] && input2[iter.GetInputPosB()]);
out[i] = static_cast<T>(input1[input_index1_[i]] && input2[input_index2_[i]]);
} else {
out[i] = static_cast<T>(input1[iter.GetInputPosA()] * input2[iter.GetInputPosB()]);
out[i] = static_cast<T>(input1[input_index1_[i]] * input2[input_index2_[i]]);
}
iter.GenNextPos();
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
@@ -391,14 +425,14 @@ void ArithmeticCpuTypeFunc<T>::RealDiv(const T *input1, const T *input2, T *out)
return;
}
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto dividend = input1[iter.GetInputPosA()];
auto divisor = input2[iter.GetInputPosB()];
iter.GenNextPos();
auto dividend = input1[input_index1_[i]];
auto divisor = input2[input_index2_[i]];
auto zero = static_cast<T>(0);
if (divisor == zero) {
if (dividend == zero) {
@@ -442,14 +476,14 @@ void ArithmeticCpuTypeFunc<T>::RealDivComplex(const T *input1, const T *input2,
return;
}
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto dividend = input1[iter.GetInputPosA()];
auto divisor = input2[iter.GetInputPosB()];
iter.GenNextPos();
auto dividend = input1[input_index1_[i]];
auto divisor = input2[input_index2_[i]];
auto zero = static_cast<T>(0);
if (divisor == zero) {
out[i] = std::numeric_limits<T>::quiet_NaN();
@@ -463,14 +497,14 @@ void ArithmeticCpuTypeFunc<T>::RealDivComplex(const T *input1, const T *input2,
template <typename T>
void ArithmeticCpuTypeFunc<T>::Div(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto dividend = input1[iter.GetInputPosA()];
auto divisor = input2[iter.GetInputPosB()];
iter.GenNextPos();
auto dividend = input1[input_index1_[i]];
auto divisor = input2[input_index2_[i]];
auto zero = static_cast<T>(0);
if (divisor == zero) {
if (dividend == zero) {
@@ -492,14 +526,14 @@ void ArithmeticCpuTypeFunc<T>::Div(const T *input1, const T *input2, T *out) {
template <typename T>
void ArithmeticCpuTypeFunc<T>::DivComplex(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto dividend = input1[iter.GetInputPosA()];
auto divisor = input2[iter.GetInputPosB()];
iter.GenNextPos();
auto dividend = input1[input_index1_[i]];
auto divisor = input2[input_index2_[i]];
auto zero = static_cast<T>(0);
if (divisor == zero) {
if (dividend == zero) {
@@ -516,14 +550,14 @@ void ArithmeticCpuTypeFunc<T>::DivComplex(const T *input1, const T *input2, T *o
template <typename T>
void ArithmeticCpuTypeFunc<T>::DivNoNan(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto dividend = input1[iter.GetInputPosA()];
auto divisor = input2[iter.GetInputPosB()];
iter.GenNextPos();
auto dividend = input1[input_index1_[i]];
auto divisor = input2[input_index2_[i]];
auto zero = static_cast<T>(0);
if constexpr (std::is_same_v<T, double>) {
if (common::IsDoubleEqual(divisor, zero)) {
@@ -551,14 +585,14 @@ void ArithmeticCpuTypeFunc<T>::DivNoNan(const T *input1, const T *input2, T *out
template <typename T>
void ArithmeticCpuTypeFunc<T>::FloorDiv(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto dividend = input1[iter.GetInputPosA()];
auto divisor = input2[iter.GetInputPosB()];
iter.GenNextPos();
auto dividend = input1[input_index1_[i]];
auto divisor = input2[input_index2_[i]];
auto zero = static_cast<T>(0);
if (divisor == zero) {
if (dividend == zero) {
@@ -580,14 +614,14 @@ void ArithmeticCpuTypeFunc<T>::FloorDiv(const T *input1, const T *input2, T *out
template <typename T>
void ArithmeticCpuTypeFunc<T>::Mod(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto x = static_cast<double>(input1[iter.GetInputPosA()]);
auto y = static_cast<double>(input2[iter.GetInputPosB()]);
iter.GenNextPos();
auto x = static_cast<double>(input1[input_index1_[i]]);
auto y = static_cast<double>(input2[input_index2_[i]]);
auto data_div = x / y;
auto data_div_min = data_div < 0.0 ? data_div : 0.0;
auto data_div_max = data_div > 0.0 ? data_div : 0.0;
@@ -602,14 +636,14 @@ void ArithmeticCpuTypeFunc<T>::Mod(const T *input1, const T *input2, T *out) {
template <typename T>
void ArithmeticCpuTypeFunc<T>::FloorMod(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto x = static_cast<double>(input1[iter.GetInputPosA()]);
auto y = static_cast<double>(input2[iter.GetInputPosB()]);
iter.GenNextPos();
auto x = static_cast<double>(input1[input_index1_[i]]);
auto y = static_cast<double>(input2[input_index2_[i]]);
auto res = x - floor(x / y) * y;
out[i] = static_cast<T>((std::abs(res) > 1e-9) && ((res < 0.0) != (y < 0.0)) ? res + y : res);
}
@@ -650,70 +684,63 @@ void ArithmeticCpuTypeFunc<T>::Pow(const T *input1, const T *input2, T *out) {
}
}
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
if (!is_init_broadcast_) {
InitBroadCast();
}
if (output_size_ > kMaxPowSerialSize) {
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto x = static_cast<double>(input1[iter.GetInputPosA()]);
auto y = static_cast<double>(input2[iter.GetInputPosB()]);
auto x = static_cast<double>(input1[input_index1_[i]]);
auto y = static_cast<double>(input2[input_index2_[i]]);
out[i] = static_cast<T>(std::pow(x, y));
iter.GenNextPos();
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
} else {
base_iter.SetPos(0);
for (size_t i = 0; i < output_size_; i++) {
auto sx = static_cast<double>(input1[base_iter.GetInputPosA()]);
auto sy = static_cast<double>(input2[base_iter.GetInputPosB()]);
auto sx = static_cast<double>(input1[input_index1_[i]]);
auto sy = static_cast<double>(input2[input_index2_[i]]);
out[i] = static_cast<T>(std::pow(sx, sy));
base_iter.GenNextPos();
}
}
}
template <typename T>
void ArithmeticCpuTypeFunc<T>::PowComplex(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
if (!is_init_broadcast_) {
InitBroadCast();
}
if (output_size_ > kMaxPowSerialSize) {
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto x = (input1[iter.GetInputPosA()]);
auto y = (input2[iter.GetInputPosB()]);
auto x = (input1[input_index1_[i]]);
auto y = (input2[input_index2_[i]]);
out[i] = static_cast<T>(std::pow(x, y));
iter.GenNextPos();
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
} else {
base_iter.SetPos(0);
for (size_t i = 0; i < output_size_; i++) {
auto sx = (input1[base_iter.GetInputPosA()]);
auto sy = (input2[base_iter.GetInputPosB()]);
auto sx = (input1[input_index1_[i]]);
auto sy = (input2[input_index2_[i]]);
out[i] = static_cast<T>(std::pow(sx, sy));
base_iter.GenNextPos();
}
}
}
template <typename T>
void ArithmeticCpuTypeFunc<T>::SquaredDifference(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
T diff = input1[iter.GetInputPosA()] - input2[iter.GetInputPosB()];
T diff = input1[input_index1_[i]] - input2[input_index2_[i]];
if constexpr (std::is_same_v<T, bool>) {
out[i] = static_cast<T>(diff);
} else {
out[i] = static_cast<T>(diff * diff);
}
iter.GenNextPos();
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
@@ -721,14 +748,13 @@ void ArithmeticCpuTypeFunc<T>::SquaredDifference(const T *input1, const T *input
template <typename T>
void ArithmeticCpuTypeFunc<T>::SquaredDifferenceComplex(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
T diff = input1[iter.GetInputPosA()] - input2[iter.GetInputPosB()];
T diff = input1[input_index1_[i]] - input2[input_index2_[i]];
out[i] = static_cast<T>(std::conj(diff) * diff);
iter.GenNextPos();
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
@@ -736,15 +762,15 @@ void ArithmeticCpuTypeFunc<T>::SquaredDifferenceComplex(const T *input1, const T
template <typename T>
void ArithmeticCpuTypeFunc<T>::Xlogy(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto x1 = input1[iter.GetInputPosA()];
auto x2 = input2[iter.GetInputPosB()];
auto x1 = input1[input_index1_[i]];
auto x2 = input2[input_index2_[i]];
auto logx2 = log(x2);
iter.GenNextPos();
if constexpr (std::is_same_v<T, bool>) {
out[i] = static_cast<T>(x1 && static_cast<bool>(logx2));
} else {
@@ -757,14 +783,13 @@ void ArithmeticCpuTypeFunc<T>::Xlogy(const T *input1, const T *input2, T *out) {
template <typename T>
void ArithmeticCpuTypeFunc<T>::Atan2(const T *input1, const T *input2, T *out) {
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
if (!is_init_broadcast_) {
InitBroadCast();
}
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
out[i] = static_cast<T>(
atan2(static_cast<double>(input1[iter.GetInputPosA()]), static_cast<double>(input2[iter.GetInputPosB()])));
iter.GenNextPos();
atan2(static_cast<double>(input1[input_index1_[i]]), static_cast<double>(input2[input_index2_[i]])));
}
};
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
@@ -776,6 +801,41 @@ std::shared_ptr<CpuKernelFunc> SpecializeArithFunc() {
}
using ArithmeticCpuFuncCreator = std::function<std::shared_ptr<CpuKernelFunc>()>;
static std::map<std::string, std::vector<std::pair<KernelAttr, ArithmeticCpuFuncCreator>>> kernel_attr_list = {
{kAdd,
{{KernelAttr().AddInputAttr(kNumberTypeInt8).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
SpecializeArithFunc<int8_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
SpecializeArithFunc<int16_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
SpecializeArithFunc<int32_t>},
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
SpecializeArithFunc<float>},
{KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
SpecializeArithFunc<int64_t>},
{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
SpecializeArithFunc<double>},
{KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
SpecializeArithFunc<uint8_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt16).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
SpecializeArithFunc<uint16_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt32).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
SpecializeArithFunc<uint32_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt64).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
SpecializeArithFunc<uint64_t>},
{KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
SpecializeArithFunc<bool>},
{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
SpecializeArithFunc<float16>},
{KernelAttr()
.AddInputAttr(kNumberTypeComplex64)
.AddInputAttr(kNumberTypeComplex64)
.AddOutputAttr(kNumberTypeComplex64),
SpecializeArithFunc<complex64>},
{KernelAttr()
.AddInputAttr(kNumberTypeComplex128)
.AddInputAttr(kNumberTypeComplex128)
.AddOutputAttr(kNumberTypeComplex128),
SpecializeArithFunc<complex128>}}},
{kSub,
{{KernelAttr().AddInputAttr(kNumberTypeInt8).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
SpecializeArithFunc<int8_t>},
@@ -1182,7 +1242,8 @@ std::vector<KernelAttr> ArithmeticCpuKernelMod::GetOpSupport() {
return support_list;
}
MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, Add,
[]() { return std::make_shared<ArithmeticCpuKernelMod>(kAdd); });
MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, Sub,
[]() { return std::make_shared<ArithmeticCpuKernelMod>(kSub); });
MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, Mul,

View File

@@ -25,6 +25,7 @@
#include "plugin/device/cpu/hal/device/cpu_device_address.h"
#include "utils/check_convert_utils.h"
#include "ops/reduce.h"
#include "plugin/device/cpu/kernel/nnacl/errorcode.h"
namespace mindspore {
namespace kernel {
@@ -60,6 +61,8 @@ class ReduceCpuKernelFunc : public CpuKernelFunc {
void AccelerateLongVector(T *input_addr, T *output_addr, size_t input_size);
void ChooseFunc(const std::string &kernel_name_);
void HandleInputAxis();
void SpecialExcute();
void CalAxesAndStride(std::vector<size_t> *axes, size_t *stride);
enum class ReduceFuncType {
kReduceAllType,
@@ -179,6 +182,26 @@ void ReduceAny(const T *in, T *out, size_t start, size_t end, TransposeIterator
}
}
template <typename T>
void ReduceCpuKernelFunc<T>::SpecialExcute() {
// Special acceleration when axis == [1] and the input has 2 dims.
if ((reduce_type_ == ReduceFuncType::kReduceMeanType || reduce_type_ == ReduceFuncType::kReduceSumType) &&
axis_.size() == 1 && axis_[0] == 1 && input_shape_.size() == kDim2) {
simple_execute_ = true;
}
// Special acceleration when axis[0] == 0 and every other reduced dim has size 1.
if (reduce_type_ == ReduceFuncType::kReduceSumType && axis_.size() >= 1 && axis_[0] == 0 &&
input_shape_.size() >= kDim2) {
simple_execute_ = true;
for (size_t i = 1; i < axis_.size(); ++i) {
if (static_cast<int64_t>(input_shape_.size()) > axis_[i] && input_shape_[axis_[i]] != 1) {
simple_execute_ = false;
break;
}
}
}
}
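
As a quick sanity check of what the two fast paths compute, here is an illustrative NumPy equivalent (shapes are hypothetical):

import numpy as np

x = np.arange(12, dtype=np.float32).reshape(3, 4)
# Path 1: axis == [1] on a 2-D input reduces each contiguous row.
assert np.allclose(x.sum(axis=1), [row.sum() for row in x])
# Path 2: axis[0] == 0 and every other reduced dim has size 1, i.e. column sums.
y = x.reshape(3, 1, 4)
assert np.allclose(y.sum(axis=(0, 1)), x.sum(axis=0))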
template <typename T>
void ReduceCpuKernelFunc<T>::HandleInputAxis() {
int64_t dimension = SizeToLong(input_shape_.size());
@@ -201,12 +224,8 @@ void ReduceCpuKernelFunc<T>::HandleInputAxis() {
sort(axis_.begin(), axis_.end());
auto last = std::unique(axis_.begin(), axis_.end());
axis_.erase(last, axis_.end());
// special accelerate for axis = 1 and input has 2 dims
if constexpr (std::is_same<T, float>::value) {
if ((reduce_type_ == ReduceFuncType::kReduceMeanType || reduce_type_ == ReduceFuncType::kReduceSumType) &&
axis_.size() == 1 && axis_[0] == 1 && input_shape_.size() == kDim2) {
simple_execute_ = true;
}
SpecialExcute();
}
}
@@ -285,6 +304,26 @@ void ReduceCpuKernelFunc<T>::InitFunc(const BaseOperatorPtr &base_operator, cons
ChooseFunc(kernel_name_);
}
template <typename T>
void ReduceCpuKernelFunc<T>::CalAxesAndStride(std::vector<size_t> *axes, size_t *stride) {
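// Transpose order: non-reduced dims come first (in their original order), reduced dims last;
// stride accumulates the product of the reduced dim sizes.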
int dimension = SizeToInt(input_shape_.size());
size_t j = 0;
size_t k = 0;
for (int i = 0; i < dimension; ++i) {
if (j == axis_.size() || i != axis_[j]) {
(*axes)[k] = IntToSize(i);
++k;
} else {
*stride *= LongToSize(input_shape_[IntToSize(i)]);
++j;
}
}
for (auto &it : axis_) {
(*axes)[k] = IntToSize(it);
++k;
}
}
template <typename T>
bool ReduceCpuKernelFunc<T>::RunFunc(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
@@ -314,43 +353,41 @@ bool ReduceCpuKernelFunc<T>::RunFunc(const std::vector<kernel::AddressPtr> &inpu
}
} else {
// Calculate transpose axes and stride
int dimension = SizeToInt(input_shape_.size());
size_t stride = 1;
std::vector<size_t> axes(input_shape_.size());
size_t j = 0;
size_t k = 0;
for (int i = 0; i < dimension; ++i) {
if (j == axis_.size() || i != axis_[j]) {
axes[k] = i;
++k;
} else {
stride *= LongToSize(input_shape_[IntToSize(i)]);
++j;
}
}
for (auto &it : axis_) {
axes[k] = it;
++k;
}
CalAxesAndStride(&axes, &stride);
size_t output_size = outputs[0]->size / sizeof(T);
if constexpr (std::is_same<T, float>::value) {
if (simple_execute_) {
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; ++i) {
(void)ReduceSumDim2Axis1(stride, input_addr + i * stride, output_addr + i);
if (reduce_type_ == ReduceFuncType::kReduceMeanType) {
output_addr[i] /= SizeToFloat(stride);
if (axis_[0] == 1) {
auto task = [&](size_t start, size_t end) {
for (size_t i = start; i < end; ++i) {
(void)ReduceSumDim2Axis1(stride, input_addr + i * stride, output_addr + i);
if (reduce_type_ == ReduceFuncType::kReduceMeanType) {
output_addr[i] /= SizeToFloat(stride);
}
}
}
};
ParallelLaunchAutoSearch(task, output_size, this, &parallel_search_info_);
return true;
};
ParallelLaunchAutoSearch(task, output_size, this, &parallel_search_info_);
return true;
} else {
auto task = [&](size_t start, size_t end) {
int ret =
ReduceSumDim2Axis0(end - start, output_size, input_shape_[0], input_addr + start, output_addr + start);
if (ret != NNACL_OK) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', ReduceSumDim2Axis0 failed. Error no: " << ret;
}
};
ParallelLaunchAutoSearch(task, output_size, this, &parallel_search_info_);
return true;
}
}
}
// Calculate transpose shape
std::vector<int64_t> transpose_shape(input_shape_.size());
int dimension = SizeToInt(input_shape_.size());
for (int i = 0; i < dimension; ++i) {
transpose_shape[i] = input_shape_[axes[i]];
}

View File

@@ -1,113 +0,0 @@
/**
* Copyright 2020-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/cpu/kernel/tensoradd_cpu_kernel.h"
#include <algorithm>
#include <functional>
#include <utility>
#include <vector>
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kTensorAddInputsSize = 2;
constexpr size_t kTensorAddOutputsSize = 1;
} // namespace
void TensorAddCpuKernelMod::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
// Init shape and strides
input_shape_a_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
input_shape_b_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
auto kernel_attr = GetKernelAttrFromNode(kernel_node);
auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport());
if (!is_match) {
MS_LOG(EXCEPTION) << "Add does not support this kernel data type: " << kernel_attr;
}
kernel_func_ = func_list_[index].second;
}
template <typename T>
bool TensorAddCpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kTensorAddInputsSize, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kTensorAddOutputsSize, kernel_name_);
T *input_addr_a = reinterpret_cast<T *>(inputs[0]->addr);
T *input_addr_b = reinterpret_cast<T *>(inputs[1]->addr);
T *output_addr = reinterpret_cast<T *>(outputs[0]->addr);
size_t output_size = outputs[0]->size / sizeof(T);
if (input_shape_a_ == input_shape_b_) {
auto task = [output_addr, input_addr_a, input_addr_b](size_t start, size_t end) {
for (size_t i = start; i < end; ++i) {
output_addr[i] = input_addr_a[i] + input_addr_b[i];
}
};
ParallelLaunchAutoSearch(task, output_size, this, &parallel_search_info_);
} else { // Broadcast
BroadcastIterator base_iter(input_shape_a_, input_shape_b_, output_shape_);
auto task = [&base_iter, output_addr, input_addr_a, input_addr_b](size_t start, size_t end) {
auto iter = base_iter;
iter.SetPos(start);
for (size_t i = start; i < end; ++i) {
output_addr[i] = input_addr_a[iter.GetInputPosA()] + input_addr_b[iter.GetInputPosB()];
iter.GenNextPos();
}
};
ParallelLaunchAutoSearch(task, output_size, this, &parallel_search_info_);
}
return true;
}
std::vector<std::pair<KernelAttr, TensorAddCpuKernelMod::AddFunc>> TensorAddCpuKernelMod::func_list_ = {
{KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
&TensorAddCpuKernelMod::LaunchKernel<int64_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
&TensorAddCpuKernelMod::LaunchKernel<int32_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
&TensorAddCpuKernelMod::LaunchKernel<int16_t>},
{KernelAttr().AddInputAttr(kNumberTypeInt8).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
&TensorAddCpuKernelMod::LaunchKernel<int8_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt64).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
&TensorAddCpuKernelMod::LaunchKernel<uint64_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt32).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
&TensorAddCpuKernelMod::LaunchKernel<uint32_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt16).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
&TensorAddCpuKernelMod::LaunchKernel<uint16_t>},
{KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
&TensorAddCpuKernelMod::LaunchKernel<uint8_t>},
{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
&TensorAddCpuKernelMod::LaunchKernel<double>},
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
&TensorAddCpuKernelMod::LaunchKernel<float>},
{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
&TensorAddCpuKernelMod::LaunchKernel<float16>},
{KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
&TensorAddCpuKernelMod::LaunchKernel<bool>}};
std::vector<KernelAttr> TensorAddCpuKernelMod::GetOpSupport() {
std::vector<KernelAttr> support_list;
(void)std::transform(func_list_.begin(), func_list_.end(), std::back_inserter(support_list),
[](const std::pair<KernelAttr, AddFunc> &pair) { return pair.first; });
return support_list;
}
MS_KERNEL_FACTORY_REG(NativeCpuKernelMod, Add, TensorAddCpuKernelMod);
} // namespace kernel
} // namespace mindspore

View File

@@ -1,57 +0,0 @@
/**
* Copyright 2020-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TENSORADD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TENSORADD_CPU_KERNEL_H_
#include <vector>
#include <utility>
#include "plugin/device/cpu/kernel/cpu_kernel.h"
#include "plugin/factory/ms_factory.h"
namespace mindspore {
namespace kernel {
class TensorAddCpuKernelMod : public DeprecatedNativeCpuKernelMod {
public:
TensorAddCpuKernelMod() = default;
~TensorAddCpuKernelMod() override = default;
void InitKernel(const CNodePtr &kernel_node) override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override {
return kernel_func_(this, inputs, workspace, outputs);
}
std::vector<KernelAttr> GetOpSupport() override;
private:
template <typename T>
bool LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &workspace,
const std::vector<kernel::AddressPtr> &outputs);
using AddFunc = std::function<bool(TensorAddCpuKernelMod *, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &)>;
static std::vector<std::pair<KernelAttr, AddFunc>> func_list_;
AddFunc kernel_func_;
std::vector<int64_t> input_shape_a_;
std::vector<int64_t> input_shape_b_;
std::vector<int64_t> output_shape_;
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TENSORADD_CPU_KERNEL_H_

View File

@@ -71,7 +71,6 @@ int ActorMgr::Initialize(bool use_inner_pool, size_t actor_thread_num, size_t ma
return MINDRT_ERROR;
}
inner_pool_->SetActorThreadNum(actor_thread_num);
inner_pool_->DisableOccupiedActorThread();
inner_pool_->SetKernelThreadNum(max_thread_num - actor_thread_num);
}
if (inner_pool_ != nullptr) {

View File

@@ -191,10 +191,15 @@ void Worker::Active(std::vector<TaskSplit> *task_list, int task_id_start, int ta
std::lock_guard<std::mutex> _l(mutex_);
// add the first split to task_ (when the worker is idle) and the rest to the local queue.
status_ = kThreadBusy;
task_id_.store(task_id_start, std::memory_order_relaxed);
THREAD_TEST_TRUE(task_ == nullptr);
task_.store((*task_list)[0].task_, std::memory_order_release);
for (int i = task_id_start + 1; i < task_id_end; ++i) {
Task *task = task_.load(std::memory_order_consume);
int to_atomic_task = 0;
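// Claim the atomic task_ slot only when this worker is currently idle; otherwise every split in
// this batch goes through the local task queue.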
if (task == nullptr) {
task_id_.store(task_id_start, std::memory_order_relaxed);
THREAD_TEST_TRUE(task_ == nullptr);
task_.store((*task_list)[0].task_, std::memory_order_release);
to_atomic_task = 1;
}
for (int i = task_id_start + to_atomic_task; i < task_id_end; ++i) {
while (!local_task_queue_->Enqueue(&(*task_list)[i])) {
}
}

View File

@@ -169,7 +169,7 @@ class MS_CORE_API ThreadPool {
void DisableOccupiedActorThread() { occupied_actor_thread_ = false; }
void SetActorThreadNum(size_t actor_thread_num) { actor_thread_num_ = actor_thread_num; }
void SetKernelThreadNum(size_t kernel_thread_num) { kernel_thread_num_ = kernel_thread_num; }
size_t GetKernelThreadNum() const { return kernel_thread_num_; }
size_t GetKernelThreadNum() const { return kernel_thread_num_ + actor_thread_num_; }
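// The reported kernel-thread count now includes the actor threads as well.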
size_t GetActorThreadNum() const { return actor_thread_num_; }
void SetKernelThreadMaxSpinCount(int spin_count);
void SetSpinCountMaxValue();

View File

@@ -375,8 +375,8 @@ class _Context:
def set_runtime_num_threads(self, runtime_num_threads):
"""Check and set runtime_num_threads."""
if runtime_num_threads <= 0:
raise ValueError("The number of threads must be greater than 0.")
if runtime_num_threads < 0:
raise ValueError("The number of threads must be greater than or equal to 0.")
self.set_param(ms_ctx_param.runtime_num_threads, runtime_num_threads)
def set_op_timeout(self, op_timeout):
@@ -924,7 +924,7 @@ def set_context(**kwargs):
inter_op_parallel_num(int): The number of operators that can execute in parallel at the same time. Default value is 0,
which means the framework uses the default number.
runtime_num_threads(int): The number of thread-pool threads used by CPU kernels at runtime,
which must be greater than 0. Default value is 30; if you run many processes at
which must be greater than or equal to 0. Default value is 30; if you run many processes at
the same time, you should set the value smaller to avoid thread contention.
disable_format_transform (bool): Whether to disable the automatic format transform function from NCHW to NHWC.
When the network training performance of fp16 is worse than fp32,