forked from mindspore-Ecosystem/mindspore
acc arthitic op
This commit is contained in:
parent
527906dbd0
commit
ec43c8b63b
|
@ -139,7 +139,7 @@ mindspore.set_context
|
|||
- **enable_compile_cache** (bool) - 表示是否加载或者保存前端编译的图。当 `enable_compile_cache` 被设置为True时,在第一次执行的过程中,一个硬件无关的编译缓存会被生成并且导出为一个MINDIR文件。当该网络被再次执行时,如果 `enable_compile_cache` 仍然为True并且网络脚本没有被更改,那么这个编译缓存会被加载。注意目前只支持有限的Python脚本更改的自动检测,这意味着可能有正确性风险。默认值:False。这是一个实验特性,可能会被更改或者删除。
|
||||
- **compile_cache_path** (str) - 保存前端图编译缓存的路径。默认值:"."。如果目录不存在,系统会自动创建这个目录。缓存会被保存到如下目录: `compile_cache_path/rank_${rank_id}/` 。 `rank_id` 是集群上当前设备的ID。
|
||||
- **inter_op_parallel_num** (int) - 算子间并行数控制。 默认值为0,表示由框架默认指定。
|
||||
- **runtime_num_threads** (int) - 运行时actor和CPU算子核使用的线程池线程数,必须大于0。默认值为30,如果同时运行多个进程,应将该值设置得小一些,以避免线程争用。
|
||||
- **runtime_num_threads** (int) - 运行时actor和CPU算子核使用的线程池线程数,必须大于等于0。默认值为30,如果同时运行多个进程,应将该值设置得小一些,以避免线程争用。
|
||||
- **disable_format_transform** (bool) - 表示是否取消NCHW到NHWC的自动格式转换功能。当fp16的网络性能不如fp32的时,可以设置 `disable_format_transform` 为True,以尝试提高训练性能。默认值:False。
|
||||
- **support_binary** (bool) - 是否支持在图形模式下运行.pyc或.so。如果要支持在图形模式下运行.so或.pyc,可将 `support_binary` 置为True,并运行一次.py文件,从而将接口源码保存到接口定义.py文件中,因此要保证该文件可写。然后将.py文件编译成.pyc或.so文件,即可在图模式下运行。
|
||||
- **memory_optimize_level** (str) - 内存优化级别,默认值:O0。其值必须在 ['O0', 'O1'] 范围中。
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
#include "plugin/device/cpu/kernel/nnacl/fp32/mul_fp32.h"
|
||||
#include "plugin/device/cpu/kernel/nnacl/fp32/power_fp32.h"
|
||||
#include "plugin/device/cpu/kernel/nnacl/fp32/sub_fp32.h"
|
||||
#include "plugin/device/cpu/kernel/nnacl/fp32/add_fp32.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -130,6 +131,7 @@ class ArithmeticCpuTypeFunc : public CpuKernelFunc {
|
|||
CPUKernelUtils::GetElementNumEveryDim(input_shape2_, &input_element_num2_);
|
||||
output_element_num_.clear();
|
||||
CPUKernelUtils::GetElementNumEveryDim(output_shape_, &output_element_num_);
|
||||
is_init_broadcast_ = false;
|
||||
return KRET_OK;
|
||||
}
|
||||
|
||||
|
@ -180,6 +182,20 @@ class ArithmeticCpuTypeFunc : public CpuKernelFunc {
|
|||
}
|
||||
|
||||
private:
|
||||
void InitBroadCast() {
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
base_iter.SetPos(0);
|
||||
input_index1_.clear();
|
||||
input_index2_.clear();
|
||||
input_index1_.resize(output_size_);
|
||||
input_index2_.resize(output_size_);
|
||||
for (size_t i = 0; i < output_size_; i++) {
|
||||
input_index1_[i] = base_iter.GetInputPosA();
|
||||
input_index2_[i] = base_iter.GetInputPosB();
|
||||
base_iter.GenNextPos();
|
||||
}
|
||||
is_init_broadcast_ = true;
|
||||
}
|
||||
void InitComputeFunc() {
|
||||
if (kernel_name_ == kAssignAdd || kernel_name_ == kAssignSub) {
|
||||
return;
|
||||
|
@ -229,10 +245,13 @@ class ArithmeticCpuTypeFunc : public CpuKernelFunc {
|
|||
|
||||
ShapeVector input_shape1_;
|
||||
ShapeVector input_shape2_;
|
||||
std::vector<size_t> input_index1_;
|
||||
std::vector<size_t> input_index2_;
|
||||
std::vector<size_t> input_element_num1_;
|
||||
std::vector<size_t> input_element_num2_;
|
||||
ShapeVector output_shape_;
|
||||
std::vector<size_t> output_element_num_;
|
||||
bool is_init_broadcast_{false};
|
||||
|
||||
using TypeComputeFunc = std::function<void(ArithmeticCpuTypeFunc *, const T *in_x, const T *in_y, T *out)>;
|
||||
TypeComputeFunc compute_func_{nullptr};
|
||||
|
@ -268,13 +287,32 @@ void ArithmeticCpuTypeFunc<T>::AssignSub(T *input1, const T *input2, T *out) {
|
|||
|
||||
template <typename T>
|
||||
void ArithmeticCpuTypeFunc<T>::Add(const T *input1, const T *input2, T *out) {
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
if constexpr (std::is_same_v<T, float>) {
|
||||
if (input_shape1_ == input_shape2_) {
|
||||
auto task = [input1, input2, out](size_t start, size_t end) {
|
||||
(void)ElementAdd(input1 + start, input2 + start, out + start, end - start);
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_);
|
||||
return;
|
||||
}
|
||||
if (op_para_.in_elements_num0_ == 1 || op_para_.in_elements_num1_ == 1) {
|
||||
auto task = [this, input1, input2, out](size_t start, size_t end) {
|
||||
if (op_para_.in_elements_num0_ == 1) {
|
||||
(void)ElementOptAdd(input1, input2 + start, out + start, end - start, &op_para_);
|
||||
} else {
|
||||
(void)ElementOptAdd(input1 + start, input2, out + start, end - start, &op_para_);
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_);
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
out[i] = static_cast<T>(input1[iter.GetInputPosA()] + input2[iter.GetInputPosB()]);
|
||||
iter.GenNextPos();
|
||||
out[i] = static_cast<T>(input1[input_index1_[i]] + input2[input_index2_[i]]);
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_);
|
||||
|
@ -282,13 +320,12 @@ void ArithmeticCpuTypeFunc<T>::Add(const T *input1, const T *input2, T *out) {
|
|||
|
||||
template <typename T>
|
||||
void ArithmeticCpuTypeFunc<T>::AddV2(const T *input1, const T *input2, T *out) {
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
out[i] = static_cast<T>(input1[iter.GetInputPosA()] + input2[iter.GetInputPosB()]);
|
||||
iter.GenNextPos();
|
||||
out[i] = static_cast<T>(input1[input_index1_[i]] + input2[input_index2_[i]]);
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_);
|
||||
|
@ -316,14 +353,12 @@ void ArithmeticCpuTypeFunc<T>::Sub(const T *input1, const T *input2, T *out) {
|
|||
return;
|
||||
}
|
||||
}
|
||||
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
for (size_t i = start; i < end; i++) {
|
||||
out[i] = static_cast<T>(input1[iter.GetInputPosA()] - input2[iter.GetInputPosB()]);
|
||||
iter.GenNextPos();
|
||||
out[i] = static_cast<T>(input1[input_index1_[i]] - input2[input_index2_[i]]);
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_);
|
||||
|
@ -351,17 +386,16 @@ void ArithmeticCpuTypeFunc<T>::Mul(const T *input1, const T *input2, T *out) {
|
|||
return;
|
||||
}
|
||||
}
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
if constexpr (std::is_same_v<T, bool>) {
|
||||
out[i] = static_cast<T>(input1[iter.GetInputPosA()] && input2[iter.GetInputPosB()]);
|
||||
out[i] = static_cast<T>(input1[input_index1_[i]] && input2[input_index2_[i]]);
|
||||
} else {
|
||||
out[i] = static_cast<T>(input1[iter.GetInputPosA()] * input2[iter.GetInputPosB()]);
|
||||
out[i] = static_cast<T>(input1[input_index1_[i]] * input2[input_index2_[i]]);
|
||||
}
|
||||
iter.GenNextPos();
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_);
|
||||
|
@ -391,14 +425,14 @@ void ArithmeticCpuTypeFunc<T>::RealDiv(const T *input1, const T *input2, T *out)
|
|||
return;
|
||||
}
|
||||
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
auto dividend = input1[iter.GetInputPosA()];
|
||||
auto divisor = input2[iter.GetInputPosB()];
|
||||
iter.GenNextPos();
|
||||
auto dividend = input1[input_index1_[i]];
|
||||
auto divisor = input2[input_index2_[i]];
|
||||
|
||||
auto zero = static_cast<T>(0);
|
||||
if (divisor == zero) {
|
||||
if (dividend == zero) {
|
||||
|
@ -442,14 +476,14 @@ void ArithmeticCpuTypeFunc<T>::RealDivComplex(const T *input1, const T *input2,
|
|||
return;
|
||||
}
|
||||
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
auto dividend = input1[iter.GetInputPosA()];
|
||||
auto divisor = input2[iter.GetInputPosB()];
|
||||
iter.GenNextPos();
|
||||
auto dividend = input1[input_index1_[i]];
|
||||
auto divisor = input2[input_index2_[i]];
|
||||
|
||||
auto zero = static_cast<T>(0);
|
||||
if (divisor == zero) {
|
||||
out[i] = std::numeric_limits<T>::quiet_NaN();
|
||||
|
@ -463,14 +497,14 @@ void ArithmeticCpuTypeFunc<T>::RealDivComplex(const T *input1, const T *input2,
|
|||
|
||||
template <typename T>
|
||||
void ArithmeticCpuTypeFunc<T>::Div(const T *input1, const T *input2, T *out) {
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
auto dividend = input1[iter.GetInputPosA()];
|
||||
auto divisor = input2[iter.GetInputPosB()];
|
||||
iter.GenNextPos();
|
||||
auto dividend = input1[input_index1_[i]];
|
||||
auto divisor = input2[input_index2_[i]];
|
||||
|
||||
auto zero = static_cast<T>(0);
|
||||
if (divisor == zero) {
|
||||
if (dividend == zero) {
|
||||
|
@ -492,14 +526,14 @@ void ArithmeticCpuTypeFunc<T>::Div(const T *input1, const T *input2, T *out) {
|
|||
|
||||
template <typename T>
|
||||
void ArithmeticCpuTypeFunc<T>::DivComplex(const T *input1, const T *input2, T *out) {
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
auto dividend = input1[iter.GetInputPosA()];
|
||||
auto divisor = input2[iter.GetInputPosB()];
|
||||
iter.GenNextPos();
|
||||
auto dividend = input1[input_index1_[i]];
|
||||
auto divisor = input2[input_index2_[i]];
|
||||
|
||||
auto zero = static_cast<T>(0);
|
||||
if (divisor == zero) {
|
||||
if (dividend == zero) {
|
||||
|
@ -516,14 +550,14 @@ void ArithmeticCpuTypeFunc<T>::DivComplex(const T *input1, const T *input2, T *o
|
|||
|
||||
template <typename T>
|
||||
void ArithmeticCpuTypeFunc<T>::DivNoNan(const T *input1, const T *input2, T *out) {
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
auto dividend = input1[iter.GetInputPosA()];
|
||||
auto divisor = input2[iter.GetInputPosB()];
|
||||
iter.GenNextPos();
|
||||
auto dividend = input1[input_index1_[i]];
|
||||
auto divisor = input2[input_index2_[i]];
|
||||
|
||||
auto zero = static_cast<T>(0);
|
||||
if constexpr (std::is_same_v<T, double>) {
|
||||
if (common::IsDoubleEqual(divisor, zero)) {
|
||||
|
@ -551,14 +585,14 @@ void ArithmeticCpuTypeFunc<T>::DivNoNan(const T *input1, const T *input2, T *out
|
|||
|
||||
template <typename T>
|
||||
void ArithmeticCpuTypeFunc<T>::FloorDiv(const T *input1, const T *input2, T *out) {
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
auto dividend = input1[iter.GetInputPosA()];
|
||||
auto divisor = input2[iter.GetInputPosB()];
|
||||
iter.GenNextPos();
|
||||
auto dividend = input1[input_index1_[i]];
|
||||
auto divisor = input2[input_index2_[i]];
|
||||
|
||||
auto zero = static_cast<T>(0);
|
||||
if (divisor == zero) {
|
||||
if (dividend == zero) {
|
||||
|
@ -580,14 +614,14 @@ void ArithmeticCpuTypeFunc<T>::FloorDiv(const T *input1, const T *input2, T *out
|
|||
|
||||
template <typename T>
|
||||
void ArithmeticCpuTypeFunc<T>::Mod(const T *input1, const T *input2, T *out) {
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
auto x = static_cast<double>(input1[iter.GetInputPosA()]);
|
||||
auto y = static_cast<double>(input2[iter.GetInputPosB()]);
|
||||
iter.GenNextPos();
|
||||
auto x = static_cast<double>(input1[input_index1_[i]]);
|
||||
auto y = static_cast<double>(input2[input_index2_[i]]);
|
||||
|
||||
auto data_div = x / y;
|
||||
auto data_div_min = data_div < 0.0 ? data_div : 0.0;
|
||||
auto data_div_max = data_div > 0.0 ? data_div : 0.0;
|
||||
|
@ -602,14 +636,14 @@ void ArithmeticCpuTypeFunc<T>::Mod(const T *input1, const T *input2, T *out) {
|
|||
|
||||
template <typename T>
|
||||
void ArithmeticCpuTypeFunc<T>::FloorMod(const T *input1, const T *input2, T *out) {
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
auto x = static_cast<double>(input1[iter.GetInputPosA()]);
|
||||
auto y = static_cast<double>(input2[iter.GetInputPosB()]);
|
||||
iter.GenNextPos();
|
||||
auto x = static_cast<double>(input1[input_index1_[i]]);
|
||||
auto y = static_cast<double>(input2[input_index2_[i]]);
|
||||
|
||||
auto res = x - floor(x / y) * y;
|
||||
out[i] = static_cast<T>((std::abs(res) > 1e-9) && ((res < 0.0) != (y < 0.0)) ? res + y : res);
|
||||
}
|
||||
|
@ -650,70 +684,63 @@ void ArithmeticCpuTypeFunc<T>::Pow(const T *input1, const T *input2, T *out) {
|
|||
}
|
||||
}
|
||||
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
if (output_size_ > kMaxPowSerialSize) {
|
||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
auto x = static_cast<double>(input1[iter.GetInputPosA()]);
|
||||
auto y = static_cast<double>(input2[iter.GetInputPosB()]);
|
||||
auto x = static_cast<double>(input1[input_index1_[i]]);
|
||||
auto y = static_cast<double>(input2[input_index2_[i]]);
|
||||
out[i] = static_cast<T>(std::pow(x, y));
|
||||
iter.GenNextPos();
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_);
|
||||
} else {
|
||||
base_iter.SetPos(0);
|
||||
for (size_t i = 0; i < output_size_; i++) {
|
||||
auto sx = static_cast<double>(input1[base_iter.GetInputPosA()]);
|
||||
auto sy = static_cast<double>(input2[base_iter.GetInputPosB()]);
|
||||
auto sx = static_cast<double>(input1[input_index1_[i]]);
|
||||
auto sy = static_cast<double>(input2[input_index2_[i]]);
|
||||
out[i] = static_cast<T>(std::pow(sx, sy));
|
||||
base_iter.GenNextPos();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ArithmeticCpuTypeFunc<T>::PowComplex(const T *input1, const T *input2, T *out) {
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
if (output_size_ > kMaxPowSerialSize) {
|
||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
auto x = (input1[iter.GetInputPosA()]);
|
||||
auto y = (input2[iter.GetInputPosB()]);
|
||||
auto x = (input1[input_index1_[i]]);
|
||||
auto y = (input2[input_index2_[i]]);
|
||||
out[i] = static_cast<T>(std::pow(x, y));
|
||||
iter.GenNextPos();
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_);
|
||||
} else {
|
||||
base_iter.SetPos(0);
|
||||
for (size_t i = 0; i < output_size_; i++) {
|
||||
auto sx = (input1[base_iter.GetInputPosA()]);
|
||||
auto sy = (input2[base_iter.GetInputPosB()]);
|
||||
auto sx = (input1[input_index1_[i]]);
|
||||
auto sy = (input2[input_index2_[i]]);
|
||||
out[i] = static_cast<T>(std::pow(sx, sy));
|
||||
base_iter.GenNextPos();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ArithmeticCpuTypeFunc<T>::SquaredDifference(const T *input1, const T *input2, T *out) {
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T diff = input1[iter.GetInputPosA()] - input2[iter.GetInputPosB()];
|
||||
T diff = input1[input_index1_[i]] - input2[input_index2_[i]];
|
||||
if constexpr (std::is_same_v<T, bool>) {
|
||||
out[i] = static_cast<T>(diff);
|
||||
} else {
|
||||
out[i] = static_cast<T>(diff * diff);
|
||||
}
|
||||
iter.GenNextPos();
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_);
|
||||
|
@ -721,14 +748,13 @@ void ArithmeticCpuTypeFunc<T>::SquaredDifference(const T *input1, const T *input
|
|||
|
||||
template <typename T>
|
||||
void ArithmeticCpuTypeFunc<T>::SquaredDifferenceComplex(const T *input1, const T *input2, T *out) {
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
T diff = input1[iter.GetInputPosA()] - input2[iter.GetInputPosB()];
|
||||
T diff = input1[input_index1_[i]] - input2[input_index2_[i]];
|
||||
out[i] = static_cast<T>(std::conj(diff) * diff);
|
||||
iter.GenNextPos();
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_);
|
||||
|
@ -736,15 +762,15 @@ void ArithmeticCpuTypeFunc<T>::SquaredDifferenceComplex(const T *input1, const T
|
|||
|
||||
template <typename T>
|
||||
void ArithmeticCpuTypeFunc<T>::Xlogy(const T *input1, const T *input2, T *out) {
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
auto x1 = input1[iter.GetInputPosA()];
|
||||
auto x2 = input2[iter.GetInputPosB()];
|
||||
auto x1 = input1[input_index1_[i]];
|
||||
auto x2 = input2[input_index2_[i]];
|
||||
auto logx2 = log(x2);
|
||||
iter.GenNextPos();
|
||||
|
||||
if constexpr (std::is_same_v<T, bool>) {
|
||||
out[i] = static_cast<T>(x1 && static_cast<bool>(logx2));
|
||||
} else {
|
||||
|
@ -757,14 +783,13 @@ void ArithmeticCpuTypeFunc<T>::Xlogy(const T *input1, const T *input2, T *out) {
|
|||
|
||||
template <typename T>
|
||||
void ArithmeticCpuTypeFunc<T>::Atan2(const T *input1, const T *input2, T *out) {
|
||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
if (!is_init_broadcast_) {
|
||||
InitBroadCast();
|
||||
}
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
out[i] = static_cast<T>(
|
||||
atan2(static_cast<double>(input1[iter.GetInputPosA()]), static_cast<double>(input2[iter.GetInputPosB()])));
|
||||
iter.GenNextPos();
|
||||
atan2(static_cast<double>(input1[input_index1_[i]]), static_cast<double>(input2[input_index2_[i]])));
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, output_size_, this, ¶llel_search_info_);
|
||||
|
@ -776,6 +801,41 @@ std::shared_ptr<CpuKernelFunc> SpecializeArithFunc() {
|
|||
}
|
||||
using ArithmeticCpuFuncCreator = std::function<std::shared_ptr<CpuKernelFunc>()>;
|
||||
static std::map<std::string, std::vector<std::pair<KernelAttr, ArithmeticCpuFuncCreator>>> kernel_attr_list = {
|
||||
{kAdd,
|
||||
{{KernelAttr().AddInputAttr(kNumberTypeInt8).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
|
||||
SpecializeArithFunc<int8_t>},
|
||||
{KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
|
||||
SpecializeArithFunc<int16_t>},
|
||||
{KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
|
||||
SpecializeArithFunc<int32_t>},
|
||||
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
|
||||
SpecializeArithFunc<float>},
|
||||
{KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
|
||||
SpecializeArithFunc<int64_t>},
|
||||
{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
|
||||
SpecializeArithFunc<double>},
|
||||
{KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
|
||||
SpecializeArithFunc<uint8_t>},
|
||||
{KernelAttr().AddInputAttr(kNumberTypeUInt16).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
|
||||
SpecializeArithFunc<uint16_t>},
|
||||
{KernelAttr().AddInputAttr(kNumberTypeUInt32).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
|
||||
SpecializeArithFunc<uint32_t>},
|
||||
{KernelAttr().AddInputAttr(kNumberTypeUInt64).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
|
||||
SpecializeArithFunc<uint64_t>},
|
||||
{KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
|
||||
SpecializeArithFunc<bool>},
|
||||
{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
|
||||
SpecializeArithFunc<float16>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeComplex64)
|
||||
.AddInputAttr(kNumberTypeComplex64)
|
||||
.AddOutputAttr(kNumberTypeComplex64),
|
||||
SpecializeArithFunc<complex64>},
|
||||
{KernelAttr()
|
||||
.AddInputAttr(kNumberTypeComplex128)
|
||||
.AddInputAttr(kNumberTypeComplex128)
|
||||
.AddOutputAttr(kNumberTypeComplex128),
|
||||
SpecializeArithFunc<complex128>}}},
|
||||
{kSub,
|
||||
{{KernelAttr().AddInputAttr(kNumberTypeInt8).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
|
||||
SpecializeArithFunc<int8_t>},
|
||||
|
@ -1182,7 +1242,8 @@ std::vector<KernelAttr> ArithmeticCpuKernelMod::GetOpSupport() {
|
|||
|
||||
return support_list;
|
||||
}
|
||||
|
||||
MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, Add,
|
||||
[]() { return std::make_shared<ArithmeticCpuKernelMod>(kAdd); });
|
||||
MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, Sub,
|
||||
[]() { return std::make_shared<ArithmeticCpuKernelMod>(kSub); });
|
||||
MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, Mul,
|
||||
|
|
|
@ -25,6 +25,7 @@
|
|||
#include "plugin/device/cpu/hal/device/cpu_device_address.h"
|
||||
#include "utils/check_convert_utils.h"
|
||||
#include "ops/reduce.h"
|
||||
#include "plugin/device/cpu/kernel/nnacl/errorcode.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
|
@ -60,6 +61,8 @@ class ReduceCpuKernelFunc : public CpuKernelFunc {
|
|||
void AccelerateLongVector(T *input_addr, T *output_addr, size_t input_size);
|
||||
void ChooseFunc(const std::string &kernel_name_);
|
||||
void HandleInputAxis();
|
||||
void SpecialExcute();
|
||||
void CalAxesAndStride(std::vector<size_t> *axes, size_t *stride);
|
||||
|
||||
enum class ReduceFuncType {
|
||||
kReduceAllType,
|
||||
|
@ -179,6 +182,26 @@ void ReduceAny(const T *in, T *out, size_t start, size_t end, TransposeIterator
|
|||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ReduceCpuKernelFunc<T>::SpecialExcute() {
|
||||
// special accelerate for axis = 1 and input has 2 dims
|
||||
if ((reduce_type_ == ReduceFuncType::kReduceMeanType || reduce_type_ == ReduceFuncType::kReduceSumType) &&
|
||||
axis_.size() == 1 && axis_[0] == 1 && input_shape_.size() == kDim2) {
|
||||
simple_execute_ = true;
|
||||
}
|
||||
// special accelerate for axis[0] = 0 and other dims for axis is 1.
|
||||
if (reduce_type_ == ReduceFuncType::kReduceSumType && axis_.size() >= 1 && axis_[0] == 0 &&
|
||||
input_shape_.size() >= kDim2) {
|
||||
simple_execute_ = true;
|
||||
for (size_t i = 1; i < axis_.size(); ++i) {
|
||||
if (static_cast<int64_t>(input_shape_.size()) > axis_[i] && input_shape_[axis_[i]] != 1) {
|
||||
simple_execute_ = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ReduceCpuKernelFunc<T>::HandleInputAxis() {
|
||||
int64_t dimension = SizeToLong(input_shape_.size());
|
||||
|
@ -201,12 +224,8 @@ void ReduceCpuKernelFunc<T>::HandleInputAxis() {
|
|||
sort(axis_.begin(), axis_.end());
|
||||
auto last = std::unique(axis_.begin(), axis_.end());
|
||||
axis_.erase(last, axis_.end());
|
||||
// special accelerate for axis = 1 and input has 2 dims
|
||||
if constexpr (std::is_same<T, float>::value) {
|
||||
if ((reduce_type_ == ReduceFuncType::kReduceMeanType || reduce_type_ == ReduceFuncType::kReduceSumType) &&
|
||||
axis_.size() == 1 && axis_[0] == 1 && input_shape_.size() == kDim2) {
|
||||
simple_execute_ = true;
|
||||
}
|
||||
SpecialExcute();
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -285,6 +304,26 @@ void ReduceCpuKernelFunc<T>::InitFunc(const BaseOperatorPtr &base_operator, cons
|
|||
ChooseFunc(kernel_name_);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ReduceCpuKernelFunc<T>::CalAxesAndStride(std::vector<size_t> *axes, size_t *stride) {
|
||||
int dimension = SizeToInt(input_shape_.size());
|
||||
size_t j = 0;
|
||||
size_t k = 0;
|
||||
for (int i = 0; i < dimension; ++i) {
|
||||
if (j == axis_.size() || i != axis_[j]) {
|
||||
(*axes)[k] = IntToSize(i);
|
||||
++k;
|
||||
} else {
|
||||
*stride *= LongToSize(input_shape_[IntToSize(i)]);
|
||||
++j;
|
||||
}
|
||||
}
|
||||
for (auto &it : axis_) {
|
||||
(*axes)[k] = IntToSize(it);
|
||||
++k;
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool ReduceCpuKernelFunc<T>::RunFunc(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> &,
|
||||
|
@ -314,43 +353,41 @@ bool ReduceCpuKernelFunc<T>::RunFunc(const std::vector<kernel::AddressPtr> &inpu
|
|||
}
|
||||
} else {
|
||||
// Calculate transpose axes and stride
|
||||
int dimension = SizeToInt(input_shape_.size());
|
||||
size_t stride = 1;
|
||||
std::vector<size_t> axes(input_shape_.size());
|
||||
size_t j = 0;
|
||||
size_t k = 0;
|
||||
for (int i = 0; i < dimension; ++i) {
|
||||
if (j == axis_.size() || i != axis_[j]) {
|
||||
axes[k] = i;
|
||||
++k;
|
||||
} else {
|
||||
stride *= LongToSize(input_shape_[IntToSize(i)]);
|
||||
++j;
|
||||
}
|
||||
}
|
||||
for (auto &it : axis_) {
|
||||
axes[k] = it;
|
||||
++k;
|
||||
}
|
||||
CalAxesAndStride(&axes, &stride);
|
||||
|
||||
size_t output_size = outputs[0]->size / sizeof(T);
|
||||
if constexpr (std::is_same<T, float>::value) {
|
||||
if (simple_execute_) {
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; ++i) {
|
||||
(void)ReduceSumDim2Axis1(stride, input_addr + i * stride, output_addr + i);
|
||||
if (reduce_type_ == ReduceFuncType::kReduceMeanType) {
|
||||
output_addr[i] /= SizeToFloat(stride);
|
||||
if (axis_[0] == 1) {
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; ++i) {
|
||||
(void)ReduceSumDim2Axis1(stride, input_addr + i * stride, output_addr + i);
|
||||
if (reduce_type_ == ReduceFuncType::kReduceMeanType) {
|
||||
output_addr[i] /= SizeToFloat(stride);
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, output_size, this, ¶llel_search_info_);
|
||||
return true;
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, output_size, this, ¶llel_search_info_);
|
||||
return true;
|
||||
} else {
|
||||
auto task = [&](size_t start, size_t end) {
|
||||
int ret =
|
||||
ReduceSumDim2Axis0(end - start, output_size, input_shape_[0], input_addr + start, output_addr + start);
|
||||
if (ret != NNACL_OK) {
|
||||
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', ReduceSumDim2Axis0 failed.Error no: " << ret;
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, output_size, this, ¶llel_search_info_);
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Calculate transpose shape
|
||||
std::vector<int64_t> transpose_shape(input_shape_.size());
|
||||
int dimension = SizeToInt(input_shape_.size());
|
||||
for (int i = 0; i < dimension; ++i) {
|
||||
transpose_shape[i] = input_shape_[axes[i]];
|
||||
}
|
||||
|
|
|
@ -1,113 +0,0 @@
|
|||
/**
|
||||
* Copyright 2020-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "plugin/device/cpu/kernel/tensoradd_cpu_kernel.h"
|
||||
#include <algorithm>
|
||||
#include <functional>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace {
// Expected operand counts for the Add kernel: two inputs (lhs, rhs), one output.
constexpr size_t kTensorAddInputsSize = 2;
constexpr size_t kTensorAddOutputsSize = 1;
}  // namespace
|
||||
|
||||
void TensorAddCpuKernelMod::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
kernel_name_ = common::AnfAlgo::GetCNodeName(kernel_node);
|
||||
// Init shape and strides
|
||||
input_shape_a_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
input_shape_b_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
|
||||
output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
|
||||
|
||||
auto kernel_attr = GetKernelAttrFromNode(kernel_node);
|
||||
auto [is_match, index] = MatchKernelAttr(kernel_attr, GetOpSupport());
|
||||
if (!is_match) {
|
||||
MS_LOG(EXCEPTION) << "Add does not support this kernel data type: " << kernel_attr;
|
||||
}
|
||||
kernel_func_ = func_list_[index].second;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool TensorAddCpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> &,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kTensorAddInputsSize, kernel_name_);
|
||||
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kTensorAddOutputsSize, kernel_name_);
|
||||
T *input_addr_a = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
T *input_addr_b = reinterpret_cast<T *>(inputs[1]->addr);
|
||||
T *output_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
size_t output_size = outputs[0]->size / sizeof(T);
|
||||
if (input_shape_a_ == input_shape_b_) {
|
||||
auto task = [output_addr, input_addr_a, input_addr_b](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; ++i) {
|
||||
output_addr[i] = input_addr_a[i] + input_addr_b[i];
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, output_size, this, ¶llel_search_info_);
|
||||
} else { // Broadcast
|
||||
BroadcastIterator base_iter(input_shape_a_, input_shape_b_, output_shape_);
|
||||
auto task = [&base_iter, output_addr, input_addr_a, input_addr_b](size_t start, size_t end) {
|
||||
auto iter = base_iter;
|
||||
iter.SetPos(start);
|
||||
for (size_t i = start; i < end; ++i) {
|
||||
output_addr[i] = input_addr_a[iter.GetInputPosA()] + input_addr_b[iter.GetInputPosB()];
|
||||
iter.GenNextPos();
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, output_size, this, ¶llel_search_info_);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// Lookup table mapping a kernel type signature to the matching typed
// LaunchKernel instantiation. MatchKernelAttr scans this table in order, so the
// entry order defines matching priority; GetOpSupport reports it verbatim.
std::vector<std::pair<KernelAttr, TensorAddCpuKernelMod::AddFunc>> TensorAddCpuKernelMod::func_list_ = {
  {KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
   &TensorAddCpuKernelMod::LaunchKernel<int64_t>},
  {KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
   &TensorAddCpuKernelMod::LaunchKernel<int32_t>},
  {KernelAttr().AddInputAttr(kNumberTypeInt16).AddInputAttr(kNumberTypeInt16).AddOutputAttr(kNumberTypeInt16),
   &TensorAddCpuKernelMod::LaunchKernel<int16_t>},
  {KernelAttr().AddInputAttr(kNumberTypeInt8).AddInputAttr(kNumberTypeInt8).AddOutputAttr(kNumberTypeInt8),
   &TensorAddCpuKernelMod::LaunchKernel<int8_t>},
  {KernelAttr().AddInputAttr(kNumberTypeUInt64).AddInputAttr(kNumberTypeUInt64).AddOutputAttr(kNumberTypeUInt64),
   &TensorAddCpuKernelMod::LaunchKernel<uint64_t>},
  {KernelAttr().AddInputAttr(kNumberTypeUInt32).AddInputAttr(kNumberTypeUInt32).AddOutputAttr(kNumberTypeUInt32),
   &TensorAddCpuKernelMod::LaunchKernel<uint32_t>},
  {KernelAttr().AddInputAttr(kNumberTypeUInt16).AddInputAttr(kNumberTypeUInt16).AddOutputAttr(kNumberTypeUInt16),
   &TensorAddCpuKernelMod::LaunchKernel<uint16_t>},
  {KernelAttr().AddInputAttr(kNumberTypeUInt8).AddInputAttr(kNumberTypeUInt8).AddOutputAttr(kNumberTypeUInt8),
   &TensorAddCpuKernelMod::LaunchKernel<uint8_t>},
  {KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
   &TensorAddCpuKernelMod::LaunchKernel<double>},
  {KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
   &TensorAddCpuKernelMod::LaunchKernel<float>},
  {KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
   &TensorAddCpuKernelMod::LaunchKernel<float16>},
  {KernelAttr().AddInputAttr(kNumberTypeBool).AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
   &TensorAddCpuKernelMod::LaunchKernel<bool>}};
|
||||
|
||||
std::vector<KernelAttr> TensorAddCpuKernelMod::GetOpSupport() {
|
||||
std::vector<KernelAttr> support_list;
|
||||
(void)std::transform(func_list_.begin(), func_list_.end(), std::back_inserter(support_list),
|
||||
[](const std::pair<KernelAttr, AddFunc> &pair) { return pair.first; });
|
||||
return support_list;
|
||||
}
|
||||

// Register this mod with the native-CPU kernel factory under the "Add" primitive.
MS_KERNEL_FACTORY_REG(NativeCpuKernelMod, Add, TensorAddCpuKernelMod);
}  // namespace kernel
}  // namespace mindspore
|
|
@ -1,57 +0,0 @@
|
|||
/**
|
||||
* Copyright 2020-2022 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TENSORADD_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TENSORADD_CPU_KERNEL_H_
|
||||
|
||||
#include <vector>
|
||||
#include <utility>
|
||||
#include "plugin/device/cpu/kernel/cpu_kernel.h"
|
||||
#include "plugin/factory/ms_factory.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
// CPU kernel for element-wise Add (with shape broadcast), implemented against
// the deprecated native-CPU kernel interface.
class TensorAddCpuKernelMod : public DeprecatedNativeCpuKernelMod {
 public:
  TensorAddCpuKernelMod() = default;
  ~TensorAddCpuKernelMod() override = default;

  // Caches operand/output shapes and resolves the typed launch function.
  void InitKernel(const CNodePtr &kernel_node) override;

  // Forwards to the launch function selected by InitKernel.
  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
              const std::vector<AddressPtr> &outputs) override {
    return kernel_func_(this, inputs, workspace, outputs);
  }

  // Lists the supported type signatures (derived from func_list_).
  std::vector<KernelAttr> GetOpSupport() override;

 private:
  // Typed implementation instantiated per supported element type T.
  template <typename T>
  bool LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &workspace,
                    const std::vector<kernel::AddressPtr> &outputs);
  using AddFunc = std::function<bool(TensorAddCpuKernelMod *, const std::vector<kernel::AddressPtr> &,
                                     const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &)>;
  // (type signature -> typed launch function) table; order is matching priority.
  static std::vector<std::pair<KernelAttr, AddFunc>> func_list_;
  // Launch function bound for this node's data types.
  AddFunc kernel_func_;

  // Device shapes of input a, input b, and the output, set by InitKernel.
  std::vector<int64_t> input_shape_a_;
  std::vector<int64_t> input_shape_b_;
  std::vector<int64_t> output_shape_;
};
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
||||
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_TENSORADD_CPU_KERNEL_H_
|
|
@ -71,7 +71,6 @@ int ActorMgr::Initialize(bool use_inner_pool, size_t actor_thread_num, size_t ma
|
|||
return MINDRT_ERROR;
|
||||
}
|
||||
inner_pool_->SetActorThreadNum(actor_thread_num);
|
||||
inner_pool_->DisableOccupiedActorThread();
|
||||
inner_pool_->SetKernelThreadNum(max_thread_num - actor_thread_num);
|
||||
}
|
||||
if (inner_pool_ != nullptr) {
|
||||
|
|
|
@ -191,10 +191,15 @@ void Worker::Active(std::vector<TaskSplit> *task_list, int task_id_start, int ta
|
|||
std::lock_guard<std::mutex> _l(mutex_);
|
||||
// add the first to task_, and others to queue.
|
||||
status_ = kThreadBusy;
|
||||
task_id_.store(task_id_start, std::memory_order_relaxed);
|
||||
THREAD_TEST_TRUE(task_ == nullptr);
|
||||
task_.store((*task_list)[0].task_, std::memory_order_release);
|
||||
for (int i = task_id_start + 1; i < task_id_end; ++i) {
|
||||
Task *task = task_.load(std::memory_order_consume);
|
||||
int to_atomic_task = 0;
|
||||
if (task == nullptr) {
|
||||
task_id_.store(task_id_start, std::memory_order_relaxed);
|
||||
THREAD_TEST_TRUE(task_ == nullptr);
|
||||
task_.store((*task_list)[0].task_, std::memory_order_release);
|
||||
to_atomic_task = 1;
|
||||
}
|
||||
for (int i = task_id_start + to_atomic_task; i < task_id_end; ++i) {
|
||||
while (!local_task_queue_->Enqueue(&(*task_list)[i])) {
|
||||
}
|
||||
}
|
||||
|
|
|
@ -169,7 +169,7 @@ class MS_CORE_API ThreadPool {
|
|||
void DisableOccupiedActorThread() { occupied_actor_thread_ = false; }
|
||||
void SetActorThreadNum(size_t actor_thread_num) { actor_thread_num_ = actor_thread_num; }
|
||||
void SetKernelThreadNum(size_t kernel_thread_num) { kernel_thread_num_ = kernel_thread_num; }
|
||||
size_t GetKernelThreadNum() const { return kernel_thread_num_; }
|
||||
size_t GetKernelThreadNum() const { return kernel_thread_num_ + actor_thread_num_; }
|
||||
size_t GetActorThreadNum() const { return actor_thread_num_; }
|
||||
void SetKernelThreadMaxSpinCount(int spin_count);
|
||||
void SetSpinCountMaxValue();
|
||||
|
|
|
@ -375,8 +375,8 @@ class _Context:
|
|||
|
||||
def set_runtime_num_threads(self, runtime_num_threads):
    """Validate and set runtime_num_threads (runtime actor / CPU kernel thread pool size).

    As rendered, this span contained both the pre- and post-commit checks
    (`<= 0` and `< 0`); only the post-commit `< 0` check is kept, matching the
    updated docs ("must be bigger than or equal to 0").

    Raises:
        ValueError: if runtime_num_threads is negative.
    """
    if runtime_num_threads < 0:
        raise ValueError("The num of thread must bigger than or equal to 0.")
    self.set_param(ms_ctx_param.runtime_num_threads, runtime_num_threads)
|
||||
|
||||
def set_op_timeout(self, op_timeout):
|
||||
|
@ -924,7 +924,7 @@ def set_context(**kwargs):
|
|||
inter_op_parallel_num(int): The thread number of op parallel at the same time. Default value is 0,
|
||||
which means use the default num.
|
||||
runtime_num_threads(int): The thread pool number of cpu kernel used in runtime,
|
||||
which must bigger than 0. Default value is 30, if you run many processes at
|
||||
which must bigger than or equal to 0. Default value is 30, if you run many processes at
|
||||
the same time, you should set the value smaller to avoid thread contention.
|
||||
disable_format_transform (bool): Whether to disable the automatic format transform function from NCHW to NHWC.
|
||||
When the network training performance of fp16 is worse than fp32,
|
||||
|
|
Loading…
Reference in New Issue