forked from mindspore-Ecosystem/mindspore
!25762 Unified CPU thread pool
Merge pull request !25762 from 范吉斌/unified_thread
This commit is contained in:
commit
b6bf3cd641
|
@ -60,7 +60,7 @@ void AdamCPUKernel::LaunchAdam(const std::vector<kernel::AddressPtr> &inputs, co
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, lens);
|
ParallelLaunchAutoSearch(task, lens, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
void AdamCPUKernel::LaunchAdamNnacl(const std::vector<kernel::AddressPtr> &inputs,
|
void AdamCPUKernel::LaunchAdamNnacl(const std::vector<kernel::AddressPtr> &inputs,
|
||||||
|
|
|
@ -54,7 +54,7 @@ void AdamDeltaCPUKernel::LaunchAdamDelta(T *delta, T *m, T *v, float lr, float b
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
void AdamDeltaCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
void AdamDeltaCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||||
|
|
|
@ -61,7 +61,7 @@ void AdamWeightDecayCPUKernel::LaunchAdamWeightDecay(const std::vector<AddressPt
|
||||||
var[i] -= lr * update;
|
var[i] -= lr * update;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, lens);
|
ParallelLaunchAutoSearch(task, lens, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
void AdamWeightDecayCPUKernel::LaunchAdamWeightDecayNnacl(const std::vector<AddressPtr> &inputs,
|
void AdamWeightDecayCPUKernel::LaunchAdamWeightDecayNnacl(const std::vector<AddressPtr> &inputs,
|
||||||
|
|
|
@ -62,18 +62,18 @@ void ElementRealDiv(const T *input1, const T *input2, T *out, size_t size, size_
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ArithmeticCPUKernel<T>::AssignAdd(T *input1, const T *input2, T *out) const {
|
void ArithmeticCPUKernel<T>::AssignAdd(T *input1, const T *input2, T *out) {
|
||||||
auto task = [&input1, &input2, &out](size_t start, size_t end) {
|
auto task = [&input1, &input2, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = input1[i] + input2[i];
|
out[i] = input1[i] + input2[i];
|
||||||
input1[i] = out[i];
|
input1[i] = out[i];
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ArithmeticCPUKernel<T>::Add(const T *input1, const T *input2, T *out) const {
|
void ArithmeticCPUKernel<T>::Add(const T *input1, const T *input2, T *out) {
|
||||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||||
auto iter = base_iter;
|
auto iter = base_iter;
|
||||||
|
@ -83,7 +83,7 @@ void ArithmeticCPUKernel<T>::Add(const T *input1, const T *input2, T *out) const
|
||||||
iter.GenNextPos();
|
iter.GenNextPos();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -118,7 +118,7 @@ void ArithmeticCPUKernel<T>::Sub(const T *input1, const T *input2, T *out) {
|
||||||
iter.GenNextPos();
|
iter.GenNextPos();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_, kMaxSubSerialSize);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -152,7 +152,7 @@ void ArithmeticCPUKernel<T>::Mul(const T *input1, const T *input2, T *out) {
|
||||||
iter.GenNextPos();
|
iter.GenNextPos();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -203,11 +203,11 @@ void ArithmeticCPUKernel<T>::RealDiv(const T *input1, const T *input2, T *out) {
|
||||||
out[i] = dividend / divisor;
|
out[i] = dividend / divisor;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ArithmeticCPUKernel<T>::Div(const T *input1, const T *input2, T *out) const {
|
void ArithmeticCPUKernel<T>::Div(const T *input1, const T *input2, T *out) {
|
||||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||||
auto iter = base_iter;
|
auto iter = base_iter;
|
||||||
|
@ -232,11 +232,11 @@ void ArithmeticCPUKernel<T>::Div(const T *input1, const T *input2, T *out) const
|
||||||
out[i] = dividend / divisor;
|
out[i] = dividend / divisor;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ArithmeticCPUKernel<T>::FloorDiv(const T *input1, const T *input2, T *out) const {
|
void ArithmeticCPUKernel<T>::FloorDiv(const T *input1, const T *input2, T *out) {
|
||||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||||
auto iter = base_iter;
|
auto iter = base_iter;
|
||||||
|
@ -261,11 +261,11 @@ void ArithmeticCPUKernel<T>::FloorDiv(const T *input1, const T *input2, T *out)
|
||||||
out[i] = static_cast<T>(floor(static_cast<double>(dividend) / static_cast<double>(divisor)));
|
out[i] = static_cast<T>(floor(static_cast<double>(dividend) / static_cast<double>(divisor)));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ArithmeticCPUKernel<T>::Mod(const T *input1, const T *input2, T *out) const {
|
void ArithmeticCPUKernel<T>::Mod(const T *input1, const T *input2, T *out) {
|
||||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||||
auto iter = base_iter;
|
auto iter = base_iter;
|
||||||
|
@ -283,11 +283,11 @@ void ArithmeticCPUKernel<T>::Mod(const T *input1, const T *input2, T *out) const
|
||||||
out[i] = static_cast<T>(x - data_div_res * y);
|
out[i] = static_cast<T>(x - data_div_res * y);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ArithmeticCPUKernel<T>::FloorMod(const T *input1, const T *input2, T *out) const {
|
void ArithmeticCPUKernel<T>::FloorMod(const T *input1, const T *input2, T *out) {
|
||||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||||
auto iter = base_iter;
|
auto iter = base_iter;
|
||||||
|
@ -300,11 +300,11 @@ void ArithmeticCPUKernel<T>::FloorMod(const T *input1, const T *input2, T *out)
|
||||||
out[i] = static_cast<T>((std::abs(res) > 1e-9) && ((res < 0.0) != (y < 0.0)) ? res + y : res);
|
out[i] = static_cast<T>((std::abs(res) > 1e-9) && ((res < 0.0) != (y < 0.0)) ? res + y : res);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) const {
|
void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) {
|
||||||
if constexpr (std::is_same_v<T, float>) {
|
if constexpr (std::is_same_v<T, float>) {
|
||||||
auto is_power_single = [this]() {
|
auto is_power_single = [this]() {
|
||||||
bool is_power_single = false;
|
bool is_power_single = false;
|
||||||
|
@ -324,14 +324,14 @@ void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) const
|
||||||
auto task = [&](size_t start, size_t end) {
|
auto task = [&](size_t start, size_t end) {
|
||||||
(void)Power(input1 + start, input2, out + start, end - start, 1, 0, true);
|
(void)Power(input1 + start, input2, out + start, end - start, 1, 0, true);
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (is_power_single()) {
|
if (is_power_single()) {
|
||||||
auto task = [&](size_t start, size_t end) {
|
auto task = [&](size_t start, size_t end) {
|
||||||
(void)Power(input1 + start, input2 + start, out + start, end - start, 1, 0, false);
|
(void)Power(input1 + start, input2 + start, out + start, end - start, 1, 0, false);
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -348,7 +348,7 @@ void ArithmeticCPUKernel<T>::Pow(const T *input1, const T *input2, T *out) const
|
||||||
iter.GenNextPos();
|
iter.GenNextPos();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
} else {
|
} else {
|
||||||
base_iter.SetPos(0);
|
base_iter.SetPos(0);
|
||||||
for (size_t i = 0; i < output_size_; i++) {
|
for (size_t i = 0; i < output_size_; i++) {
|
||||||
|
@ -376,7 +376,7 @@ void ArithmeticCPUKernel<T>::SquaredDifference(const T *input1, const T *input2,
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ArithmeticCPUKernel<T>::Atan2(const T *input1, const T *input2, T *out) const {
|
void ArithmeticCPUKernel<T>::Atan2(const T *input1, const T *input2, T *out) {
|
||||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||||
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
auto task = [&input1, &input2, &out, &base_iter](size_t start, size_t end) {
|
||||||
auto iter = base_iter;
|
auto iter = base_iter;
|
||||||
|
@ -387,7 +387,7 @@ void ArithmeticCPUKernel<T>::Atan2(const T *input1, const T *input2, T *out) con
|
||||||
iter.GenNextPos();
|
iter.GenNextPos();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
|
|
@ -39,16 +39,16 @@ class ArithmeticCPUKernel : public CPUKernel {
|
||||||
private:
|
private:
|
||||||
void InitComputeFunc();
|
void InitComputeFunc();
|
||||||
void Sub(const T *input1, const T *input2, T *out);
|
void Sub(const T *input1, const T *input2, T *out);
|
||||||
void Add(const T *input1, const T *input2, T *out) const;
|
void Add(const T *input1, const T *input2, T *out);
|
||||||
void Mul(const T *input1, const T *input2, T *out);
|
void Mul(const T *input1, const T *input2, T *out);
|
||||||
void RealDiv(const T *input1, const T *input2, T *out);
|
void RealDiv(const T *input1, const T *input2, T *out);
|
||||||
void Div(const T *input1, const T *input2, T *out) const;
|
void Div(const T *input1, const T *input2, T *out);
|
||||||
void FloorDiv(const T *input1, const T *input2, T *out) const;
|
void FloorDiv(const T *input1, const T *input2, T *out);
|
||||||
void Mod(const T *input1, const T *input2, T *out) const;
|
void Mod(const T *input1, const T *input2, T *out);
|
||||||
void FloorMod(const T *input1, const T *input2, T *out) const;
|
void FloorMod(const T *input1, const T *input2, T *out);
|
||||||
void Pow(const T *input1, const T *input2, T *out) const;
|
void Pow(const T *input1, const T *input2, T *out);
|
||||||
void AssignAdd(T *input1, const T *input2, T *out) const;
|
void AssignAdd(T *input1, const T *input2, T *out);
|
||||||
void Atan2(const T *input1, const T *input2, T *out) const;
|
void Atan2(const T *input1, const T *input2, T *out);
|
||||||
void SquaredDifference(const T *input1, const T *input2, T *out);
|
void SquaredDifference(const T *input1, const T *input2, T *out);
|
||||||
|
|
||||||
using TypeComputeFunc = std::function<void(ArithmeticCPUKernel *, const T *in_x, const T *in_y, T *out)>;
|
using TypeComputeFunc = std::function<void(ArithmeticCPUKernel *, const T *in_x, const T *in_y, T *out)>;
|
||||||
|
|
|
@ -32,7 +32,7 @@ constexpr size_t kOutputsNum = 1;
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *out) const {
|
void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *out) {
|
||||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||||
if (output_size_ > kMaxLessSerialSize) {
|
if (output_size_ > kMaxLessSerialSize) {
|
||||||
auto task = [&](size_t start, size_t end) {
|
auto task = [&](size_t start, size_t end) {
|
||||||
|
@ -45,7 +45,7 @@ void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *o
|
||||||
iter.GenNextPos();
|
iter.GenNextPos();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
} else {
|
} else {
|
||||||
base_iter.SetPos(0);
|
base_iter.SetPos(0);
|
||||||
for (size_t i = 0; i < output_size_; i++) {
|
for (size_t i = 0; i < output_size_; i++) {
|
||||||
|
@ -58,7 +58,7 @@ void ArithmeticLogicCPUKernel<T>::Less(const T *input1, const T *input2, bool *o
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ArithmeticLogicCPUKernel<T>::Equal(const T *input1, const T *input2, bool *out) const {
|
void ArithmeticLogicCPUKernel<T>::Equal(const T *input1, const T *input2, bool *out) {
|
||||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||||
auto task = [&](size_t start, size_t end) {
|
auto task = [&](size_t start, size_t end) {
|
||||||
auto iter = base_iter;
|
auto iter = base_iter;
|
||||||
|
@ -70,11 +70,11 @@ void ArithmeticLogicCPUKernel<T>::Equal(const T *input1, const T *input2, bool *
|
||||||
iter.GenNextPos();
|
iter.GenNextPos();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ArithmeticLogicCPUKernel<T>::NotEqual(const T *input1, const T *input2, bool *out) const {
|
void ArithmeticLogicCPUKernel<T>::NotEqual(const T *input1, const T *input2, bool *out) {
|
||||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||||
auto task = [&](size_t start, size_t end) {
|
auto task = [&](size_t start, size_t end) {
|
||||||
auto iter = base_iter;
|
auto iter = base_iter;
|
||||||
|
@ -86,11 +86,11 @@ void ArithmeticLogicCPUKernel<T>::NotEqual(const T *input1, const T *input2, boo
|
||||||
iter.GenNextPos();
|
iter.GenNextPos();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ArithmeticLogicCPUKernel<T>::LogicalAnd(const T *input1, const T *input2, bool *out) const {
|
void ArithmeticLogicCPUKernel<T>::LogicalAnd(const T *input1, const T *input2, bool *out) {
|
||||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||||
auto task = [&](size_t start, size_t end) {
|
auto task = [&](size_t start, size_t end) {
|
||||||
auto iter = base_iter;
|
auto iter = base_iter;
|
||||||
|
@ -100,11 +100,11 @@ void ArithmeticLogicCPUKernel<T>::LogicalAnd(const T *input1, const T *input2, b
|
||||||
iter.GenNextPos();
|
iter.GenNextPos();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ArithmeticLogicCPUKernel<T>::LogicalOr(const T *input1, const T *input2, bool *out) const {
|
void ArithmeticLogicCPUKernel<T>::LogicalOr(const T *input1, const T *input2, bool *out) {
|
||||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||||
auto task = [&](size_t start, size_t end) {
|
auto task = [&](size_t start, size_t end) {
|
||||||
auto iter = base_iter;
|
auto iter = base_iter;
|
||||||
|
@ -114,11 +114,11 @@ void ArithmeticLogicCPUKernel<T>::LogicalOr(const T *input1, const T *input2, bo
|
||||||
iter.GenNextPos();
|
iter.GenNextPos();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ArithmeticLogicCPUKernel<T>::Greater(const T *input1, const T *input2, bool *out) const {
|
void ArithmeticLogicCPUKernel<T>::Greater(const T *input1, const T *input2, bool *out) {
|
||||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||||
auto task = [&](size_t start, size_t end) {
|
auto task = [&](size_t start, size_t end) {
|
||||||
auto iter = base_iter;
|
auto iter = base_iter;
|
||||||
|
@ -130,11 +130,11 @@ void ArithmeticLogicCPUKernel<T>::Greater(const T *input1, const T *input2, bool
|
||||||
iter.GenNextPos();
|
iter.GenNextPos();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ArithmeticLogicCPUKernel<T>::GreaterEqual(const T *input1, const T *input2, bool *out) const {
|
void ArithmeticLogicCPUKernel<T>::GreaterEqual(const T *input1, const T *input2, bool *out) {
|
||||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||||
auto task = [&](size_t start, size_t end) {
|
auto task = [&](size_t start, size_t end) {
|
||||||
auto iter = base_iter;
|
auto iter = base_iter;
|
||||||
|
@ -146,11 +146,11 @@ void ArithmeticLogicCPUKernel<T>::GreaterEqual(const T *input1, const T *input2,
|
||||||
iter.GenNextPos();
|
iter.GenNextPos();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ArithmeticLogicCPUKernel<T>::LessEqual(const T *input1, const T *input2, bool *out) const {
|
void ArithmeticLogicCPUKernel<T>::LessEqual(const T *input1, const T *input2, bool *out) {
|
||||||
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
BroadcastIterator base_iter(input_shape1_, input_shape2_, output_shape_);
|
||||||
auto task = [&](size_t start, size_t end) {
|
auto task = [&](size_t start, size_t end) {
|
||||||
auto iter = base_iter;
|
auto iter = base_iter;
|
||||||
|
@ -162,7 +162,7 @@ void ArithmeticLogicCPUKernel<T>::LessEqual(const T *input1, const T *input2, bo
|
||||||
iter.GenNextPos();
|
iter.GenNextPos();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size_);
|
ParallelLaunchAutoSearch(task, output_size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
|
|
@ -39,14 +39,14 @@ class ArithmeticLogicCPUKernel : public CPUKernel {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
void InitComputeFunc();
|
void InitComputeFunc();
|
||||||
void Less(const T *input1, const T *input2, bool *out) const;
|
void Less(const T *input1, const T *input2, bool *out);
|
||||||
void Equal(const T *input1, const T *input2, bool *out) const;
|
void Equal(const T *input1, const T *input2, bool *out);
|
||||||
void NotEqual(const T *input1, const T *input2, bool *out) const;
|
void NotEqual(const T *input1, const T *input2, bool *out);
|
||||||
void Greater(const T *input1, const T *input2, bool *out) const;
|
void Greater(const T *input1, const T *input2, bool *out);
|
||||||
void GreaterEqual(const T *input1, const T *input2, bool *out) const;
|
void GreaterEqual(const T *input1, const T *input2, bool *out);
|
||||||
void LessEqual(const T *input1, const T *input2, bool *out) const;
|
void LessEqual(const T *input1, const T *input2, bool *out);
|
||||||
void LogicalAnd(const T *input1, const T *input2, bool *out) const;
|
void LogicalAnd(const T *input1, const T *input2, bool *out);
|
||||||
void LogicalOr(const T *input1, const T *input2, bool *out) const;
|
void LogicalOr(const T *input1, const T *input2, bool *out);
|
||||||
|
|
||||||
using TypeComputeFunc = std::function<void(ArithmeticLogicCPUKernel *, const T *, const T *, bool *)>;
|
using TypeComputeFunc = std::function<void(ArithmeticLogicCPUKernel *, const T *, const T *, bool *)>;
|
||||||
TypeComputeFunc compute_func_{nullptr};
|
TypeComputeFunc compute_func_{nullptr};
|
||||||
|
|
|
@ -33,17 +33,17 @@ constexpr size_t kInputsNum = 1;
|
||||||
constexpr size_t kOutputsNum = 1;
|
constexpr size_t kOutputsNum = 1;
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Square(const T *in, T *out, size_t size) {
|
void Square(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = in[i] * in[i];
|
out[i] = in[i] * in[i];
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
ParallelLaunch(task, size, kMaxSquareSerialSize);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Sign(const T *in, T *out, size_t size) {
|
void Sign(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
if (in[i] < 0) {
|
if (in[i] < 0) {
|
||||||
|
@ -55,11 +55,11 @@ void Sign(const T *in, T *out, size_t size) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Neg(const T *in, T *out, size_t size) {
|
void Neg(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = -in[i];
|
out[i] = -in[i];
|
||||||
|
@ -68,77 +68,77 @@ void Neg(const T *in, T *out, size_t size) {
|
||||||
ParallelLaunch(task, size, kMaxNegSerialSize);
|
ParallelLaunch(task, size, kMaxNegSerialSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
void LogicalNot(const bool *in, bool *out, size_t size) {
|
void LogicalNot(ArithmeticSelfCPUKernel *content, const bool *in, bool *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = !in[i];
|
out[i] = !in[i];
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void OnesLike(const T *, T *out, size_t size) {
|
void OnesLike(ArithmeticSelfCPUKernel *content, const T *, T *out, size_t size) {
|
||||||
auto task = [&out](size_t start, size_t end) {
|
auto task = [&out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(1);
|
out[i] = static_cast<T>(1);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ZerosLike(const T *, T *out, size_t size) {
|
void ZerosLike(ArithmeticSelfCPUKernel *content, const T *, T *out, size_t size) {
|
||||||
auto task = [&out](size_t start, size_t end) {
|
auto task = [&out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(0);
|
out[i] = static_cast<T>(0);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Floor(const T *in, T *out, size_t size) {
|
void Floor(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(floor(in[i]));
|
out[i] = static_cast<T>(floor(in[i]));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Rint(const T *in, T *out, size_t size) {
|
void Rint(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&](size_t start, size_t end) {
|
auto task = [&](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(rint(in[i]));
|
out[i] = static_cast<T>(rint(in[i]));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Round(const T *in, T *out, size_t size) {
|
void Round(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&](size_t start, size_t end) {
|
auto task = [&](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(nearbyint(in[i]));
|
out[i] = static_cast<T>(nearbyint(in[i]));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Reciprocal(const T *in, T *out, size_t size) {
|
void Reciprocal(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(1.0 / in[i]);
|
out[i] = static_cast<T>(1.0 / in[i]);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Gelu(const T *in, T *out, size_t size) {
|
void Gelu(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
auto factor_a = static_cast<T>(0.7978845608);
|
auto factor_a = static_cast<T>(0.7978845608);
|
||||||
auto factor_b = static_cast<T>(0.044715);
|
auto factor_b = static_cast<T>(0.044715);
|
||||||
|
@ -149,137 +149,137 @@ void Gelu(const T *in, T *out, size_t size) {
|
||||||
out[i] = x * (static_cast<T>(1.0) + tanh_res) / static_cast<T>(2.0);
|
out[i] = x * (static_cast<T>(1.0) + tanh_res) / static_cast<T>(2.0);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Asin(const T *in, T *out, size_t size) {
|
void Asin(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(asin(static_cast<double>(in[i])));
|
out[i] = static_cast<T>(asin(static_cast<double>(in[i])));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ACos(const T *in, T *out, size_t size) {
|
void ACos(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(acos(static_cast<double>(in[i])));
|
out[i] = static_cast<T>(acos(static_cast<double>(in[i])));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Atan(const T *in, T *out, size_t size) {
|
void Atan(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(atan(static_cast<double>(in[i])));
|
out[i] = static_cast<T>(atan(static_cast<double>(in[i])));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Sin(const T *in, T *out, size_t size) {
|
void Sin(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(sin(static_cast<double>(in[i])));
|
out[i] = static_cast<T>(sin(static_cast<double>(in[i])));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Cos(const T *in, T *out, size_t size) {
|
void Cos(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(cos(static_cast<double>(in[i])));
|
out[i] = static_cast<T>(cos(static_cast<double>(in[i])));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Tan(const T *in, T *out, size_t size) {
|
void Tan(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(tan(static_cast<double>(in[i])));
|
out[i] = static_cast<T>(tan(static_cast<double>(in[i])));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Sinh(const T *in, T *out, size_t size) {
|
void Sinh(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(sinh(static_cast<double>(in[i])));
|
out[i] = static_cast<T>(sinh(static_cast<double>(in[i])));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Cosh(const T *in, T *out, size_t size) {
|
void Cosh(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(cosh(static_cast<double>(in[i])));
|
out[i] = static_cast<T>(cosh(static_cast<double>(in[i])));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Asinh(const T *in, T *out, size_t size) {
|
void Asinh(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(asinh(static_cast<double>(in[i])));
|
out[i] = static_cast<T>(asinh(static_cast<double>(in[i])));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Acosh(const T *in, T *out, size_t size) {
|
void Acosh(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(acosh(static_cast<double>(in[i])));
|
out[i] = static_cast<T>(acosh(static_cast<double>(in[i])));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Atanh(const T *in, T *out, size_t size) {
|
void Atanh(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(atanh(static_cast<double>(in[i])));
|
out[i] = static_cast<T>(atanh(static_cast<double>(in[i])));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Abs(const T *in, T *out, size_t size) {
|
void Abs(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = abs(in[i]);
|
out[i] = abs(in[i]);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void Sqrt(const T *in, T *out, size_t size) {
|
void Sqrt(ArithmeticSelfCPUKernel *content, const T *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = sqrt(in[i]);
|
out[i] = sqrt(in[i]);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -316,49 +316,49 @@ bool ArithmeticSelfCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inpu
|
||||||
}
|
}
|
||||||
|
|
||||||
void ArithmeticSelfCPUKernel::LaunchLogicalNot(const std::vector<AddressPtr> &inputs,
|
void ArithmeticSelfCPUKernel::LaunchLogicalNot(const std::vector<AddressPtr> &inputs,
|
||||||
const std::vector<AddressPtr> &outputs) const {
|
const std::vector<AddressPtr> &outputs) {
|
||||||
auto *input = reinterpret_cast<bool *>(inputs[0]->addr);
|
auto *input = reinterpret_cast<bool *>(inputs[0]->addr);
|
||||||
auto *output = reinterpret_cast<bool *>(outputs[0]->addr);
|
auto *output = reinterpret_cast<bool *>(outputs[0]->addr);
|
||||||
size_t lens = outputs[0]->size / sizeof(bool);
|
size_t lens = outputs[0]->size / sizeof(bool);
|
||||||
LogicalNot(input, output, lens);
|
LogicalNot(this, input, output, lens);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void ArithmeticSelfCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
|
void ArithmeticSelfCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
|
||||||
const std::vector<AddressPtr> &outputs) const {
|
const std::vector<AddressPtr> &outputs) {
|
||||||
const auto *input = reinterpret_cast<T *>(inputs[0]->addr);
|
const auto *input = reinterpret_cast<T *>(inputs[0]->addr);
|
||||||
auto *output = reinterpret_cast<T *>(outputs[0]->addr);
|
auto *output = reinterpret_cast<T *>(outputs[0]->addr);
|
||||||
const size_t lens = outputs[0]->size / sizeof(T);
|
const size_t lens = outputs[0]->size / sizeof(T);
|
||||||
static const std::unordered_map<std::string, std::function<void(const T *, T *, size_t)>> arithmeticSelfFuncMap{
|
static const std::unordered_map<std::string, std::function<void(ArithmeticSelfCPUKernel *, const T *, T *, size_t)>>
|
||||||
{prim::kPrimSquare->name(), Square<T>},
|
arithmeticSelfFuncMap{{prim::kPrimSquare->name(), Square<T>},
|
||||||
{prim::kPrimSign->name(), Sign<T>},
|
{prim::kPrimSign->name(), Sign<T>},
|
||||||
{prim::kPrimNeg->name(), Neg<T>},
|
{prim::kPrimNeg->name(), Neg<T>},
|
||||||
{prim::kPrimAtanh->name(), Atanh<T>},
|
{prim::kPrimAtanh->name(), Atanh<T>},
|
||||||
{prim::kPrimAcosh->name(), Acosh<T>},
|
{prim::kPrimAcosh->name(), Acosh<T>},
|
||||||
{prim::kPrimFloor->name(), Floor<T>},
|
{prim::kPrimFloor->name(), Floor<T>},
|
||||||
{prim::kPrimSin->name(), Sin<T>},
|
{prim::kPrimSin->name(), Sin<T>},
|
||||||
{prim::kPrimGeLU->name(), Gelu<T>},
|
{prim::kPrimGeLU->name(), Gelu<T>},
|
||||||
{prim::kPrimCos->name(), Cos<T>},
|
{prim::kPrimCos->name(), Cos<T>},
|
||||||
{prim::kPrimTan->name(), Tan<T>},
|
{prim::kPrimTan->name(), Tan<T>},
|
||||||
{prim::kPrimAsin->name(), Asin<T>},
|
{prim::kPrimAsin->name(), Asin<T>},
|
||||||
{prim::kPrimACos->name(), ACos<T>},
|
{prim::kPrimACos->name(), ACos<T>},
|
||||||
{prim::kPrimAtan->name(), Atan<T>},
|
{prim::kPrimAtan->name(), Atan<T>},
|
||||||
{prim::kPrimSinh->name(), Sinh<T>},
|
{prim::kPrimSinh->name(), Sinh<T>},
|
||||||
{prim::kPrimCosh->name(), Cosh<T>},
|
{prim::kPrimCosh->name(), Cosh<T>},
|
||||||
{prim::kPrimAsinh->name(), Asinh<T>},
|
{prim::kPrimAsinh->name(), Asinh<T>},
|
||||||
{prim::kPrimZerosLike->name(), ZerosLike<T>},
|
{prim::kPrimZerosLike->name(), ZerosLike<T>},
|
||||||
{prim::kPrimOnesLike->name(), OnesLike<T>},
|
{prim::kPrimOnesLike->name(), OnesLike<T>},
|
||||||
{prim::kPrimReciprocal->name(), Reciprocal<T>},
|
{prim::kPrimReciprocal->name(), Reciprocal<T>},
|
||||||
{prim::kPrimRint->name(), Rint<T>},
|
{prim::kPrimRint->name(), Rint<T>},
|
||||||
{prim::kPrimRound->name(), Round<T>},
|
{prim::kPrimRound->name(), Round<T>},
|
||||||
{prim::kPrimAbs->name(), Abs<T>},
|
{prim::kPrimAbs->name(), Abs<T>},
|
||||||
{prim::kPrimSqrt->name(), Sqrt<T>}};
|
{prim::kPrimSqrt->name(), Sqrt<T>}};
|
||||||
|
|
||||||
const auto func_pair = arithmeticSelfFuncMap.find(kernel_name_);
|
const auto func_pair = arithmeticSelfFuncMap.find(kernel_name_);
|
||||||
if (arithmeticSelfFuncMap.find(kernel_name_) == arithmeticSelfFuncMap.end()) {
|
if (arithmeticSelfFuncMap.find(kernel_name_) == arithmeticSelfFuncMap.end()) {
|
||||||
MS_LOG(EXCEPTION) << "ArithmeticSelfCPUKernel does not support " << kernel_name_;
|
MS_LOG(EXCEPTION) << "ArithmeticSelfCPUKernel does not support " << kernel_name_;
|
||||||
}
|
}
|
||||||
func_pair->second(input, output, lens);
|
func_pair->second(this, input, output, lens);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
|
|
@ -37,9 +37,9 @@ class ArithmeticSelfCPUKernel : public CPUKernel {
|
||||||
|
|
||||||
private:
|
private:
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const;
|
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||||
|
|
||||||
void LaunchLogicalNot(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const;
|
void LaunchLogicalNot(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||||
|
|
||||||
TypeId dtype_{kTypeUnknown};
|
TypeId dtype_{kTypeUnknown};
|
||||||
};
|
};
|
||||||
|
|
|
@ -141,8 +141,7 @@ bool BoundingBoxDecodeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr>
|
||||||
bboxes[right_y] = y2;
|
bboxes[right_y] = y2;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, elem_num);
|
ParallelLaunchAutoSearch(task, elem_num, this, ¶llel_search_info_);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
|
|
|
@ -113,7 +113,7 @@ bool BoundingBoxEncodeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr>
|
||||||
deltas[right_y] = (dh - static_cast<T>(means_[H_INDEX])) / static_cast<T>(stds_[H_INDEX]);
|
deltas[right_y] = (dh - static_cast<T>(means_[H_INDEX])) / static_cast<T>(stds_[H_INDEX]);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, elem_num);
|
ParallelLaunchAutoSearch(task, elem_num, this, ¶llel_search_info_);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -30,13 +30,13 @@ constexpr size_t kCastOutputsNum = 1;
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
template <typename S, typename T>
|
template <typename S, typename T>
|
||||||
void Cast(const S *in, T *out, size_t size) {
|
void Cast(CastCPUKernel<S, T> *content, const S *in, T *out, size_t size) {
|
||||||
auto task = [&in, &out](size_t start, size_t end) {
|
auto task = [&in, &out](size_t start, size_t end) {
|
||||||
for (size_t i = start; i < end; i++) {
|
for (size_t i = start; i < end; i++) {
|
||||||
out[i] = static_cast<T>(in[i]);
|
out[i] = static_cast<T>(in[i]);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename S, typename T>
|
template <typename S, typename T>
|
||||||
|
@ -59,7 +59,7 @@ bool CastCPUKernel<S, T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
||||||
const auto *input = reinterpret_cast<S *>(inputs[0]->addr);
|
const auto *input = reinterpret_cast<S *>(inputs[0]->addr);
|
||||||
auto *output = reinterpret_cast<T *>(outputs[0]->addr);
|
auto *output = reinterpret_cast<T *>(outputs[0]->addr);
|
||||||
MS_LOG(DEBUG) << "Type source: " << typeid(S).name() << "; target: " << typeid(T).name();
|
MS_LOG(DEBUG) << "Type source: " << typeid(S).name() << "; target: " << typeid(T).name();
|
||||||
Cast<S, T>(input, output, outputs[0]->size / sizeof(T));
|
Cast<S, T>(this, input, output, outputs[0]->size / sizeof(T));
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
|
|
|
@ -64,8 +64,7 @@ bool CheckValidCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &input
|
||||||
output[i] = !valid_flag;
|
output[i] = !valid_flag;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, elem_num);
|
ParallelLaunchAutoSearch(task, elem_num, this, ¶llel_search_info_);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -141,12 +141,13 @@ class CPUKernel : public kernel::KernelMod {
|
||||||
void InitDynamicKernel(const CNodePtr &cnode_ptr) { dynamic_kernel_ = std::make_shared<CpuDynamicKernel>(cnode_ptr); }
|
void InitDynamicKernel(const CNodePtr &cnode_ptr) { dynamic_kernel_ = std::make_shared<CpuDynamicKernel>(cnode_ptr); }
|
||||||
device::DynamicKernelPtr DynamicKernel() const { return dynamic_kernel_; }
|
device::DynamicKernelPtr DynamicKernel() const { return dynamic_kernel_; }
|
||||||
|
|
||||||
|
ParallelSearchInfo parallel_search_info_;
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual void InitInputOutputSize(const CNodePtr &kernel_node);
|
virtual void InitInputOutputSize(const CNodePtr &kernel_node);
|
||||||
std::vector<size_t> input_size_list_;
|
std::vector<size_t> input_size_list_;
|
||||||
std::vector<size_t> output_size_list_;
|
std::vector<size_t> output_size_list_;
|
||||||
std::vector<size_t> workspace_size_list_;
|
std::vector<size_t> workspace_size_list_;
|
||||||
ParallelSearchInfo parallel_search_info_;
|
|
||||||
CNodeWeakPtr cnode_ptr_;
|
CNodeWeakPtr cnode_ptr_;
|
||||||
device::DynamicKernelPtr dynamic_kernel_;
|
device::DynamicKernelPtr dynamic_kernel_;
|
||||||
|
|
||||||
|
|
|
@ -214,7 +214,7 @@ bool CropAndResizeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &in
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, IntToSize(output_size_));
|
ParallelLaunchAutoSearch(task, IntToSize(output_size_), this, ¶llel_search_info_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
|
|
|
@ -225,35 +225,16 @@ void CumSumCPUKernel::LaunchCumSum(const T *input, T *output, T *workspace, size
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void CumSumCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
|
void CumSumCPUKernel::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
|
||||||
const std::vector<kernel::AddressPtr> &workspace,
|
const std::vector<kernel::AddressPtr> &workspace,
|
||||||
const std::vector<kernel::AddressPtr> &outputs) const {
|
const std::vector<kernel::AddressPtr> &outputs) {
|
||||||
const auto *input = reinterpret_cast<T *>(inputs[0]->addr);
|
const auto *input = reinterpret_cast<T *>(inputs[0]->addr);
|
||||||
auto *ws = reinterpret_cast<T *>(workspace[0]->addr);
|
auto *ws = reinterpret_cast<T *>(workspace[0]->addr);
|
||||||
auto output = reinterpret_cast<T *>(outputs[0]->addr);
|
auto output = reinterpret_cast<T *>(outputs[0]->addr);
|
||||||
// multithreading
|
// multithreading
|
||||||
size_t lens = inputs[0]->size > 0 ? static_cast<size_t>(inputs[0]->size / sizeof(T)) : 1;
|
size_t lens = inputs[0]->size > 0 ? static_cast<size_t>(inputs[0]->size / sizeof(T)) : 1;
|
||||||
auto max_thread_num = std::thread::hardware_concurrency();
|
auto task = [this, &input, &output, &ws](size_t start, size_t end) {
|
||||||
size_t thread_num = lens < 128 * max_thread_num ? std::ceil(lens / 128.0) : max_thread_num;
|
LaunchCumSum<T>(input, output, ws, start, end);
|
||||||
MS_LOG(INFO) << "Lens=" << lens << "; use thread_num=" << thread_num << "; max_thread_num: " << max_thread_num;
|
};
|
||||||
std::vector<std::thread> threads;
|
ParallelLaunchAutoSearch(task, lens, this, ¶llel_search_info_);
|
||||||
threads.reserve(thread_num);
|
|
||||||
size_t start = 0;
|
|
||||||
if (thread_num < 1) {
|
|
||||||
MS_LOG(ERROR) << "Invalid value: thread_num " << thread_num;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
size_t once_compute_size = (lens + thread_num - 1) / thread_num;
|
|
||||||
if (once_compute_size < 1) {
|
|
||||||
MS_LOG(ERROR) << "Invalid value: once_compute_size " << once_compute_size;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
while (start < lens) {
|
|
||||||
size_t end = (start + once_compute_size) > lens ? lens : (start + once_compute_size);
|
|
||||||
(void)threads.emplace_back(std::thread(&CumSumCPUKernel::LaunchCumSum<T>, this, input, output, ws, start, end));
|
|
||||||
start += once_compute_size;
|
|
||||||
}
|
|
||||||
for (size_t i = 0; i < threads.size(); ++i) {
|
|
||||||
threads[i].join();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
|
@ -65,7 +65,7 @@ class CumSumCPUKernel : public CPUKernel {
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||||
const std::vector<AddressPtr> &outputs) const;
|
const std::vector<AddressPtr> &outputs);
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void LaunchCumSum(const T *input_addr, T *output_addr, T *ws_addr, size_t start, size_t end) const;
|
void LaunchCumSum(const T *input_addr, T *output_addr, T *ws_addr, size_t start, size_t end) const;
|
||||||
|
|
|
@ -73,7 +73,7 @@ bool DepthToSpaceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inp
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, this, ¶llel_search_info_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
|
|
|
@ -51,8 +51,7 @@ bool EluGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, con
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void EluGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
|
void EluGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
|
||||||
const std::vector<AddressPtr> &outputs) const {
|
|
||||||
const auto *input0 = reinterpret_cast<T *>(inputs[0]->addr);
|
const auto *input0 = reinterpret_cast<T *>(inputs[0]->addr);
|
||||||
const auto *input1 = reinterpret_cast<T *>(inputs[1]->addr);
|
const auto *input1 = reinterpret_cast<T *>(inputs[1]->addr);
|
||||||
auto *output = reinterpret_cast<T *>(outputs[0]->addr);
|
auto *output = reinterpret_cast<T *>(outputs[0]->addr);
|
||||||
|
@ -64,7 +63,7 @@ void EluGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs,
|
||||||
output[i] = (input1[i] < static_cast<T>(0)) ? input0[i] * (input1[i] + alpha) : input0[i];
|
output[i] = (input1[i] < static_cast<T>(0)) ? input0[i] * (input1[i] + alpha) : input0[i];
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, lens);
|
ParallelLaunchAutoSearch(task, lens, this, ¶llel_search_info_);
|
||||||
}
|
}
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
|
@ -34,7 +34,7 @@ class EluGradCPUKernel : public CPUKernel {
|
||||||
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
|
||||||
const std::vector<AddressPtr> &outputs) override;
|
const std::vector<AddressPtr> &outputs) override;
|
||||||
template <typename T>
|
template <typename T>
|
||||||
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) const;
|
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
TypeId dtype_{kTypeUnknown};
|
TypeId dtype_{kTypeUnknown};
|
||||||
|
|
|
@ -59,7 +59,7 @@ bool HSigmoidCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, tensor_size_);
|
ParallelLaunchAutoSearch(task, tensor_size_, this, ¶llel_search_info_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
|
|
|
@ -58,7 +58,7 @@ bool HSigmoidGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, tensor_size_);
|
ParallelLaunchAutoSearch(task, tensor_size_, this, ¶llel_search_info_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
|
|
|
@ -57,7 +57,7 @@ bool HSwishCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, tensor_size_);
|
ParallelLaunchAutoSearch(task, tensor_size_, this, ¶llel_search_info_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
|
|
|
@ -61,7 +61,7 @@ bool HSwishGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &input
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, tensor_size_);
|
ParallelLaunchAutoSearch(task, tensor_size_, this, ¶llel_search_info_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
|
|
|
@ -93,7 +93,7 @@ bool IOUCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, cons
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, iou_size_);
|
ParallelLaunchAutoSearch(task, iou_size_, this, ¶llel_search_info_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
|
|
|
@ -90,7 +90,7 @@ void L2NormalizeCPUKernel<T>::CalcDenominator(const T *input_addr, const size_t
|
||||||
(*denominator_addr)[i] = sqrt(denominator);
|
(*denominator_addr)[i] = sqrt(denominator);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, reduce_size);
|
ParallelLaunchAutoSearch(task, reduce_size, this, ¶llel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -120,7 +120,7 @@ void L2NormalizeCPUKernel<T>::CalcOutput(const T *input_addr, const std::vector<
|
||||||
iter.GenNextPos();
|
iter.GenNextPos();
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size);
|
ParallelLaunchAutoSearch(task, output_size, this, ¶llel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
|
|
@ -65,7 +65,7 @@ bool L2NormalizeGradCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs,
|
||||||
GetOutput(input_x_vector, y_vector, dout_vector, high_dim_index, &output[i]);
|
GetOutput(input_x_vector, y_vector, dout_vector, high_dim_index, &output[i]);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size);
|
ParallelLaunchAutoSearch(task, output_size, this, ¶llel_search_info_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -86,11 +86,11 @@ bool AddNCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, const
|
||||||
const auto input_1 = reinterpret_cast<int *>(inputs[1]->addr);
|
const auto input_1 = reinterpret_cast<int *>(inputs[1]->addr);
|
||||||
auto output = reinterpret_cast<int *>(outputs[0]->addr);
|
auto output = reinterpret_cast<int *>(outputs[0]->addr);
|
||||||
auto task_0 = std::bind(AddInt, input_0, input_1, output, std::placeholders::_1, std::placeholders::_2);
|
auto task_0 = std::bind(AddInt, input_0, input_1, output, std::placeholders::_1, std::placeholders::_2);
|
||||||
CPUKernelUtils::ParallelFor(task_0, elements_num);
|
ParallelLaunchAutoSearch(task_0, elements_num, this, ¶llel_search_info_);
|
||||||
for (size_t index = 2; index < input_num_; ++index) {
|
for (size_t index = 2; index < input_num_; ++index) {
|
||||||
const auto input = reinterpret_cast<int *>(inputs[index]->addr);
|
const auto input = reinterpret_cast<int *>(inputs[index]->addr);
|
||||||
auto task = std::bind(AddInt, input, output, output, std::placeholders::_1, std::placeholders::_2);
|
auto task = std::bind(AddInt, input, output, output, std::placeholders::_1, std::placeholders::_2);
|
||||||
CPUKernelUtils::ParallelFor(task, elements_num);
|
ParallelLaunchAutoSearch(task, elements_num, this, ¶llel_search_info_);
|
||||||
}
|
}
|
||||||
} else if (dtype_ == kNumberTypeFloat64) {
|
} else if (dtype_ == kNumberTypeFloat64) {
|
||||||
size_t elements_num = outputs[0]->size / sizeof(double);
|
size_t elements_num = outputs[0]->size / sizeof(double);
|
||||||
|
|
|
@ -49,7 +49,7 @@ void NMSWithMaskCPUKernel<T>::NmsBitonicSortByKeyKernel(const int inner, const s
|
||||||
index_buff[i] = i;
|
index_buff[i] = i;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task1, ceil_power2);
|
ParallelLaunchAutoSearch(task1, ceil_power2, this, ¶llel_search_info_);
|
||||||
|
|
||||||
for (size_t i = 2; i <= ceil_power2; i <<= 1) {
|
for (size_t i = 2; i <= ceil_power2; i <<= 1) {
|
||||||
for (size_t j = (i >> 1); j > 0; j >>= 1) {
|
for (size_t j = (i >> 1); j > 0; j >>= 1) {
|
||||||
|
@ -71,7 +71,7 @@ void NMSWithMaskCPUKernel<T>::NmsBitonicSortByKeyKernel(const int inner, const s
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task2, ceil_power2);
|
ParallelLaunchAutoSearch(task2, ceil_power2, this, ¶llel_search_info_);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -84,7 +84,7 @@ void NMSWithMaskCPUKernel<T>::MaskInit(size_t numSq, bool *row_mask) {
|
||||||
row_mask[mat_pos] = true;
|
row_mask[mat_pos] = true;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, numSq);
|
ParallelLaunchAutoSearch(task, numSq, this, ¶llel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
// copy data from input to output array sorted by indices returned from bitonic sort
|
// copy data from input to output array sorted by indices returned from bitonic sort
|
||||||
|
@ -122,7 +122,7 @@ void NMSWithMaskCPUKernel<T>::PopulateOutput(const T *data_in, T *data_out, cons
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, IntToSize(num));
|
ParallelLaunchAutoSearch(task, IntToSize(num), this, ¶llel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
// populated return mask (init to all true) and return index array
|
// populated return mask (init to all true) and return index array
|
||||||
|
@ -134,7 +134,7 @@ void NMSWithMaskCPUKernel<T>::Preprocess(const int num, int *sel_idx, bool *sel_
|
||||||
sel_boxes[box_num] = true;
|
sel_boxes[box_num] = true;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, IntToSize(num));
|
ParallelLaunchAutoSearch(task, IntToSize(num), this, ¶llel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -175,7 +175,7 @@ void NMSWithMaskCPUKernel<T>::NmsPass(const int num, const float IOU_value, cons
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, IntToSize(num * num));
|
ParallelLaunchAutoSearch(task, IntToSize(num * num), this, ¶llel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Reduce pass runs on 1 block to allow thread sync
|
// Reduce pass runs on 1 block to allow thread sync
|
||||||
|
@ -192,7 +192,7 @@ void NMSWithMaskCPUKernel<T>::ReducePass(const int num, bool *sel_boxes, const b
|
||||||
sel_boxes[j] = sel_boxes[j] && row_mask[i * num + j];
|
sel_boxes[j] = sel_boxes[j] && row_mask[i * num + j];
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, IntToSize(num));
|
ParallelLaunchAutoSearch(task, IntToSize(num), this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -61,28 +61,8 @@ bool PackCpuFwdKernel<T>::Launch(const std::vector<AddressPtr> &inputs, const st
|
||||||
|
|
||||||
// multi-threading
|
// multi-threading
|
||||||
size_t input_size = output_size_;
|
size_t input_size = output_size_;
|
||||||
size_t max_thread_num = std::max(std::thread::hardware_concurrency(), static_cast<unsigned int>(1));
|
auto task = [this, &output](size_t start, size_t end) { PackTensor(output, start, end); };
|
||||||
size_t use_thread_num =
|
ParallelLaunchAutoSearch(task, input_size, this, &parallel_search_info_);
|
||||||
input_size < 128 * max_thread_num ? std::ceil(static_cast<float>(input_size / 128.0)) : max_thread_num;
|
|
||||||
std::vector<std::thread> threads;
|
|
||||||
|
|
||||||
if (use_thread_num < 1) {
|
|
||||||
use_thread_num = 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
threads.reserve(use_thread_num);
|
|
||||||
size_t start = 0;
|
|
||||||
size_t batch_size = (input_size + use_thread_num - 1) / use_thread_num;
|
|
||||||
|
|
||||||
while (start < input_size) {
|
|
||||||
size_t end = (start + batch_size) > input_size ? input_size : (start + batch_size);
|
|
||||||
(void)threads.emplace_back(std::thread(&PackCpuFwdKernel::PackTensor, this, output, start, end));
|
|
||||||
start += batch_size;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto &it : threads) {
|
|
||||||
it.join();
|
|
||||||
}
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -34,35 +34,16 @@ void StandardNormal(float *output, std::normal_distribution<float> distribution,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void LaunchStandardNormal(unsigned int seed, const std::vector<AddressPtr> &outputs) {
|
void LaunchStandardNormal(RandomCPUKernel *content, unsigned int seed, const std::vector<AddressPtr> &outputs) {
|
||||||
auto output = reinterpret_cast<float *>(outputs[0]->addr);
|
auto output = reinterpret_cast<float *>(outputs[0]->addr);
|
||||||
// multithreading
|
// multithreading
|
||||||
size_t lens = outputs[0]->size / sizeof(float);
|
size_t lens = outputs[0]->size / sizeof(float);
|
||||||
auto max_thread_num = std::thread::hardware_concurrency();
|
auto task = [&seed, &output](size_t start, size_t end) {
|
||||||
size_t thread_num = lens < 128 * max_thread_num ? std::ceil(lens / 128.0) : max_thread_num;
|
std::normal_distribution<float> distribution;
|
||||||
if (thread_num < 1) {
|
|
||||||
MS_LOG(ERROR) << "Invalid value: thread_num " << thread_num;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
std::vector<std::thread> threads;
|
|
||||||
threads.reserve(thread_num);
|
|
||||||
size_t start = 0;
|
|
||||||
size_t once_compute_size = (lens + thread_num - 1) / thread_num;
|
|
||||||
if (once_compute_size < 1) {
|
|
||||||
MS_LOG(ERROR) << "Invalid value: once_compute_size " << once_compute_size;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
std::normal_distribution<float> distribution;
|
|
||||||
while (start < lens) {
|
|
||||||
// avoid different threads using the same seed to generate the same random number
|
|
||||||
std::default_random_engine random_generator(++seed);
|
std::default_random_engine random_generator(++seed);
|
||||||
size_t end = (start + once_compute_size) > lens ? lens : (start + once_compute_size);
|
StandardNormal(output, distribution, random_generator, start, end);
|
||||||
(void)threads.emplace_back(std::thread(StandardNormal, output, distribution, random_generator, start, end));
|
};
|
||||||
start += once_compute_size;
|
ParallelLaunchAutoSearch(task, lens, content, &content->parallel_search_info_);
|
||||||
}
|
|
||||||
for (size_t i = 0; i < threads.size(); ++i) {
|
|
||||||
threads[i].join();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void LaunchUniformInt(unsigned int seed, const std::vector<AddressPtr> &inputs,
|
void LaunchUniformInt(unsigned int seed, const std::vector<AddressPtr> &inputs,
|
||||||
|
@ -138,7 +119,7 @@ bool RandomCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs, cons
|
||||||
|
|
||||||
if (random_op_type_ == RANDOM_OP_NORMAL) {
|
if (random_op_type_ == RANDOM_OP_NORMAL) {
|
||||||
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kStandardNormalOutputsNum, kernel_name_);
|
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kStandardNormalOutputsNum, kernel_name_);
|
||||||
LaunchStandardNormal(RNG_seed, outputs);
|
LaunchStandardNormal(this, RNG_seed, outputs);
|
||||||
} else if (random_op_type_ == RANDOM_OP_UNIFORM_INT) {
|
} else if (random_op_type_ == RANDOM_OP_UNIFORM_INT) {
|
||||||
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kUniformIntInputsNum, kernel_name_);
|
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kUniformIntInputsNum, kernel_name_);
|
||||||
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kUniformIntOutputsNum, kernel_name_);
|
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kUniformIntOutputsNum, kernel_name_);
|
||||||
|
|
|
@ -176,7 +176,7 @@ bool ReduceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, c
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, output_size);
|
ParallelLaunchAutoSearch(task, output_size, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -204,7 +204,7 @@ void ReduceCPUKernel<T>::AccelerateLongVector(T *input_addr, T *output_addr, siz
|
||||||
reduce_func_(&block_output, 0, output_addr);
|
reduce_func_(&block_output, 0, output_addr);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, input_size);
|
ParallelLaunchAutoSearch(task, input_size, this, &parallel_search_info_);
|
||||||
if (reduce_type_ == kReduceMean) {
|
if (reduce_type_ == kReduceMean) {
|
||||||
*output_addr /= input_size;
|
*output_addr /= input_size;
|
||||||
}
|
}
|
||||||
|
|
|
@ -88,7 +88,7 @@ class BufferCPUAppendKernel : public CPUKernel {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, element_nums_);
|
ParallelLaunchAutoSearch(task, element_nums_, this, &parallel_search_info_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -77,7 +77,7 @@ class BufferCPUGetKernel : public CPUKernel {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, element_nums_);
|
ParallelLaunchAutoSearch(task, element_nums_, this, &parallel_search_info_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -104,7 +104,7 @@ class BufferCPUSampleKernel : public CPUKernel {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, batch_size_);
|
ParallelLaunchAutoSearch(task, batch_size_, this, &parallel_search_info_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -44,7 +44,7 @@ void RMSPropCPUKernel<T>::LaunchRMSPropUnuseCenter(T *variable, T *mean_square,
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
CPUKernelUtils::ParallelFor(task, size_);
|
ParallelLaunchAutoSearch(task, size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
@ -70,7 +70,7 @@ void RMSPropCPUKernel<T>::LaunchRMSPropUseCenter(T *variable, T *mean_square, T
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
CPUKernelUtils::ParallelFor(task, size_);
|
ParallelLaunchAutoSearch(task, size_, this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T>
|
template <typename T>
|
||||||
|
|
|
@ -109,8 +109,7 @@ bool ROIAlignCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
||||||
out_data[thread_idx] = accumulate_val;
|
out_data[thread_idx] = accumulate_val;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, elem_num);
|
ParallelLaunchAutoSearch(task, elem_num, this, &parallel_search_info_);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -117,7 +117,7 @@ bool ROIAlignGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inp
|
||||||
dx[thread_idx] = ZERO;
|
dx[thread_idx] = ZERO;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task1, IntToSize(size_init));
|
ParallelLaunchAutoSearch(task1, IntToSize(size_init), this, &parallel_search_info_);
|
||||||
|
|
||||||
int elem_num = roi_rows_ * channels_ * pooled_height_ * pooled_width_;
|
int elem_num = roi_rows_ * channels_ * pooled_height_ * pooled_width_;
|
||||||
auto task2 = [this, &dy, &rois, &dx](size_t start, size_t end) {
|
auto task2 = [this, &dy, &rois, &dx](size_t start, size_t end) {
|
||||||
|
@ -176,7 +176,7 @@ bool ROIAlignGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task2, IntToSize(elem_num));
|
ParallelLaunchAutoSearch(task2, IntToSize(elem_num), this, &parallel_search_info_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -27,7 +27,8 @@ constexpr size_t kScatterNdOutputSize = 1;
|
||||||
constexpr size_t kMinIndiceRank = 2;
|
constexpr size_t kMinIndiceRank = 2;
|
||||||
|
|
||||||
template <typename S, typename T>
|
template <typename S, typename T>
|
||||||
void Compute(const ComputeParams<S, T> *params, const size_t start, const size_t end) {
|
void Compute(ScatterNdCPUKernel<S, T> *content, const ComputeParams<S, T> *params, const size_t start,
|
||||||
|
const size_t end) {
|
||||||
T *target = params->target_;
|
T *target = params->target_;
|
||||||
S *indices = params->indices_;
|
S *indices = params->indices_;
|
||||||
T *updates = params->updates_;
|
T *updates = params->updates_;
|
||||||
|
@ -47,7 +48,7 @@ void Compute(const ComputeParams<S, T> *params, const size_t start, const size_t
|
||||||
target[IntToSize(offset) + idx] += updates[IntToSize(params->unit_size_) * i + idx];
|
target[IntToSize(offset) + idx] += updates[IntToSize(params->unit_size_) * i + idx];
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, IntToSize(params->unit_size_));
|
ParallelLaunchAutoSearch(task, IntToSize(params->unit_size_), content, &content->parallel_search_info_);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} // namespace
|
} // namespace
|
||||||
|
@ -113,10 +114,10 @@ bool ScatterNdCPUKernel<S, T>::Launch(const std::vector<kernel::AddressPtr> &inp
|
||||||
|
|
||||||
auto task = [this, &params](size_t start, size_t end) {
|
auto task = [this, &params](size_t start, size_t end) {
|
||||||
for (size_t idx = start; idx < end; idx++) {
|
for (size_t idx = start; idx < end; idx++) {
|
||||||
Compute<S, T>(&params, idx, idx + 1);
|
Compute<S, T>(this, &params, idx, idx + 1);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, num_units_);
|
ParallelLaunchAutoSearch(task, num_units_, this, &parallel_search_info_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -70,7 +70,7 @@ bool SearchSortedCPUKernel<S, T>::Launch(const std::vector<kernel::AddressPtr> &
|
||||||
output[i] = static_cast<T>(result);
|
output[i] = static_cast<T>(result);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, elem_num);
|
ParallelLaunchAutoSearch(task, elem_num, this, &parallel_search_info_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -96,7 +96,7 @@ void SearchSortedCPUKernel<S, T>::CheckParam(const std::vector<AddressPtr> &inpu
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, IntToSize(list_count));
|
ParallelLaunchAutoSearch(task, IntToSize(list_count), this, &parallel_search_info_);
|
||||||
}
|
}
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
|
@ -72,7 +72,7 @@ bool SGDCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs, const std::v
|
||||||
output_param[i] = param[i];
|
output_param[i] = param[i];
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, elem_num);
|
ParallelLaunchAutoSearch(task, elem_num, this, &parallel_search_info_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
|
|
|
@ -63,7 +63,7 @@ bool SmoothL1LossCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inp
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
CPUKernelUtils::ParallelFor(task, tensor_size_);
|
ParallelLaunchAutoSearch(task, tensor_size_, this, &parallel_search_info_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
|
|
|
@ -84,7 +84,7 @@ bool SpaceToDepthCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inp
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
CPUKernelUtils::ParallelFor(task, size);
|
ParallelLaunchAutoSearch(task, size, this, &parallel_search_info_);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
} // namespace kernel
|
} // namespace kernel
|
||||||
|
|
|
@ -23,23 +23,22 @@ namespace runtime {
|
||||||
void ComputeThreadNums(size_t *actor_thread_num, size_t *actor_and_kernel_thread_num) {
|
void ComputeThreadNums(size_t *actor_thread_num, size_t *actor_and_kernel_thread_num) {
|
||||||
MS_EXCEPTION_IF_NULL(actor_thread_num);
|
MS_EXCEPTION_IF_NULL(actor_thread_num);
|
||||||
MS_EXCEPTION_IF_NULL(actor_and_kernel_thread_num);
|
MS_EXCEPTION_IF_NULL(actor_and_kernel_thread_num);
|
||||||
size_t cpu_core_num = std::thread::hardware_concurrency() - 1;
|
const size_t cpu_core_num = std::thread::hardware_concurrency() - 1;
|
||||||
|
|
||||||
// Compute the actor thread num.
|
|
||||||
const size_t kActorThreadMaxNum = 5;
|
|
||||||
// The MemoryManagerActor binds single thread, and the other actors share one thread at least, so the min num is 2.
|
// The MemoryManagerActor binds single thread, and the other actors share one thread at least, so the min num is 2.
|
||||||
const size_t kActorThreadMinNum = 2;
|
const size_t kActorThreadMinNum = 2;
|
||||||
|
// Compute the actor thread num.
|
||||||
|
const size_t kActorThreadMaxNum = 5;
|
||||||
|
// A machine may run multiple processes, so a single process should not use all CPUs. By default, assume 5 processes run at the same time.
|
||||||
|
const size_t kParallelNum = 5;
|
||||||
|
const size_t kThreadMaxNum = cpu_core_num / kParallelNum;
|
||||||
|
|
||||||
auto context_ptr = MsContext::GetInstance();
|
auto context_ptr = MsContext::GetInstance();
|
||||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||||
*actor_thread_num = cpu_core_num < kActorThreadMinNum ? kActorThreadMinNum : cpu_core_num;
|
*actor_thread_num = cpu_core_num < kActorThreadMinNum ? kActorThreadMinNum : cpu_core_num;
|
||||||
*actor_thread_num = *actor_thread_num > kActorThreadMaxNum ? kActorThreadMaxNum : *actor_thread_num;
|
*actor_thread_num = *actor_thread_num > kActorThreadMaxNum ? kActorThreadMaxNum : *actor_thread_num;
|
||||||
|
|
||||||
// Compute the actor and kernel thread num.
|
// Compute the actor and kernel thread num. 1 thread is useless for kernel, so the kernel thread num should be at least 2.
|
||||||
const size_t kActorAndKernelThreadMaxNum = 23;
|
*actor_and_kernel_thread_num = kThreadMaxNum > (*actor_thread_num + 2) ? kThreadMaxNum : (*actor_thread_num + 2);
|
||||||
*actor_and_kernel_thread_num = cpu_core_num > *actor_thread_num ? cpu_core_num : (*actor_thread_num + 1);
|
|
||||||
*actor_and_kernel_thread_num = *actor_and_kernel_thread_num > kActorAndKernelThreadMaxNum
|
|
||||||
? kActorAndKernelThreadMaxNum
|
|
||||||
: *actor_and_kernel_thread_num;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool IsDeviceQueueDSActor(const AnfNodePtr &node, GraphExecutionStrategy strategy) {
|
bool IsDeviceQueueDSActor(const AnfNodePtr &node, GraphExecutionStrategy strategy) {
|
||||||
|
|
Loading…
Reference in New Issue