!49561 Xlogy performance optimization

Merge pull request !49561 from zhangzhaoju/master_xlogy
i-robot 2023-03-01 09:06:10 +00:00 committed by Gitee
commit d22f57ae7a
5 changed files with 253 additions and 72 deletions
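
In short, this change drops Xlogy from the generic element-wise arithmetic CPU kernel and registers a dedicated kernel that evaluates x * log(y) through Eigen's vectorized array expressions, falling back to an index-gathering scalar loop only for broadcast shapes and going parallel only past a size threshold. A minimal sketch of the vectorized core idea, assuming only Eigen (the helper name and the main() driver are made up for illustration; this is not the MindSpore kernel itself):

#include <cstddef>
#include <iostream>
#include "Eigen/Eigen"

// Hypothetical helper illustrating the vectorized core: map raw buffers as
// Eigen arrays and evaluate x * log(y) as one coefficient-wise expression.
template <typename T>
void XlogyVectorizedSketch(const T *x, const T *y, T *out, std::size_t n) {
  Eigen::Map<const Eigen::Array<T, -1, 1>> x_v(x, n);
  Eigen::Map<const Eigen::Array<T, -1, 1>> y_v(y, n);
  Eigen::Map<Eigen::Array<T, -1, 1>> o_v(out, n);
  o_v = x_v * y_v.log();  // Eigen vectorizes this loop internally
}

int main() {
  float x[] = {1.0f, 2.0f, 3.0f};
  float y[] = {1.0f, 2.71828f, 7.38906f};
  float out[3];
  XlogyVectorizedSketch(x, y, out, 3);
  for (float v : out) std::cout << v << ' ';  // prints roughly: 0 2 6
  std::cout << '\n';
}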

View File

@@ -48,6 +48,7 @@
 "mindspore/mindspore/ccsrc/pipeline/pynative/grad/bprop_expander/grad_ops/grad_quant_ops.cc" "internalAstError"
 "mindspore/mindspore/ccsrc/pipeline/pynative/grad/bprop_expander/grad_ops/grad_scipy_ops.cc" "internalAstError"
 "mindspore/mindspore/ccsrc/pipeline/pynative/grad/bprop_expander/grad_ops/grad_sparse_ops.cc" "internalAstError"
+"mindspore/mindspore/ccsrc/plugin/device/cpu/kernel/xlogy_cpu_kernel.cc" "unreadVariable"
 # MindData
 "mindspore/mindspore/ccsrc/minddata/dataset/engine/dataset_iterator.cc" "useStlAlgorithm"
@@ -114,4 +115,4 @@
 "mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "identicalConditionAfterEarlyExit"
 "mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitMemberVar"
 "mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "redundantInitialization"
-"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "redundantCondition"
+"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "redundantCondition"

View File

@@ -50,7 +50,6 @@ constexpr auto kFloorDiv = "FloorDiv";
 constexpr auto kMod = "Mod";
 constexpr auto kFloorMod = "FloorMod";
 constexpr auto kSquaredDifference = "SquaredDifference";
-constexpr auto kXlogy = "Xlogy";
 constexpr auto kAtan2 = "Atan2";
 template <typename T>
@@ -218,8 +217,7 @@ class ArithmeticCpuTypeFunc : public CpuKernelFunc {
 {kFloorDiv, &ArithmeticCpuTypeFunc<T>::FloorDiv},
 {kAtan2, &ArithmeticCpuTypeFunc<T>::Atan2},
 {kRealDiv, &ArithmeticCpuTypeFunc<T>::RealDiv},
-{kSquaredDifference, &ArithmeticCpuTypeFunc<T>::SquaredDifference},
-{kXlogy, &ArithmeticCpuTypeFunc<T>::Xlogy}};
+{kSquaredDifference, &ArithmeticCpuTypeFunc<T>::SquaredDifference}};
 } else {
 dtype_desc = "complex data";
 arithmeticMathFuncMap = {{kSquaredDifference, &ArithmeticCpuTypeFunc<T>::SquaredDifferenceComplex},
@@ -230,8 +228,7 @@ class ArithmeticCpuTypeFunc : public CpuKernelFunc {
 {kMul, &ArithmeticCpuTypeFunc<T>::Mul},
 {kDivNoNan, &ArithmeticCpuTypeFunc<T>::DivNoNan},
 {kAddV2, &ArithmeticCpuTypeFunc<T>::AddV2},
-{kPow, &ArithmeticCpuTypeFunc<T>::PowComplex},
-{kXlogy, &ArithmeticCpuTypeFunc<T>::Xlogy}};
+{kPow, &ArithmeticCpuTypeFunc<T>::PowComplex}};
 }
 if (arithmeticMathFuncMap.find(kernel_name_) == arithmeticMathFuncMap.end()) {
 MS_LOG(EXCEPTION) << "For 'Arithmetic', it only supports operators in "
@@ -1221,23 +1218,6 @@ static std::map<std::string, std::vector<std::pair<KernelAttr, ArithmeticCpuFunc
 .AddInputAttr(kNumberTypeComplex128)
 .AddOutputAttr(kNumberTypeComplex128),
 SpecializeArithFunc<complex128>}}},
-{kXlogy,
-{{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
-SpecializeArithFunc<float>},
-{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
-SpecializeArithFunc<double>},
-{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
-SpecializeArithFunc<float16>},
-{KernelAttr()
-.AddInputAttr(kNumberTypeComplex64)
-.AddInputAttr(kNumberTypeComplex64)
-.AddOutputAttr(kNumberTypeComplex64),
-SpecializeArithFunc<complex64>},
-{KernelAttr()
-.AddInputAttr(kNumberTypeComplex128)
-.AddInputAttr(kNumberTypeComplex128)
-.AddOutputAttr(kNumberTypeComplex128),
-SpecializeArithFunc<complex128>}}},
 {kAtan2,
 {{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
 SpecializeArithFunc<float16>},
@@ -1344,8 +1324,6 @@ MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, AssignSub,
 MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, SquaredDifference,
 []() { return std::make_shared<ArithmeticCpuKernelMod>(kSquaredDifference); });
-MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, Xlogy,
-[]() { return std::make_shared<ArithmeticCpuKernelMod>(kXlogy); });
 MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, Atan2,
 []() { return std::make_shared<ArithmeticCpuKernelMod>(kAtan2); });
 MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, AddV2,

View File

@@ -45,52 +45,23 @@ complex64 GetDivZeroVal(const complex64 &) {
 return std::numeric_limits<complex64>::quiet_NaN();
 }
-template <class T>
-bool isZero(const T &val) {
-return val == T(0.0f);
-}
-template <>
-bool isZero(const float &val) {
-return std::fpclassify(val) == FP_ZERO;
-}
-template <>
-bool isZero(const double &val) {
-return std::fpclassify(val) == FP_ZERO;
-}
 template <typename T>
-void SameShapeTask(T *x_addr, T *y_addr, T *output_addr, size_t start, size_t end) {
-for (size_t i = start; i < end; i++) {
-auto dividend = x_addr[i];
-auto divisor = y_addr[i];
-if (isZero(divisor)) {
-if (isZero(dividend)) {
-output_addr[i] = static_cast<T>(0.0);
-continue;
-}
-output_addr[i] = GetDivZeroVal(dividend);
-continue;
-}
-output_addr[i] = dividend / divisor;
-}
+void XDivySameShapeTask(T *x_addr, T *y_addr, T *output_addr, size_t start, size_t end) {
+Eigen::Map<Eigen::Array<T, -1, 1>> x_v(x_addr + start, end - start);
+Eigen::Map<Eigen::Array<T, -1, 1>> y_v(y_addr + start, end - start);
+Eigen::Map<Eigen::Array<T, -1, 1>> o_v(output_addr + start, end - start);
+o_v = (x_v == T(0)).select(o_v, x_v / y_v);
 }
 template <>
-void SameShapeTask(float *x_addr, float *y_addr, float *output_addr, size_t start, size_t end) {
-Eigen::Map<Eigen::ArrayXf> x_v(x_addr + start, end - start);
-Eigen::Map<Eigen::ArrayXf> y_v(y_addr + start, end - start);
-Eigen::Map<Eigen::ArrayXf> o_v(output_addr + start, end - start);
-o_v = (x_v == 0).select(o_v, x_v / y_v);
-}
-template <>
-void SameShapeTask(double *x_addr, double *y_addr, double *output_addr, size_t start, size_t end) {
-Eigen::Map<Eigen::ArrayXd> x_v(x_addr + start, end - start);
-Eigen::Map<Eigen::ArrayXd> y_v(y_addr + start, end - start);
-Eigen::Map<Eigen::ArrayXd> o_v(output_addr + start, end - start);
-o_v = (x_v == 0).select(o_v, x_v / y_v);
+void XDivySameShapeTask(float16 *x_addr, float16 *y_addr, float16 *output_addr, size_t start, size_t end) {
+Eigen::half *ex_addr = reinterpret_cast<Eigen::half *>(x_addr);
+Eigen::half *ey_addr = reinterpret_cast<Eigen::half *>(y_addr);
+Eigen::half *eo_addr = reinterpret_cast<Eigen::half *>(output_addr);
+Eigen::Map<Eigen::Array<Eigen::half, -1, 1>> x_v(ex_addr + start, end - start);
+Eigen::Map<Eigen::Array<Eigen::half, -1, 1>> y_v(ey_addr + start, end - start);
+Eigen::Map<Eigen::Array<Eigen::half, -1, 1>> o_v(eo_addr + start, end - start);
+o_v = (x_v == Eigen::half(0)).select(o_v, x_v / y_v);
 }
 template <typename T>
@@ -107,14 +78,12 @@ bool XdivyCpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &inpu
 auto output_addr = static_cast<T *>(outputs[0]->addr);
 size_t output_size = outputs[0]->size / sizeof(T);
 auto sameShapeTask = [&x_addr, &y_addr, &output_addr](size_t start, size_t end) {
-SameShapeTask(x_addr, y_addr, output_addr, start, end);
+XDivySameShapeTask(x_addr, y_addr, output_addr, start, end);
 };
 auto diffShapeTask = [this, &x_addr, &y_addr, &output_addr](size_t start, size_t end) {
 for (size_t i = start; i < end; i++) {
-auto idxX = index_listx_[i];
-auto idxY = index_listy_[i];
-auto dividend = x_addr[idxX];
-auto divisor = y_addr[idxY];
+auto dividend = x_addr[index_listx_[i]];
+auto divisor = y_addr[index_listy_[i]];
 auto zero = static_cast<T>(0);
 if (divisor == zero) {
 if (dividend == zero) {
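
A note on the rewrite above: the scalar loop with the isZero() helpers is replaced by Eigen's coefficient-wise select, where (x_v == T(0)).select(o_v, x_v / y_v) keeps the existing output coefficient when x is zero and computes x / y otherwise, letting IEEE division produce inf/nan for zero divisors instead of branching per element. A self-contained sketch of that idiom (the function name is hypothetical; zeroing the output first is this sketch's choice, not the kernel's):

#include <cstddef>
#include "Eigen/Eigen"

// cond.select(a, b) yields a[i] where cond[i] is true, else b[i] —
// a branch-free replacement for a per-element if.
void XdivySelectSketch(float *x, float *y, float *out, std::size_t n) {
  Eigen::Map<Eigen::ArrayXf> x_v(x, n);
  Eigen::Map<Eigen::ArrayXf> y_v(y, n);
  Eigen::Map<Eigen::ArrayXf> o_v(out, n);
  o_v.setZero();                               // xdivy(0, y) == 0
  o_v = (x_v == 0.0f).select(o_v, x_v / y_v);  // elsewhere: x / y
}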

View File

@@ -0,0 +1,168 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "plugin/device/cpu/kernel/xlogy_cpu_kernel.h"
#include <algorithm>
#include <utility>
#include <limits>
#include <cmath>
#include "plugin/device/cpu/hal/device/cpu_device_address.h"
#include "Eigen/Eigen"
namespace mindspore {
namespace kernel {
using complex64 = std::complex<float>;
using complex128 = std::complex<double>;
static constexpr size_t INPUT_NUM = 2;
static constexpr size_t OUTPUT_NUM = 1;
static constexpr int MAX_DIMS = 7;
static constexpr size_t PARALLEL_THRESHOLD = 4096;
template <typename T>
void XlogySameShapeTask(T *x_addr, T *y_addr, T *output_addr, size_t start, size_t end) {
Eigen::Map<Eigen::Array<T, -1, 1>> x_v(x_addr + start, end - start);
Eigen::Map<Eigen::Array<T, -1, 1>> y_v(y_addr + start, end - start);
Eigen::Map<Eigen::Array<T, -1, 1>> o_v(output_addr + start, end - start);
o_v = x_v * y_v.log();
}
template <>
void XlogySameShapeTask(float16 *x_addr, float16 *y_addr, float16 *output_addr, size_t start, size_t end) {
Eigen::half *ex_addr = reinterpret_cast<Eigen::half *>(x_addr);
Eigen::half *ey_addr = reinterpret_cast<Eigen::half *>(y_addr);
Eigen::half *eo_addr = reinterpret_cast<Eigen::half *>(output_addr);
Eigen::Map<Eigen::Array<Eigen::half, -1, 1>> x_v(ex_addr + start, end - start);
Eigen::Map<Eigen::Array<Eigen::half, -1, 1>> y_v(ey_addr + start, end - start);
Eigen::Map<Eigen::Array<Eigen::half, -1, 1>> o_v(eo_addr + start, end - start);
o_v = x_v * y_v.log();
}
template <typename T>
bool XlogyCpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), INPUT_NUM, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), OUTPUT_NUM, kernel_name_);
if (has_null_input_) {
return true;
}
auto x_addr = static_cast<T *>(inputs[0]->addr);
auto y_addr = static_cast<T *>(inputs[1]->addr);
auto output_addr = static_cast<T *>(outputs[0]->addr);
size_t output_size = outputs[0]->size / sizeof(T);
auto sameShapeTask = [&x_addr, &y_addr, &output_addr](size_t start, size_t end) {
XlogySameShapeTask(x_addr, y_addr, output_addr, start, end);
};
auto diffShapeTask = [this, &x_addr, &y_addr, &output_addr](size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto x1 = x_addr[index_listx_[i]];
auto x2 = y_addr[index_listy_[i]];
auto logx2 = log(x2);
output_addr[i] = x1 * logx2;
}
};
CTask task = is_need_broadcast_ ? CTask(diffShapeTask) : CTask(sameShapeTask);
if (output_size < PARALLEL_THRESHOLD) {
task(0, output_size);
} else {
ParallelLaunch(task, output_size, PARALLEL_THRESHOLD, this, pool_);
}
return true;
}
bool XlogyCpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) {
return kernel_func_(this, inputs, workspace, outputs);
}
bool XlogyCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) {
kernel_name_ = base_operator->name();
CHECK_KERNEL_INPUTS_NUM(inputs.size(), INPUT_NUM, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), OUTPUT_NUM, kernel_name_);
auto x_type = inputs[0]->GetDtype();
auto y_type = inputs[1]->GetDtype();
auto out_type = outputs[0]->GetDtype();
if (!(x_type == y_type && x_type == out_type)) {
MS_LOG(ERROR) << "Xlogy need same input and output data type, but got X type:" << x_type << " Y type:" << y_type
<< " out type:" << out_type;
return false;
}
if (!MatchKernelFunc(base_operator, inputs, outputs)) {
return false;
}
return true;
}
int XlogyCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs,
const std::map<uint32_t, tensor::TensorPtr> &) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), INPUT_NUM, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), OUTPUT_NUM, kernel_name_);
ResetResource();
if (int ret = KernelMod::Resize(base_operator, inputs, outputs); ret != KRET_OK) {
return ret;
}
auto x_shape = LongVecToSizeVec(inputs.at(kIndex0)->GetShapeVector());
auto y_shape = LongVecToSizeVec(inputs.at(kIndex1)->GetShapeVector());
// while has null input, xlogy result is null too
has_null_input_ = CheckNullInput(x_shape);
has_null_input_ = has_null_input_ || CheckNullInput(y_shape);
if (has_null_input_) {
return 0;
}
auto out_shape = LongVecToSizeVec(outputs.at(kIndex0)->GetShapeVector());
if (out_shape.size() > MAX_DIMS || out_shape.size() < x_shape.size() || out_shape.size() < y_shape.size()) {
MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be greater than " << MAX_DIMS
<< ", and output dimension can't less than input; but got x_shape dimension:" << x_shape.size()
<< " ,y_shape dimension:" << y_shape.size() << " ,out_shape dimension:" << out_shape.size();
}
is_need_broadcast_ = x_shape != y_shape;
if (is_need_broadcast_) {
GetBroadCastIndex(x_shape, out_shape, &index_listx_);
GetBroadCastIndex(y_shape, out_shape, &index_listy_);
}
return 0;
}
const std::vector<std::pair<KernelAttr, XlogyCpuKernelMod::KernelRunFunc>> &XlogyCpuKernelMod::GetFuncList() const {
static const std::vector<std::pair<KernelAttr, XlogyCpuKernelMod::KernelRunFunc>> func_list = {
{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
&XlogyCpuKernelMod::LaunchKernel<float16>},
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
&XlogyCpuKernelMod::LaunchKernel<float>},
{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
&XlogyCpuKernelMod::LaunchKernel<double>},
{KernelAttr()
.AddInputAttr(kNumberTypeComplex64)
.AddInputAttr(kNumberTypeComplex64)
.AddOutputAttr(kNumberTypeComplex64),
&XlogyCpuKernelMod::LaunchKernel<complex64>},
{KernelAttr()
.AddInputAttr(kNumberTypeComplex128)
.AddInputAttr(kNumberTypeComplex128)
.AddOutputAttr(kNumberTypeComplex128),
&XlogyCpuKernelMod::LaunchKernel<complex128>},
};
return func_list;
}
MS_KERNEL_FACTORY_REG(NativeCpuKernelMod, Xlogy, XlogyCpuKernelMod);
} // namespace kernel
} // namespace mindspore
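
Two details of the new kernel above are easy to miss. Broadcasting is resolved once in Resize() via GetBroadCastIndex, so LaunchKernel only gathers through the precomputed index lists; and the launch path stays serial until output_size reaches PARALLEL_THRESHOLD (4096 elements), only then handing the task to ParallelLaunch. A hedged sketch of that size-gated dispatch, with a stand-in for the thread pool (names and the splitting policy are assumptions, not MindSpore's API):

#include <algorithm>
#include <cstddef>
#include <functional>
#include <thread>
#include <vector>

using Task = std::function<void(std::size_t, std::size_t)>;  // [start, end)

// Hypothetical stand-in for ParallelLaunch: run inline below the threshold,
// otherwise split the index range evenly across hardware threads.
void LaunchWithThreshold(const Task &task, std::size_t total, std::size_t threshold) {
  if (total < threshold) {
    task(0, total);  // small tensor: avoid thread-pool overhead entirely
    return;
  }
  std::size_t workers = std::max<std::size_t>(1, std::thread::hardware_concurrency());
  std::size_t chunk = (total + workers - 1) / workers;
  std::vector<std::thread> pool;
  for (std::size_t start = 0; start < total; start += chunk) {
    pool.emplace_back(task, start, std::min(total, start + chunk));
  }
  for (auto &t : pool) t.join();
}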

View File

@@ -0,0 +1,65 @@
/**
* Copyright 2023 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_XLOGY_CPU_KERNEL_H
#define MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_XLOGY_CPU_KERNEL_H
#include <complex>
#include <vector>
#include <map>
#include <utility>
#include "plugin/device/cpu/kernel/cpu_kernel.h"
#include "plugin/factory/ms_factory.h"
namespace mindspore {
namespace kernel {
class XlogyCpuKernelMod : public NativeCpuKernelMod, public MatchKernelHelper<XlogyCpuKernelMod> {
public:
XlogyCpuKernelMod() { ResetResource(); }
~XlogyCpuKernelMod() override = default;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs) override;
int Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
const std::vector<KernelTensorPtr> &outputs, const std::map<uint32_t, tensor::TensorPtr> &) override;
const std::vector<std::pair<KernelAttr, KernelRunFunc>> &GetFuncList() const override;
std::vector<KernelAttr> GetOpSupport() override { return OpSupport(); }
protected:
void ResetResource() noexcept {
input_size_list_.clear();
output_size_list_.clear();
workspace_size_list_.clear();
index_listx_.clear();
index_listy_.clear();
}
private:
template <typename T>
bool LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &workspace,
const std::vector<kernel::AddressPtr> &outputs);
std::vector<size_t> index_listx_{};
std::vector<size_t> index_listy_{};
bool is_need_broadcast_{false};
bool has_null_input_{false};
};
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_XLOGY_CPU_KERNEL_H
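
The header leans on the MatchKernelHelper CRTP mixin: GetFuncList() publishes one (KernelAttr, member function) pair per supported dtype, MatchKernelFunc picks kernel_func_ from that table during Init, and Launch simply forwards to it. A simplified, hypothetical miniature of that table-driven dispatch (stand-in types; not the MindSpore interfaces):

#include <cstdio>
#include <utility>
#include <vector>

enum class DType { kFloat32, kFloat64 };

// A static table maps a dtype key to a typed member function, resolved once
// at Init and invoked on every Launch.
class XlogySketch {
 public:
  bool Init(DType dtype) {
    for (const auto &entry : FuncList()) {
      if (entry.first == dtype) {
        func_ = entry.second;
        return true;
      }
    }
    return false;  // unsupported dtype combination
  }
  bool Launch() { return (this->*func_)(); }

 private:
  using RunFunc = bool (XlogySketch::*)();
  template <typename T>
  bool Run() {
    std::puts("running dtype-specialized kernel");
    return true;
  }
  static const std::vector<std::pair<DType, RunFunc>> &FuncList() {
    static const std::vector<std::pair<DType, RunFunc>> list = {
        {DType::kFloat32, &XlogySketch::Run<float>},
        {DType::kFloat64, &XlogySketch::Run<double>},
    };
    return list;
  }
  RunFunc func_{nullptr};
};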