From 000fc95842883d1bca5f83cc1a8a964fd19de16a Mon Sep 17 00:00:00 2001
From: zhangzhaoju
Date: Tue, 28 Feb 2023 15:16:23 +0800
Subject: [PATCH] xlogy performance optimize

---
 .jenkins/check/config/filter_cppcheck.txt     |   3 +-
 .../cpu/kernel/arithmetic_cpu_kernel.cc       |  26 +--
 .../device/cpu/kernel/xdivy_cpu_kernel.cc     |  63 ++-----
 .../device/cpu/kernel/xlogy_cpu_kernel.cc     | 168 ++++++++++++++++++
 .../device/cpu/kernel/xlogy_cpu_kernel.h      |  65 +++++++
 5 files changed, 253 insertions(+), 72 deletions(-)
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/xlogy_cpu_kernel.cc
 create mode 100644 mindspore/ccsrc/plugin/device/cpu/kernel/xlogy_cpu_kernel.h

diff --git a/.jenkins/check/config/filter_cppcheck.txt b/.jenkins/check/config/filter_cppcheck.txt
index df50889b791..97431ceff15 100644
--- a/.jenkins/check/config/filter_cppcheck.txt
+++ b/.jenkins/check/config/filter_cppcheck.txt
@@ -48,6 +48,7 @@
 "mindspore/mindspore/ccsrc/pipeline/pynative/grad/bprop_expander/grad_ops/grad_quant_ops.cc" "internalAstError"
 "mindspore/mindspore/ccsrc/pipeline/pynative/grad/bprop_expander/grad_ops/grad_scipy_ops.cc" "internalAstError"
 "mindspore/mindspore/ccsrc/pipeline/pynative/grad/bprop_expander/grad_ops/grad_sparse_ops.cc" "internalAstError"
+"mindspore/mindspore/ccsrc/plugin/device/cpu/kernel/xlogy_cpu_kernel.cc" "unreadVariable"
 
 # MindData
 "mindspore/mindspore/ccsrc/minddata/dataset/engine/dataset_iterator.cc" "useStlAlgorithm"
@@ -114,4 +115,4 @@
 "mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "identicalConditionAfterEarlyExit"
 "mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "uninitMemberVar"
 "mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "redundantInitialization"
-"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "redundantCondition"
\ No newline at end of file
+"mindspore/mindspore/ccsrc/plugin/device/ascend/kernel/aicpu/aicpu_ops/cpu_kernel/" "redundantCondition"
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/arithmetic_cpu_kernel.cc b/mindspore/ccsrc/plugin/device/cpu/kernel/arithmetic_cpu_kernel.cc
index 5f24a58fc3d..7230f9fb4b8 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/arithmetic_cpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/arithmetic_cpu_kernel.cc
@@ -50,7 +50,6 @@ constexpr auto kFloorDiv = "FloorDiv";
 constexpr auto kMod = "Mod";
 constexpr auto kFloorMod = "FloorMod";
 constexpr auto kSquaredDifference = "SquaredDifference";
-constexpr auto kXlogy = "Xlogy";
 constexpr auto kAtan2 = "Atan2";
 
 template <typename T>
@@ -218,8 +217,7 @@ class ArithmeticCpuTypeFunc : public CpuKernelFunc {
                               {kFloorDiv, &ArithmeticCpuTypeFunc<T>::FloorDiv},
                               {kAtan2, &ArithmeticCpuTypeFunc<T>::Atan2},
                               {kRealDiv, &ArithmeticCpuTypeFunc<T>::RealDiv},
-                              {kSquaredDifference, &ArithmeticCpuTypeFunc<T>::SquaredDifference},
-                              {kXlogy, &ArithmeticCpuTypeFunc<T>::Xlogy}};
+                              {kSquaredDifference, &ArithmeticCpuTypeFunc<T>::SquaredDifference}};
     } else {
       dtype_desc = "complex data";
       arithmeticMathFuncMap = {{kSquaredDifference, &ArithmeticCpuTypeFunc<T>::SquaredDifferenceComplex},
@@ -230,8 +228,7 @@ class ArithmeticCpuTypeFunc : public CpuKernelFunc {
                                {kMul, &ArithmeticCpuTypeFunc<T>::Mul},
                                {kDivNoNan, &ArithmeticCpuTypeFunc<T>::DivNoNan},
                                {kAddV2, &ArithmeticCpuTypeFunc<T>::AddV2},
-                               {kPow, &ArithmeticCpuTypeFunc<T>::PowComplex},
-                               {kXlogy, &ArithmeticCpuTypeFunc<T>::Xlogy}};
+                               {kPow, &ArithmeticCpuTypeFunc<T>::PowComplex}};
     }
     if (arithmeticMathFuncMap.find(kernel_name_) == arithmeticMathFuncMap.end()) {
      MS_LOG(EXCEPTION) << "For 'Arithmetic', it only supports operators in "
@@ -1221,23 +1218,6 @@ static std::map<std::string, std::vector<std::pair<KernelAttr, ArithmeticCpuFuncCreator>>>
      SpecializeArithFunc<complex128>}}},
-  {kXlogy,
-   {{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
-     SpecializeArithFunc<float>},
-    {KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
-     SpecializeArithFunc<double>},
-    {KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
-     SpecializeArithFunc<float16>},
-    {KernelAttr()
-       .AddInputAttr(kNumberTypeComplex64)
-       .AddInputAttr(kNumberTypeComplex64)
-       .AddOutputAttr(kNumberTypeComplex64),
-     SpecializeArithFunc<complex64>},
-    {KernelAttr()
-       .AddInputAttr(kNumberTypeComplex128)
-       .AddInputAttr(kNumberTypeComplex128)
-       .AddOutputAttr(kNumberTypeComplex128),
-     SpecializeArithFunc<complex128>}}},
   {kAtan2,
    {{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
      SpecializeArithFunc<float16>},
@@ -1344,8 +1324,6 @@ MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, AssignSub,
 MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, SquaredDifference,
                                  []() { return std::make_shared<ArithmeticCpuKernelMod>(kSquaredDifference); });
-MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, Xlogy,
-                                 []() { return std::make_shared<ArithmeticCpuKernelMod>(kXlogy); });
 MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, Atan2,
                                  []() { return std::make_shared<ArithmeticCpuKernelMod>(kAtan2); });
 MS_KERNEL_FACTORY_REG_BY_CREATOR(NativeCpuKernelMod, AddV2,
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/xdivy_cpu_kernel.cc b/mindspore/ccsrc/plugin/device/cpu/kernel/xdivy_cpu_kernel.cc
index 3d155c1aab9..78e34c12fa1 100644
--- a/mindspore/ccsrc/plugin/device/cpu/kernel/xdivy_cpu_kernel.cc
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/xdivy_cpu_kernel.cc
@@ -45,52 +45,23 @@ complex64 GetDivZeroVal(const complex64 &) {
   return std::numeric_limits<float>::quiet_NaN();
 }
 
-template <typename T>
-bool isZero(const T &val) {
-  return val == T(0.0f);
-}
-
-template <>
-bool isZero(const float &val) {
-  return std::fpclassify(val) == FP_ZERO;
-}
-
-template <>
-bool isZero(const double &val) {
-  return std::fpclassify(val) == FP_ZERO;
-}
-
 template <typename T>
-void SameShapeTask(T *x_addr, T *y_addr, T *output_addr, size_t start, size_t end) {
-  for (size_t i = start; i < end; i++) {
-    auto dividend = x_addr[i];
-    auto divisor = y_addr[i];
-    if (isZero(divisor)) {
-      if (isZero(dividend)) {
-        output_addr[i] = static_cast<T>(0.0);
-        continue;
-      }
-      output_addr[i] = GetDivZeroVal(dividend);
-      continue;
-    }
-    output_addr[i] = dividend / divisor;
-  }
+void XDivySameShapeTask(T *x_addr, T *y_addr, T *output_addr, size_t start, size_t end) {
+  Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>> x_v(x_addr + start, end - start);
+  Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>> y_v(y_addr + start, end - start);
+  Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>> o_v(output_addr + start, end - start);
+  o_v = (x_v == T(0)).select(o_v, x_v / y_v);
 }
 
 template <>
-void SameShapeTask(float *x_addr, float *y_addr, float *output_addr, size_t start, size_t end) {
-  Eigen::Map<Eigen::ArrayXf> x_v(x_addr + start, end - start);
-  Eigen::Map<Eigen::ArrayXf> y_v(y_addr + start, end - start);
-  Eigen::Map<Eigen::ArrayXf> o_v(output_addr + start, end - start);
-  o_v = (x_v == 0).select(o_v, x_v / y_v);
-}
-
-template <>
-void SameShapeTask(double *x_addr, double *y_addr, double *output_addr, size_t start, size_t end) {
-  Eigen::Map<Eigen::ArrayXd> x_v(x_addr + start, end - start);
-  Eigen::Map<Eigen::ArrayXd> y_v(y_addr + start, end - start);
-  Eigen::Map<Eigen::ArrayXd> o_v(output_addr + start, end - start);
-  o_v = (x_v == 0).select(o_v, x_v / y_v);
+void XDivySameShapeTask(float16 *x_addr, float16 *y_addr, float16 *output_addr, size_t start, size_t end) {
+  Eigen::half *ex_addr = reinterpret_cast<Eigen::half *>(x_addr);
+  Eigen::half *ey_addr = reinterpret_cast<Eigen::half *>(y_addr);
+  Eigen::half *eo_addr = reinterpret_cast<Eigen::half *>(output_addr);
+  Eigen::Map<Eigen::Array<Eigen::half, Eigen::Dynamic, 1>> x_v(ex_addr + start, end - start);
+  Eigen::Map<Eigen::Array<Eigen::half, Eigen::Dynamic, 1>> y_v(ey_addr + start, end - start);
+  Eigen::Map<Eigen::Array<Eigen::half, Eigen::Dynamic, 1>> o_v(eo_addr + start, end - start);
+  o_v = (x_v == Eigen::half(0)).select(o_v, x_v / y_v);
 }
 
 template <typename T>
@@ -107,14 +78,12 @@ bool XdivyCpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &inpu
   auto output_addr = static_cast<T *>(outputs[0]->addr);
   size_t output_size = outputs[0]->size / sizeof(T);
   auto sameShapeTask = [&x_addr, &y_addr, &output_addr](size_t start, size_t end) {
-    SameShapeTask(x_addr, y_addr, output_addr, start, end);
+    XDivySameShapeTask(x_addr, y_addr, output_addr, start, end);
   };
   auto diffShapeTask = [this, &x_addr, &y_addr, &output_addr](size_t start, size_t end) {
     for (size_t i = start; i < end; i++) {
-      auto idxX = index_listx_[i];
-      auto idxY = index_listy_[i];
-      auto dividend = x_addr[idxX];
-      auto divisor = y_addr[idxY];
+      auto dividend = x_addr[index_listx_[i]];
+      auto divisor = y_addr[index_listy_[i]];
       auto zero = static_cast<T>(0);
       if (divisor == zero) {
         if (dividend == zero) {
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/xlogy_cpu_kernel.cc b/mindspore/ccsrc/plugin/device/cpu/kernel/xlogy_cpu_kernel.cc
new file mode 100644
index 00000000000..bd07d5aad8a
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/xlogy_cpu_kernel.cc
@@ -0,0 +1,168 @@
+/**
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "plugin/device/cpu/kernel/xlogy_cpu_kernel.h"
+#include <cmath>
+#include <complex>
+#include <map>
+#include <vector>
+#include "plugin/device/cpu/hal/device/cpu_device_address.h"
+#include "Eigen/Eigen"
+
+namespace mindspore {
+namespace kernel {
+using complex64 = std::complex<float>;
+using complex128 = std::complex<double>;
+static constexpr size_t INPUT_NUM = 2;
+static constexpr size_t OUTPUT_NUM = 1;
+static constexpr int MAX_DIMS = 7;
+static constexpr size_t PARALLEL_THRESHOLD = 4096;
+
+template <typename T>
+void XlogySameShapeTask(T *x_addr, T *y_addr, T *output_addr, size_t start, size_t end) {
+  Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>> x_v(x_addr + start, end - start);
+  Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>> y_v(y_addr + start, end - start);
+  Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>> o_v(output_addr + start, end - start);
+  o_v = x_v * y_v.log();
+}
+
+template <>
+void XlogySameShapeTask(float16 *x_addr, float16 *y_addr, float16 *output_addr, size_t start, size_t end) {
+  Eigen::half *ex_addr = reinterpret_cast<Eigen::half *>(x_addr);
+  Eigen::half *ey_addr = reinterpret_cast<Eigen::half *>(y_addr);
+  Eigen::half *eo_addr = reinterpret_cast<Eigen::half *>(output_addr);
+  Eigen::Map<Eigen::Array<Eigen::half, Eigen::Dynamic, 1>> x_v(ex_addr + start, end - start);
+  Eigen::Map<Eigen::Array<Eigen::half, Eigen::Dynamic, 1>> y_v(ey_addr + start, end - start);
+  Eigen::Map<Eigen::Array<Eigen::half, Eigen::Dynamic, 1>> o_v(eo_addr + start, end - start);
+  o_v = x_v * y_v.log();
+}
+
+template <typename T>
+bool XlogyCpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &inputs,
+                                     const std::vector<kernel::AddressPtr> &,
+                                     const std::vector<kernel::AddressPtr> &outputs) {
+  CHECK_KERNEL_INPUTS_NUM(inputs.size(), INPUT_NUM, kernel_name_);
+  CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), OUTPUT_NUM, kernel_name_);
+  if (has_null_input_) {
+    return true;
+  }
+  auto x_addr = static_cast<T *>(inputs[0]->addr);
+  auto y_addr = static_cast<T *>(inputs[1]->addr);
+  auto output_addr = static_cast<T *>(outputs[0]->addr);
+  size_t output_size = outputs[0]->size / sizeof(T);
+  auto sameShapeTask = [&x_addr, &y_addr, &output_addr](size_t start, size_t end) {
+    XlogySameShapeTask(x_addr, y_addr, output_addr, start, end);
+  };
+  auto diffShapeTask = [this, &x_addr, &y_addr, &output_addr](size_t start, size_t end) {
+    for (size_t i = start; i < end; i++) {
+      auto x1 = x_addr[index_listx_[i]];
+      auto x2 = y_addr[index_listy_[i]];
+      auto logx2 = log(x2);
+      output_addr[i] = x1 * logx2;
+    }
+  };
+
+  CTask task = is_need_broadcast_ ? CTask(diffShapeTask) : CTask(sameShapeTask);
+  if (output_size < PARALLEL_THRESHOLD) {
+    task(0, output_size);
+  } else {
+    ParallelLaunch(task, output_size, PARALLEL_THRESHOLD, this, pool_);
+  }
+  return true;
+}
+
+bool XlogyCpuKernelMod::Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+                               const std::vector<AddressPtr> &outputs) {
+  return kernel_func_(this, inputs, workspace, outputs);
+}
+
+bool XlogyCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+                             const std::vector<KernelTensorPtr> &outputs) {
+  kernel_name_ = base_operator->name();
+  CHECK_KERNEL_INPUTS_NUM(inputs.size(), INPUT_NUM, kernel_name_);
+  CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), OUTPUT_NUM, kernel_name_);
+  auto x_type = inputs[0]->GetDtype();
+  auto y_type = inputs[1]->GetDtype();
+  auto out_type = outputs[0]->GetDtype();
+  if (!(x_type == y_type && x_type == out_type)) {
+    MS_LOG(ERROR) << "Xlogy needs the same input and output data types, but got X type:" << x_type
+                  << " Y type:" << y_type << " out type:" << out_type;
+    return false;
+  }
+  if (!MatchKernelFunc(base_operator, inputs, outputs)) {
+    return false;
+  }
+  return true;
+}
+
+int XlogyCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+                              const std::vector<KernelTensorPtr> &outputs,
+                              const std::map<uint32_t, tensor::TensorPtr> &) {
+  CHECK_KERNEL_INPUTS_NUM(inputs.size(), INPUT_NUM, kernel_name_);
+  CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), OUTPUT_NUM, kernel_name_);
+  ResetResource();
+  if (int ret = KernelMod::Resize(base_operator, inputs, outputs); ret != KRET_OK) {
+    return ret;
+  }
+
+  auto x_shape = LongVecToSizeVec(inputs.at(kIndex0)->GetShapeVector());
+  auto y_shape = LongVecToSizeVec(inputs.at(kIndex1)->GetShapeVector());
+
+  // when either input is a null (empty) tensor, the xlogy result is null too
+  has_null_input_ = CheckNullInput(x_shape);
+  has_null_input_ = has_null_input_ || CheckNullInput(y_shape);
+  if (has_null_input_) {
+    return 0;
+  }
+
+  auto out_shape = LongVecToSizeVec(outputs.at(kIndex0)->GetShapeVector());
+  if (out_shape.size() > MAX_DIMS || out_shape.size() < x_shape.size() || out_shape.size() < y_shape.size()) {
+    MS_LOG(EXCEPTION) << "For '" << kernel_name_ << "', the dimension of input cannot be greater than " << MAX_DIMS
+                      << ", and output dimension can't be less than input; but got x_shape dimension:"
+                      << x_shape.size() << ", y_shape dimension:" << y_shape.size()
+                      << ", out_shape dimension:" << out_shape.size();
+  }
+  is_need_broadcast_ = x_shape != y_shape;
+  if (is_need_broadcast_) {
+    GetBroadCastIndex(x_shape, out_shape, &index_listx_);
+    GetBroadCastIndex(y_shape, out_shape, &index_listy_);
+  }
+  return 0;
+}
+
+const std::vector<std::pair<KernelAttr, XlogyCpuKernelMod::KernelRunFunc>> &XlogyCpuKernelMod::GetFuncList() const {
+  static const std::vector<std::pair<KernelAttr, XlogyCpuKernelMod::KernelRunFunc>> func_list = {
+    {KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+     &XlogyCpuKernelMod::LaunchKernel<float16>},
+    {KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+     &XlogyCpuKernelMod::LaunchKernel<float>},
+    {KernelAttr().AddInputAttr(kNumberTypeFloat64).AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
+     &XlogyCpuKernelMod::LaunchKernel<double>},
+    {KernelAttr()
+       .AddInputAttr(kNumberTypeComplex64)
+       .AddInputAttr(kNumberTypeComplex64)
+       .AddOutputAttr(kNumberTypeComplex64),
+     &XlogyCpuKernelMod::LaunchKernel<complex64>},
+    {KernelAttr()
+       .AddInputAttr(kNumberTypeComplex128)
+       .AddInputAttr(kNumberTypeComplex128)
+       .AddOutputAttr(kNumberTypeComplex128),
+     &XlogyCpuKernelMod::LaunchKernel<complex128>},
+  };
+  return func_list;
+}
+
+MS_KERNEL_FACTORY_REG(NativeCpuKernelMod, Xlogy, XlogyCpuKernelMod);
+}  // namespace kernel
+}  // namespace mindspore
diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/xlogy_cpu_kernel.h b/mindspore/ccsrc/plugin/device/cpu/kernel/xlogy_cpu_kernel.h
new file mode 100644
index 00000000000..f75fe7d0bbc
--- /dev/null
+++ b/mindspore/ccsrc/plugin/device/cpu/kernel/xlogy_cpu_kernel.h
@@ -0,0 +1,65 @@
+/**
+ * Copyright 2023 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_XLOGY_CPU_KERNEL_H
+#define MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_XLOGY_CPU_KERNEL_H
+#include <map>
+#include <memory>
+#include <utility>
+#include <vector>
+#include "plugin/device/cpu/kernel/cpu_kernel.h"
+#include "plugin/factory/ms_factory.h"
+
+namespace mindspore {
+namespace kernel {
+class XlogyCpuKernelMod : public NativeCpuKernelMod, public MatchKernelHelper<XlogyCpuKernelMod> {
+ public:
+  XlogyCpuKernelMod() { ResetResource(); }
+  ~XlogyCpuKernelMod() override = default;
+
+  bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
+              const std::vector<AddressPtr> &outputs) override;
+
+  bool Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+            const std::vector<KernelTensorPtr> &outputs) override;
+
+  int Resize(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
+             const std::vector<KernelTensorPtr> &outputs, const std::map<uint32_t, tensor::TensorPtr> &) override;
+
+  const std::vector<std::pair<KernelAttr, KernelRunFunc>> &GetFuncList() const override;
+
+  std::vector<KernelAttr> GetOpSupport() override { return OpSupport(); }
+
+ protected:
+  void ResetResource() noexcept {
+    input_size_list_.clear();
+    output_size_list_.clear();
+    workspace_size_list_.clear();
+    index_listx_.clear();
+    index_listy_.clear();
+  }
+
+ private:
+  template <typename T>
+  bool LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &workspace,
+                    const std::vector<kernel::AddressPtr> &outputs);
+  std::vector<size_t> index_listx_{};
+  std::vector<size_t> index_listy_{};
+  bool is_need_broadcast_{false};
+  bool has_null_input_{false};
+};
+}  // namespace kernel
+}  // namespace mindspore
+#endif  // MINDSPORE_CCSRC_PLUGIN_DEVICE_CPU_KERNEL_XLOGY_CPU_KERNEL_H
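---
Reviewer note (sketch, not part of the patch): the speedup comes from two patterns visible above — mapping the raw kernel buffers as Eigen arrays so the element-wise math goes through Eigen's vectorized expression paths instead of a scalar loop, and only calling ParallelLaunch once the output exceeds the 4096-element threshold, so small tensors avoid thread-pool overhead. The self-contained sketch below illustrates the Eigen part under stated assumptions: XlogySketch/XdivySketch and the buffers in main are illustrative names, not MindSpore APIs, and it only assumes Eigen is on the include path.

#include <cstdio>
#include <Eigen/Dense>

// xlogy on equal-shape buffers: out = x * log(y), computed through Eigen
// maps over the raw pointers instead of an element-by-element loop.
template <typename T>
void XlogySketch(const T *x, const T *y, T *out, size_t n) {
  Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>> x_v(x, n);
  Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>> y_v(y, n);
  Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>> o_v(out, n);
  o_v = x_v * y_v.log();
}

// xdivy-style zero guard: keep the (pre-zeroed) output where x == 0,
// otherwise compute x / y -- the (x_v == T(0)).select(o_v, x_v / y_v)
// pattern used in the xdivy kernel above.
template <typename T>
void XdivySketch(const T *x, const T *y, T *out, size_t n) {
  Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>> x_v(x, n);
  Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>> y_v(y, n);
  Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>> o_v(out, n);
  o_v.setZero();
  o_v = (x_v == T(0)).select(o_v, x_v / y_v);
}

int main() {
  const double x[] = {0.0, 2.0, 4.0};
  const double y[] = {7.0, 1.0, 2.0};
  double xlogy_out[3];
  double xdivy_out[3];
  XlogySketch(x, y, xlogy_out, 3);  // {0*log(7), 2*log(1), 4*log(2)} = {0, 0, ~2.77}
  XdivySketch(x, y, xdivy_out, 3);  // {0 (guarded), 2/1, 4/2} = {0, 2, 2}
  for (int i = 0; i < 3; ++i) {
    std::printf("xlogy=%g xdivy=%g\n", xlogy_out[i], xdivy_out[i]);
  }
  return 0;
}

One design note: the select-based guard evaluates x / y for every element and then discards the guarded lanes, which is branch-free and SIMD-friendly; that is why the output is written first and reused as the "keep" operand of select, exactly as in XDivySameShapeTask.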