From 83c950922cfa1752c933f05e4ec4c58d70641403 Mon Sep 17 00:00:00 2001 From: zong-shuai Date: Mon, 6 Mar 2023 20:45:05 +0800 Subject: [PATCH] debug --- .../cpu/kernel/matrix_exp_cpu_kernel.cc | 195 +++++------------- .../device/cpu/kernel/matrix_exp_cpu_kernel.h | 26 ++- 2 files changed, 68 insertions(+), 153 deletions(-) diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/matrix_exp_cpu_kernel.cc b/mindspore/ccsrc/plugin/device/cpu/kernel/matrix_exp_cpu_kernel.cc index b7cd647d543..cd50c7f9bff 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/matrix_exp_cpu_kernel.cc +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/matrix_exp_cpu_kernel.cc @@ -23,7 +23,6 @@ namespace { constexpr uint32_t kMatrixExpInputsNum = 1; constexpr uint32_t kMatrixExpOutputsNum = 1; constexpr uint32_t kIndexTwo = 2; -constexpr int total_n_degs = 6; // Coefficients for computing taylor approximant of order 8. constexpr double sqrt_177 = 0.1330413469565007072504e+2, x3 = 2. / 3.; @@ -31,29 +30,14 @@ constexpr double x1 = x3 * ((1. + sqrt_177) / 88.), x2 = x3 * ((1. + sqrt_177) / constexpr double x4 = (-271. + 29. * sqrt_177) / (315. * x3), x5 = (-11. + 11. * sqrt_177) / (1260. * x3); constexpr double x6 = (-99. + 11. * sqrt_177) / (5040. * x3), x7 = (89. - sqrt_177) / (5040. * x3); constexpr double y2 = (857. - 58. * sqrt_177) / 630.; - -template -using array2d = std::array, ROW>; - -// Coefficients for computing taylor approximant of order 12. -constexpr int num_prods_12 = 4; -array2d b12 = { - {{9.0198e-16, 0.46932117595418237389, -0.20099424927047284052, -0.04623946134063071740}, - {5.31597895759871264183, 1.19926790417132231573, 0.01179296240992997031, 0.01108844528519167989}, - {0.18188869982170434744, 0.05502798439925399070, 0.09351590770535414968, 0.00610700528898058230}, - {-2.0861320e-13, -0.13181061013830184015, -0.02027855540589259079, -0.00675951846863086359}}}; - -// Coefficients for computing taylor approximant of order 18. -constexpr int num_prods_18 = 5; -array2d b18 = { - {{0., -1.00365581030144618291e-01, -8.02924648241156932449e-03, -8.92138498045729985177e-04, 0.}, - {0., 3.97849749499645077844e-01, 1.36783778460411720168e+00, 4.98289622525382669416e-01, - -6.37898194594723280150e-04}, - {-1.09676396052962061844e+01, 1.68015813878906206114e+00, 5.71779846478865511061e-02, -6.98210122488052056106e-03, - 3.34975017086070470649e-05}, - {-9.04316832390810593223e-02, -6.76404519071381882256e-02, 6.75961301770459654925e-02, 2.95552570429315521194e-02, - -1.39180257516060693404e-05}, - {0., 0., -9.23364619367118555360e-02, -1.69364939002081722752e-02, -1.40086798182036094347e-05}}}; +const std::vector> b18 = { + {0., -1.00365581030144618291e-01, -8.02924648241156932449e-03, -8.92138498045729985177e-04, 0.}, + {0., 3.97849749499645077844e-01, 1.36783778460411720168e+00, 4.98289622525382669416e-01, -6.37898194594723280150e-04}, + {-1.09676396052962061844e+01, 1.68015813878906206114e+00, 5.71779846478865511061e-02, -6.98210122488052056106e-03, + 3.34975017086070470649e-05}, + {-9.04316832390810593223e-02, -6.76404519071381882256e-02, 6.75961301770459654925e-02, 2.95552570429315521194e-02, + -1.39180257516060693404e-05}, + {0., 0., -9.23364619367118555360e-02, -1.69364939002081722752e-02, -1.40086798182036094347e-05}}; } // namespace bool MatrixExpCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector &inputs, @@ -80,86 +64,67 @@ int MatrixExpCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const st return 0; } -template -void MatrixExpCpuKernelMod::MTaylorApproximant(const Eigen::MatrixBase &A, - const Eigen::MatrixBase &I, int order, - Eigen::MatrixBase *E) const { - constexpr int exp_order_1 = 1, exp_order_2 = 2, exp_order_4 = 4, exp_order_8 = 8, exp_order_12 = 12; +template +void MatrixExpCpuKernelMod::MTaylorApproximantLow(const Eigen::MatrixBase &A, + const Eigen::MatrixBase &I, int order, + Eigen::MatrixBase *matrix_y) const { + constexpr int exp_order_1 = 1, exp_order_2 = 2, exp_order_4 = 4; auto A2 = A * A; - auto A3 = A * A2; if (order == exp_order_1) { - *E = I + A; + *matrix_y = I + A; } else if (order == exp_order_2) { constexpr int A2_divisor = 2; - *E = I + A + A2 / A2_divisor; + *matrix_y = I + A + A2 / A2_divisor; } else if (order == exp_order_4) { constexpr int I_divisor = 2, A_divisor = 6, A2_divisor = 24; - *E = I + A + A2 * (I / I_divisor + A / A_divisor + A2 / A2_divisor); - } else if (order == exp_order_8) { + *matrix_y = I + A + A2 * (I / I_divisor + A / A_divisor + A2 / A2_divisor); + } else { auto A4 = A2 * (x1 * A + x2 * A2); auto A8 = (x3 * A2 + A4) * (x4 * I + x5 * A + x6 * A2 + x7 * A4); - *E = I + A + y2 * A2 + A8; - } else if (order == exp_order_12) { - auto q31 = b12[0][0] * I + b12[0][1] * A + b12[0][2] * A2 + b12[0][3] * A3; - auto q32 = b12[1][0] * I + b12[1][1] * A + b12[1][2] * A2 + b12[1][3] * A3; - auto q33 = b12[2][0] * I + b12[2][1] * A + b12[2][2] * A2 + b12[2][3] * A3; - auto q34 = b12[3][0] * I + b12[3][1] * A + b12[3][2] * A2 + b12[3][3] * A3; - auto q61 = q33 + q34 * q34; - *E = q31 + (q32 + q61) * q61; - } else { - auto A6 = A3 * A3; - auto q31 = b18[0][0] * I + b18[0][1] * A + b18[0][2] * A2 + b18[0][3] * A3 + b18[0][4] * A6; - auto q61 = b18[1][0] * I + b18[1][1] * A + b18[1][2] * A2 + b18[1][3] * A3 + b18[1][4] * A6; - auto q62 = b18[2][0] * I + b18[2][1] * A + b18[2][2] * A2 + b18[2][3] * A3 + b18[2][4] * A6; - auto q63 = b18[3][0] * I + b18[3][1] * A + b18[3][2] * A2 + b18[3][3] * A3 + b18[3][4] * A6; - auto q64 = b18[4][0] * I + b18[4][1] * A + b18[4][2] * A2 + b18[4][3] * A3 + b18[4][4] * A6; - auto q91 = q31 * q64 + q63; - *E = q61 + (q62 + q91) * q91; + *matrix_y = I + A + y2 * A2 + A8; } } +template +void MatrixExpCpuKernelMod::MTaylorApproximantHigh(const Eigen::MatrixBase &A_scaled, + const Eigen::MatrixBase &I, + Eigen::MatrixBase *matrix_y) const { + auto A2 = A_scaled * A_scaled; + auto A3 = A_scaled * A2; + auto A6 = A3 * A3; + auto q31 = b18[0][0] * I + b18[0][1] * A_scaled + b18[0][2] * A2 + b18[0][3] * A3 + b18[0][4] * A6; + auto q61 = b18[1][0] * I + b18[1][1] * A_scaled + b18[1][2] * A2 + b18[1][3] * A3 + b18[1][4] * A6; + auto q62 = b18[2][0] * I + b18[2][1] * A_scaled + b18[2][2] * A2 + b18[2][3] * A3 + b18[2][4] * A6; + auto q63 = b18[3][0] * I + b18[3][1] * A_scaled + b18[3][2] * A2 + b18[3][3] * A3 + b18[3][4] * A6; + auto q64 = b18[4][0] * I + b18[4][1] * A_scaled + b18[4][2] * A2 + b18[4][3] * A3 + b18[4][4] * A6; + auto q91 = q31 * q64 + q63; + *matrix_y = q61 + (q62 + q91) * q91; +} -template -void MatrixExpCpuKernelMod::MexpImpl(const Eigen::MatrixBase &A, const Eigen::MatrixBase &I, - Eigen::MatrixBase *mexp) const { +template +void MatrixExpCpuKernelMod::MexpImpl(const Eigen::MatrixBase &A, const Eigen::MatrixBase &I, + Eigen::MatrixBase *matrix_y) const { const auto norm = A.cwiseAbs().colwise().sum().maxCoeff(); - constexpr std::array m_vals = {1, 2, 4, 8, 12, 18}; - constexpr int cut_deg = 2; - int64_t s = -1; + std::vector m_vals = {1, 2, 4, 8}; + std::vector thetas; if (data_type_ == kNumberTypeFloat16 || data_type_ == kNumberTypeFloat || data_type_ == kNumberTypeComplex64) { - constexpr std::array thetas_float = {1.192092800768788e-07, 5.978858893805233e-04, - 5.116619363445086e-02, 5.800524627688768e-01, - 1.461661507209034e+00, 3.010066362817634e+00}; - for (int i = 0; i < total_n_degs - 1; i++) { - if (norm <= thetas_float[i]) { - MTaylorApproximant(A, I, m_vals[i], mexp); - break; - } - } - if (norm >= thetas_float[total_n_degs - cut_deg]) { - s = ceil(log2(norm / thetas_float[total_n_degs - 1])); - } + thetas = {1.192092800768788e-07, 5.978858893805233e-04, 5.116619363445086e-02, 5.800524627688768e-01, + 3.010066362817634e+00}; } else { - constexpr std::array thetas_double = {2.220446049250313e-16, 2.580956802971767e-08, - 3.397168839976962e-04, 4.991228871115323e-02, - 2.996158913811580e-01, 1.090863719290036e+00}; - for (int i = 0; i < total_n_degs - 1; i++) { - if (norm <= thetas_double[i]) { - MTaylorApproximant(A, I, m_vals[i], mexp); - break; - } - } - if (norm >= thetas_double[total_n_degs - cut_deg]) { - s = ceil(log2(norm / thetas_double[total_n_degs - 1])); + thetas = {2.220446049250313e-16, 2.580956802971767e-08, 3.397168839976962e-04, 4.991228871115323e-02, + 1.090863719290036e+00}; + } + for (size_t i = 0; i < thetas.size() - 1; i++) { + if (norm <= thetas[i]) { + MTaylorApproximantLow(A, I, m_vals[i], matrix_y); + return; } } - if (s <= 0) { - s = 0; - } + int64_t s = ceil(log2(norm / thetas.back())) > 0 ? ceil(log2(norm / thetas.back())) : 0; const auto pow2s = pow(2, s); const auto A_scaled = A / pow2s; - MTaylorApproximant(A_scaled, I, m_vals[total_n_degs - 1], mexp); + MTaylorApproximantHigh(A_scaled, I, matrix_y); for (int k = 0; k < s; k++) { - *mexp = (*mexp) * (*mexp); + *matrix_y *= (*matrix_y); } } @@ -173,17 +138,17 @@ bool MatrixExpCpuKernelMod::LaunchKernel(const std::vector & auto output_y = reinterpret_cast(outputs[0]->addr); int64_t m = SizeToLong(*(input_shape_.end() - 1)); int64_t size_mm = m * m; - typedef Eigen::Matrix MatrixXd; - MatrixXd I(m, m); + MatrixXd I(m, m); + Eigen::Map> map_I(I.data(), m, m); (void)I.setIdentity(); int64_t total = SizeToLong(inputs[0]->size / sizeof(T)); int64_t matrix_num = total / size_mm; - auto task = [this, &input_x, &output_y, &I, m](size_t start, size_t end) { + auto task = [this, &input_x, &output_y, &map_I, m](size_t start, size_t end) { for (size_t i = start; i < end; i++) { - Eigen::Map matrix_x(input_x + i * m * m, m, m); - Eigen::Map matrix_y(output_y + i * m * m, m, m); + Eigen::Map> matrix_x(input_x + i * m * m, m, m); + Eigen::Map> matrix_y(output_y + i * m * m, m, m); if (matrix_x.size() > 0) { - MexpImpl(matrix_x, I, &matrix_y); + MexpImpl(matrix_x, map_I, &matrix_y); } } }; @@ -191,57 +156,9 @@ bool MatrixExpCpuKernelMod::LaunchKernel(const std::vector & return true; } -void MatrixExpCpuKernelMod::TyepChangeForFp16(int64_t i, int64_t m, int64_t size_mm, const mindspore::Float16 *input_x, - mindspore::Float16 *output_y) const { - typedef Eigen::Matrix MatrixXd; - MatrixXd I(m, m); - (void)I.setIdentity(); - MatrixXd matrix_x(m, m); - MatrixXd matrix_y(m, m); - for (int p = 0; p < m; p++) { - for (int q = 0; q < m; q++) { - matrix_x(p, q) = static_cast(input_x[i * size_mm + p * m + q]); - } - } - if (matrix_x.size() > 0) { - MexpImpl(matrix_x, I, &matrix_y); - } - for (int p = 0; p < m; p++) { - for (int q = 0; q < m; q++) { - output_y[i * size_mm + p * m + q] = static_cast(matrix_y(p, q)); - } - } -} - -template -bool MatrixExpCpuKernelMod::LaunchKernelFP16(const std::vector &inputs, - const std::vector &workspace, - const std::vector &outputs) { - CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMatrixExpInputsNum, kernel_name_); - CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMatrixExpOutputsNum, kernel_name_); - auto input_x = reinterpret_cast(inputs[0]->addr); - auto output_y = reinterpret_cast(outputs[0]->addr); - int64_t m = SizeToLong(*(input_shape_.end() - 1)); - int64_t size_mm = m * m; - typedef Eigen::Matrix MatrixXd; - MatrixXd I(m, m); - (void)I.setIdentity(); - int64_t total = SizeToLong(inputs[0]->size / sizeof(T)); - int64_t matrix_num = total / size_mm; - auto task = [this, &input_x, &output_y, m, size_mm](size_t start, size_t end) { - for (size_t i = start; i < end; i++) { - TyepChangeForFp16(i, m, size_mm, input_x, output_y); - } - }; - ParallelLaunchAutoSearch(task, matrix_num, this, ¶llel_search_info_); - return true; -} - const std::vector> &MatrixExpCpuKernelMod::GetFuncList() const { static const std::vector> func_list = { - {KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16), - &MatrixExpCpuKernelMod::LaunchKernelFP16}, {KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), &MatrixExpCpuKernelMod::LaunchKernel}, {KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64), diff --git a/mindspore/ccsrc/plugin/device/cpu/kernel/matrix_exp_cpu_kernel.h b/mindspore/ccsrc/plugin/device/cpu/kernel/matrix_exp_cpu_kernel.h index e0b149f0148..56af41565ce 100644 --- a/mindspore/ccsrc/plugin/device/cpu/kernel/matrix_exp_cpu_kernel.h +++ b/mindspore/ccsrc/plugin/device/cpu/kernel/matrix_exp_cpu_kernel.h @@ -30,6 +30,9 @@ namespace mindspore { namespace kernel { +template +using MatrixXd = Eigen::Matrix; + class MatrixExpCpuKernelMod : public NativeCpuKernelMod, public MatchKernelHelper { public: MatrixExpCpuKernelMod() = default; @@ -52,25 +55,20 @@ class MatrixExpCpuKernelMod : public NativeCpuKernelMod, public MatchKernelHelpe std::vector GetOpSupport() override { return OpSupport(); } private: - template - void MTaylorApproximant(const Eigen::MatrixBase &A, const Eigen::MatrixBase &I, int order, - Eigen::MatrixBase *E) const; - - template - void MexpImpl(const Eigen::MatrixBase &A, const Eigen::MatrixBase &I, - Eigen::MatrixBase *mexp) const; + template + void MTaylorApproximantLow(const Eigen::MatrixBase &A, const Eigen::MatrixBase &I, int order, + Eigen::MatrixBase *matrix_y) const; + template + void MTaylorApproximantHigh(const Eigen::MatrixBase &A_scaled, const Eigen::MatrixBase &I, + Eigen::MatrixBase *matrix_y) const; + template + void MexpImpl(const Eigen::MatrixBase &A, const Eigen::MatrixBase &I, + Eigen::MatrixBase *matrix_y) const; template bool LaunchKernel(const std::vector &inputs, const std::vector &workspace, const std::vector &outputs); - void TyepChangeForFp16(int64_t i, int64_t m, int64_t size_mm, const mindspore::Float16 *input_x, - mindspore::Float16 *output_y) const; - - template - bool LaunchKernelFP16(const std::vector &inputs, const std::vector &workspace, - const std::vector &outputs); - std::vector input_shape_; TypeId data_type_; };