forked from mindspore-Ecosystem/mindspore
!49895 improve the speed of compile matrix_exp
Merge pull request !49895 from zong_shuai/compile_matrix_exp
This commit is contained in:
commit
1b6faf50f9
|
@ -23,7 +23,6 @@ namespace {
|
|||
constexpr uint32_t kMatrixExpInputsNum = 1;
|
||||
constexpr uint32_t kMatrixExpOutputsNum = 1;
|
||||
constexpr uint32_t kIndexTwo = 2;
|
||||
constexpr int total_n_degs = 6;
|
||||
|
||||
// Coefficients for computing taylor approximant of order 8.
|
||||
constexpr double sqrt_177 = 0.1330413469565007072504e+2, x3 = 2. / 3.;
|
||||
|
@ -31,29 +30,14 @@ constexpr double x1 = x3 * ((1. + sqrt_177) / 88.), x2 = x3 * ((1. + sqrt_177) /
|
|||
constexpr double x4 = (-271. + 29. * sqrt_177) / (315. * x3), x5 = (-11. + 11. * sqrt_177) / (1260. * x3);
|
||||
constexpr double x6 = (-99. + 11. * sqrt_177) / (5040. * x3), x7 = (89. - sqrt_177) / (5040. * x3);
|
||||
constexpr double y2 = (857. - 58. * sqrt_177) / 630.;
|
||||
|
||||
template <typename T, int ROW, int COL>
|
||||
using array2d = std::array<std::array<T, COL>, ROW>;
|
||||
|
||||
// Coefficients for computing taylor approximant of order 12.
|
||||
constexpr int num_prods_12 = 4;
|
||||
array2d<double, num_prods_12, num_prods_12> b12 = {
|
||||
{{9.0198e-16, 0.46932117595418237389, -0.20099424927047284052, -0.04623946134063071740},
|
||||
{5.31597895759871264183, 1.19926790417132231573, 0.01179296240992997031, 0.01108844528519167989},
|
||||
{0.18188869982170434744, 0.05502798439925399070, 0.09351590770535414968, 0.00610700528898058230},
|
||||
{-2.0861320e-13, -0.13181061013830184015, -0.02027855540589259079, -0.00675951846863086359}}};
|
||||
|
||||
// Coefficients for computing taylor approximant of order 18.
|
||||
constexpr int num_prods_18 = 5;
|
||||
array2d<double, num_prods_18, num_prods_18> b18 = {
|
||||
{{0., -1.00365581030144618291e-01, -8.02924648241156932449e-03, -8.92138498045729985177e-04, 0.},
|
||||
{0., 3.97849749499645077844e-01, 1.36783778460411720168e+00, 4.98289622525382669416e-01,
|
||||
-6.37898194594723280150e-04},
|
||||
{-1.09676396052962061844e+01, 1.68015813878906206114e+00, 5.71779846478865511061e-02, -6.98210122488052056106e-03,
|
||||
3.34975017086070470649e-05},
|
||||
{-9.04316832390810593223e-02, -6.76404519071381882256e-02, 6.75961301770459654925e-02, 2.95552570429315521194e-02,
|
||||
-1.39180257516060693404e-05},
|
||||
{0., 0., -9.23364619367118555360e-02, -1.69364939002081722752e-02, -1.40086798182036094347e-05}}};
|
||||
const std::vector<std::vector<double>> b18 = {
|
||||
{0., -1.00365581030144618291e-01, -8.02924648241156932449e-03, -8.92138498045729985177e-04, 0.},
|
||||
{0., 3.97849749499645077844e-01, 1.36783778460411720168e+00, 4.98289622525382669416e-01, -6.37898194594723280150e-04},
|
||||
{-1.09676396052962061844e+01, 1.68015813878906206114e+00, 5.71779846478865511061e-02, -6.98210122488052056106e-03,
|
||||
3.34975017086070470649e-05},
|
||||
{-9.04316832390810593223e-02, -6.76404519071381882256e-02, 6.75961301770459654925e-02, 2.95552570429315521194e-02,
|
||||
-1.39180257516060693404e-05},
|
||||
{0., 0., -9.23364619367118555360e-02, -1.69364939002081722752e-02, -1.40086798182036094347e-05}};
|
||||
} // namespace
|
||||
|
||||
bool MatrixExpCpuKernelMod::Init(const BaseOperatorPtr &base_operator, const std::vector<KernelTensorPtr> &inputs,
|
||||
|
@ -80,86 +64,67 @@ int MatrixExpCpuKernelMod::Resize(const BaseOperatorPtr &base_operator, const st
|
|||
return 0;
|
||||
}
|
||||
|
||||
template <typename Derived1, typename Derived2, typename Derived3>
|
||||
void MatrixExpCpuKernelMod::MTaylorApproximant(const Eigen::MatrixBase<Derived1> &A,
|
||||
const Eigen::MatrixBase<Derived2> &I, int order,
|
||||
Eigen::MatrixBase<Derived3> *E) const {
|
||||
constexpr int exp_order_1 = 1, exp_order_2 = 2, exp_order_4 = 4, exp_order_8 = 8, exp_order_12 = 12;
|
||||
template <typename Derived>
|
||||
void MatrixExpCpuKernelMod::MTaylorApproximantLow(const Eigen::MatrixBase<Derived> &A,
|
||||
const Eigen::MatrixBase<Derived> &I, int order,
|
||||
Eigen::MatrixBase<Derived> *matrix_y) const {
|
||||
constexpr int exp_order_1 = 1, exp_order_2 = 2, exp_order_4 = 4;
|
||||
auto A2 = A * A;
|
||||
auto A3 = A * A2;
|
||||
if (order == exp_order_1) {
|
||||
*E = I + A;
|
||||
*matrix_y = I + A;
|
||||
} else if (order == exp_order_2) {
|
||||
constexpr int A2_divisor = 2;
|
||||
*E = I + A + A2 / A2_divisor;
|
||||
*matrix_y = I + A + A2 / A2_divisor;
|
||||
} else if (order == exp_order_4) {
|
||||
constexpr int I_divisor = 2, A_divisor = 6, A2_divisor = 24;
|
||||
*E = I + A + A2 * (I / I_divisor + A / A_divisor + A2 / A2_divisor);
|
||||
} else if (order == exp_order_8) {
|
||||
*matrix_y = I + A + A2 * (I / I_divisor + A / A_divisor + A2 / A2_divisor);
|
||||
} else {
|
||||
auto A4 = A2 * (x1 * A + x2 * A2);
|
||||
auto A8 = (x3 * A2 + A4) * (x4 * I + x5 * A + x6 * A2 + x7 * A4);
|
||||
*E = I + A + y2 * A2 + A8;
|
||||
} else if (order == exp_order_12) {
|
||||
auto q31 = b12[0][0] * I + b12[0][1] * A + b12[0][2] * A2 + b12[0][3] * A3;
|
||||
auto q32 = b12[1][0] * I + b12[1][1] * A + b12[1][2] * A2 + b12[1][3] * A3;
|
||||
auto q33 = b12[2][0] * I + b12[2][1] * A + b12[2][2] * A2 + b12[2][3] * A3;
|
||||
auto q34 = b12[3][0] * I + b12[3][1] * A + b12[3][2] * A2 + b12[3][3] * A3;
|
||||
auto q61 = q33 + q34 * q34;
|
||||
*E = q31 + (q32 + q61) * q61;
|
||||
} else {
|
||||
auto A6 = A3 * A3;
|
||||
auto q31 = b18[0][0] * I + b18[0][1] * A + b18[0][2] * A2 + b18[0][3] * A3 + b18[0][4] * A6;
|
||||
auto q61 = b18[1][0] * I + b18[1][1] * A + b18[1][2] * A2 + b18[1][3] * A3 + b18[1][4] * A6;
|
||||
auto q62 = b18[2][0] * I + b18[2][1] * A + b18[2][2] * A2 + b18[2][3] * A3 + b18[2][4] * A6;
|
||||
auto q63 = b18[3][0] * I + b18[3][1] * A + b18[3][2] * A2 + b18[3][3] * A3 + b18[3][4] * A6;
|
||||
auto q64 = b18[4][0] * I + b18[4][1] * A + b18[4][2] * A2 + b18[4][3] * A3 + b18[4][4] * A6;
|
||||
auto q91 = q31 * q64 + q63;
|
||||
*E = q61 + (q62 + q91) * q91;
|
||||
*matrix_y = I + A + y2 * A2 + A8;
|
||||
}
|
||||
}
|
||||
template <typename Derived, typename Derived1>
|
||||
void MatrixExpCpuKernelMod::MTaylorApproximantHigh(const Eigen::MatrixBase<Derived1> &A_scaled,
|
||||
const Eigen::MatrixBase<Derived> &I,
|
||||
Eigen::MatrixBase<Derived> *matrix_y) const {
|
||||
auto A2 = A_scaled * A_scaled;
|
||||
auto A3 = A_scaled * A2;
|
||||
auto A6 = A3 * A3;
|
||||
auto q31 = b18[0][0] * I + b18[0][1] * A_scaled + b18[0][2] * A2 + b18[0][3] * A3 + b18[0][4] * A6;
|
||||
auto q61 = b18[1][0] * I + b18[1][1] * A_scaled + b18[1][2] * A2 + b18[1][3] * A3 + b18[1][4] * A6;
|
||||
auto q62 = b18[2][0] * I + b18[2][1] * A_scaled + b18[2][2] * A2 + b18[2][3] * A3 + b18[2][4] * A6;
|
||||
auto q63 = b18[3][0] * I + b18[3][1] * A_scaled + b18[3][2] * A2 + b18[3][3] * A3 + b18[3][4] * A6;
|
||||
auto q64 = b18[4][0] * I + b18[4][1] * A_scaled + b18[4][2] * A2 + b18[4][3] * A3 + b18[4][4] * A6;
|
||||
auto q91 = q31 * q64 + q63;
|
||||
*matrix_y = q61 + (q62 + q91) * q91;
|
||||
}
|
||||
|
||||
template <typename Derived1, typename Derived2>
|
||||
void MatrixExpCpuKernelMod::MexpImpl(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I,
|
||||
Eigen::MatrixBase<Derived1> *mexp) const {
|
||||
template <typename Derived>
|
||||
void MatrixExpCpuKernelMod::MexpImpl(const Eigen::MatrixBase<Derived> &A, const Eigen::MatrixBase<Derived> &I,
|
||||
Eigen::MatrixBase<Derived> *matrix_y) const {
|
||||
const auto norm = A.cwiseAbs().colwise().sum().maxCoeff();
|
||||
constexpr std::array<int, total_n_degs> m_vals = {1, 2, 4, 8, 12, 18};
|
||||
constexpr int cut_deg = 2;
|
||||
int64_t s = -1;
|
||||
std::vector<int> m_vals = {1, 2, 4, 8};
|
||||
std::vector<double> thetas;
|
||||
if (data_type_ == kNumberTypeFloat16 || data_type_ == kNumberTypeFloat || data_type_ == kNumberTypeComplex64) {
|
||||
constexpr std::array<float, total_n_degs> thetas_float = {1.192092800768788e-07, 5.978858893805233e-04,
|
||||
5.116619363445086e-02, 5.800524627688768e-01,
|
||||
1.461661507209034e+00, 3.010066362817634e+00};
|
||||
for (int i = 0; i < total_n_degs - 1; i++) {
|
||||
if (norm <= thetas_float[i]) {
|
||||
MTaylorApproximant(A, I, m_vals[i], mexp);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (norm >= thetas_float[total_n_degs - cut_deg]) {
|
||||
s = ceil(log2(norm / thetas_float[total_n_degs - 1]));
|
||||
}
|
||||
thetas = {1.192092800768788e-07, 5.978858893805233e-04, 5.116619363445086e-02, 5.800524627688768e-01,
|
||||
3.010066362817634e+00};
|
||||
} else {
|
||||
constexpr std::array<double, total_n_degs> thetas_double = {2.220446049250313e-16, 2.580956802971767e-08,
|
||||
3.397168839976962e-04, 4.991228871115323e-02,
|
||||
2.996158913811580e-01, 1.090863719290036e+00};
|
||||
for (int i = 0; i < total_n_degs - 1; i++) {
|
||||
if (norm <= thetas_double[i]) {
|
||||
MTaylorApproximant(A, I, m_vals[i], mexp);
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (norm >= thetas_double[total_n_degs - cut_deg]) {
|
||||
s = ceil(log2(norm / thetas_double[total_n_degs - 1]));
|
||||
thetas = {2.220446049250313e-16, 2.580956802971767e-08, 3.397168839976962e-04, 4.991228871115323e-02,
|
||||
1.090863719290036e+00};
|
||||
}
|
||||
for (size_t i = 0; i < thetas.size() - 1; i++) {
|
||||
if (norm <= thetas[i]) {
|
||||
MTaylorApproximantLow(A, I, m_vals[i], matrix_y);
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (s <= 0) {
|
||||
s = 0;
|
||||
}
|
||||
int64_t s = ceil(log2(norm / thetas.back())) > 0 ? ceil(log2(norm / thetas.back())) : 0;
|
||||
const auto pow2s = pow(2, s);
|
||||
const auto A_scaled = A / pow2s;
|
||||
MTaylorApproximant(A_scaled, I, m_vals[total_n_degs - 1], mexp);
|
||||
MTaylorApproximantHigh(A_scaled, I, matrix_y);
|
||||
for (int k = 0; k < s; k++) {
|
||||
*mexp = (*mexp) * (*mexp);
|
||||
*matrix_y *= (*matrix_y);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -173,17 +138,17 @@ bool MatrixExpCpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &
|
|||
auto output_y = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
int64_t m = SizeToLong(*(input_shape_.end() - 1));
|
||||
int64_t size_mm = m * m;
|
||||
typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
|
||||
MatrixXd I(m, m);
|
||||
MatrixXd<T> I(m, m);
|
||||
Eigen::Map<MatrixXd<T>> map_I(I.data(), m, m);
|
||||
(void)I.setIdentity();
|
||||
int64_t total = SizeToLong(inputs[0]->size / sizeof(T));
|
||||
int64_t matrix_num = total / size_mm;
|
||||
auto task = [this, &input_x, &output_y, &I, m](size_t start, size_t end) {
|
||||
auto task = [this, &input_x, &output_y, &map_I, m](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
Eigen::Map<MatrixXd> matrix_x(input_x + i * m * m, m, m);
|
||||
Eigen::Map<MatrixXd> matrix_y(output_y + i * m * m, m, m);
|
||||
Eigen::Map<MatrixXd<T>> matrix_x(input_x + i * m * m, m, m);
|
||||
Eigen::Map<MatrixXd<T>> matrix_y(output_y + i * m * m, m, m);
|
||||
if (matrix_x.size() > 0) {
|
||||
MexpImpl(matrix_x, I, &matrix_y);
|
||||
MexpImpl(matrix_x, map_I, &matrix_y);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -191,57 +156,9 @@ bool MatrixExpCpuKernelMod::LaunchKernel(const std::vector<kernel::AddressPtr> &
|
|||
return true;
|
||||
}
|
||||
|
||||
void MatrixExpCpuKernelMod::TyepChangeForFp16(int64_t i, int64_t m, int64_t size_mm, const mindspore::Float16 *input_x,
|
||||
mindspore::Float16 *output_y) const {
|
||||
typedef Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
|
||||
MatrixXd I(m, m);
|
||||
(void)I.setIdentity();
|
||||
MatrixXd matrix_x(m, m);
|
||||
MatrixXd matrix_y(m, m);
|
||||
for (int p = 0; p < m; p++) {
|
||||
for (int q = 0; q < m; q++) {
|
||||
matrix_x(p, q) = static_cast<float>(input_x[i * size_mm + p * m + q]);
|
||||
}
|
||||
}
|
||||
if (matrix_x.size() > 0) {
|
||||
MexpImpl(matrix_x, I, &matrix_y);
|
||||
}
|
||||
for (int p = 0; p < m; p++) {
|
||||
for (int q = 0; q < m; q++) {
|
||||
output_y[i * size_mm + p * m + q] = static_cast<float16>(matrix_y(p, q));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
bool MatrixExpCpuKernelMod::LaunchKernelFP16(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> &workspace,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMatrixExpInputsNum, kernel_name_);
|
||||
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMatrixExpOutputsNum, kernel_name_);
|
||||
auto input_x = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto output_y = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
int64_t m = SizeToLong(*(input_shape_.end() - 1));
|
||||
int64_t size_mm = m * m;
|
||||
typedef Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor> MatrixXd;
|
||||
MatrixXd I(m, m);
|
||||
(void)I.setIdentity();
|
||||
int64_t total = SizeToLong(inputs[0]->size / sizeof(T));
|
||||
int64_t matrix_num = total / size_mm;
|
||||
auto task = [this, &input_x, &output_y, m, size_mm](size_t start, size_t end) {
|
||||
for (size_t i = start; i < end; i++) {
|
||||
TyepChangeForFp16(i, m, size_mm, input_x, output_y);
|
||||
}
|
||||
};
|
||||
ParallelLaunchAutoSearch(task, matrix_num, this, ¶llel_search_info_);
|
||||
return true;
|
||||
}
|
||||
|
||||
const std::vector<std::pair<KernelAttr, MatrixExpCpuKernelMod::KernelRunFunc>> &MatrixExpCpuKernelMod::GetFuncList()
|
||||
const {
|
||||
static const std::vector<std::pair<KernelAttr, MatrixExpCpuKernelMod::KernelRunFunc>> func_list = {
|
||||
{KernelAttr().AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
|
||||
&MatrixExpCpuKernelMod::LaunchKernelFP16<float16>},
|
||||
{KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
|
||||
&MatrixExpCpuKernelMod::LaunchKernel<float>},
|
||||
{KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
|
||||
|
|
|
@ -30,6 +30,9 @@
|
|||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
template <typename T>
|
||||
using MatrixXd = Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
|
||||
|
||||
class MatrixExpCpuKernelMod : public NativeCpuKernelMod, public MatchKernelHelper<MatrixExpCpuKernelMod> {
|
||||
public:
|
||||
MatrixExpCpuKernelMod() = default;
|
||||
|
@ -52,25 +55,20 @@ class MatrixExpCpuKernelMod : public NativeCpuKernelMod, public MatchKernelHelpe
|
|||
std::vector<KernelAttr> GetOpSupport() override { return OpSupport(); }
|
||||
|
||||
private:
|
||||
template <typename Derived1, typename Derived2, typename Derived3>
|
||||
void MTaylorApproximant(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I, int order,
|
||||
Eigen::MatrixBase<Derived3> *E) const;
|
||||
|
||||
template <typename Derived1, typename Derived2>
|
||||
void MexpImpl(const Eigen::MatrixBase<Derived1> &A, const Eigen::MatrixBase<Derived2> &I,
|
||||
Eigen::MatrixBase<Derived1> *mexp) const;
|
||||
template <typename Derived>
|
||||
void MTaylorApproximantLow(const Eigen::MatrixBase<Derived> &A, const Eigen::MatrixBase<Derived> &I, int order,
|
||||
Eigen::MatrixBase<Derived> *matrix_y) const;
|
||||
template <typename Derived, typename Derived1>
|
||||
void MTaylorApproximantHigh(const Eigen::MatrixBase<Derived1> &A_scaled, const Eigen::MatrixBase<Derived> &I,
|
||||
Eigen::MatrixBase<Derived> *matrix_y) const;
|
||||
template <typename Derived>
|
||||
void MexpImpl(const Eigen::MatrixBase<Derived> &A, const Eigen::MatrixBase<Derived> &I,
|
||||
Eigen::MatrixBase<Derived> *matrix_y) const;
|
||||
|
||||
template <typename T>
|
||||
bool LaunchKernel(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &workspace,
|
||||
const std::vector<kernel::AddressPtr> &outputs);
|
||||
|
||||
void TyepChangeForFp16(int64_t i, int64_t m, int64_t size_mm, const mindspore::Float16 *input_x,
|
||||
mindspore::Float16 *output_y) const;
|
||||
|
||||
template <typename T>
|
||||
bool LaunchKernelFP16(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &workspace,
|
||||
const std::vector<kernel::AddressPtr> &outputs);
|
||||
|
||||
std::vector<size_t> input_shape_;
|
||||
TypeId data_type_;
|
||||
};
|
||||
|
|
Loading…
Reference in New Issue