forked from mindspore-Ecosystem/mindspore

!26546 Unify GPU/CPU ops input/output (col/row major), modify related testcases, add linalg function and testcases

Merge pull request !26546 from wuwenbing/master

commit 69c4f470e4
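The user-facing addition in this change is a SciPy-style eigh in mindspore.scipy.linalg (see the linalg.py diff below). A minimal usage sketch, adapted from the docstring example added in this commit; the explicit float32 cast is an illustrative assumption:

import numpy as onp
from mindspore.common import Tensor
from mindspore.scipy.linalg import eigh

# 4 x 4 real symmetric matrix taken from the new docstring example.
a = onp.array([[6, 3, 1, 5], [3, 0, 5, 1], [1, 5, 6, 2], [5, 1, 2, 2]]).astype(onp.float32)
w, v = eigh(Tensor(a))

# Each column of v is an eigenvector: A @ v == v @ diag(w).
print(onp.allclose(a @ v.asnumpy() - v.asnumpy() @ onp.diag(w.asnumpy()), onp.zeros((4, 4))))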
@@ -24,8 +24,6 @@ namespace kernel {
namespace {
constexpr size_t kInputsNum = 1;
constexpr size_t kOutputsNum = 2;
constexpr size_t kDefaultShape = 1;
constexpr auto kAMatrixDimNum = 2;

} // namespace
using Eigen::Dynamic;

@@ -45,12 +43,8 @@ using ComplexMatrixSquare = Eigen::Matrix<std::complex<T>, Dynamic, Dynamic, Row
template <typename T, typename C>
void EigCPUKernel<T, C>::InitKernel(const CNodePtr &kernel_node) {
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);

compute_eigen_vectors = AnfAlgo::GetNodeAttr<bool>(kernel_node, C_EIEH_VECTOR);

auto A_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
CHECK_KERNEL_INPUTS_NUM(A_shape.size(), kAMatrixDimNum, AnfAlgo::GetCNodeName(kernel_node));

if (A_shape.size() != kShape2dDims || A_shape[0] != A_shape[1]) {
MS_LOG(EXCEPTION) << "wrong array shape, A should be a matrix, but got [" << A_shape[0] << " X " << A_shape[1]
                  << "]";
@@ -22,10 +22,8 @@ namespace mindspore {
namespace kernel {

namespace {
constexpr size_t kInputsNum = 2;
constexpr size_t kInputsNum = 1;
constexpr size_t kOutputsNum = 2;
constexpr size_t kDefaultShape = 1;
constexpr auto kAMatrixDimNum = 2;

} // namespace
using Eigen::Dynamic;

@@ -45,12 +43,9 @@ using ComplexMatrixSquare = Eigen::Matrix<std::complex<T>, Dynamic, Dynamic, Row
template <typename T>
void EighCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);

compute_eigen_vectors = AnfAlgo::GetNodeAttr<bool>(kernel_node, C_EIEH_VECTOR);

compute_eigen_vectors_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, C_EIEH_VECTOR);
lower_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, LOWER);
auto A_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
CHECK_KERNEL_INPUTS_NUM(A_shape.size(), kAMatrixDimNum, AnfAlgo::GetCNodeName(kernel_node));

if (A_shape.size() != kShape2dDims || A_shape[0] != A_shape[1]) {
MS_LOG(EXCEPTION) << "wrong array shape, A should be a matrix, but got [" << A_shape[0] << " X " << A_shape[1]
                  << "]";

@@ -91,10 +86,8 @@ bool EighCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs, const std::
const std::vector<AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kOutputsNum, kernel_name_);

auto A_addr = reinterpret_cast<T *>(inputs[0]->addr);
// is the Matrix a symmetric matrix(0, all, general matrix, -1 lower triangle, 1 upper triangle)
auto symmetric_type = reinterpret_cast<bool *>(inputs[1]->addr);
// is the Matrix a symmetric matrix(true lower triangle, false upper triangle)
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
auto output_v_addr = reinterpret_cast<T *>(outputs[1]->addr);
Map<MatrixSquare<T>> A(A_addr, m_, m_);

@@ -102,19 +95,19 @@ bool EighCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs, const std::
Map<MatrixSquare<T>> output(output_addr, m_, 1);
Map<MatrixSquare<T>> outputv(output_v_addr, m_, m_);
// selfadjoint matrix
if (*symmetric_type) {
if (lower_) {
A_ = A.template selfadjointView<Lower>();
} else {
A_ = A.template selfadjointView<Upper>();
}
// Real scalar eigen solver
if constexpr (std::is_same_v<T, float>) {
SolveSelfAdjointMatrix(A_, &output, &outputv, compute_eigen_vectors);
SolveSelfAdjointMatrix(A_, &output, &outputv, compute_eigen_vectors_);
} else if constexpr (std::is_same_v<T, double>) {
SolveSelfAdjointMatrix(A_, &output, &outputv, compute_eigen_vectors);
SolveSelfAdjointMatrix(A_, &output, &outputv, compute_eigen_vectors_);
} else {
// complex eigen solver
SolveComplexMatrix(A_, &output, &outputv, compute_eigen_vectors);
SolveComplexMatrix(A_, &output, &outputv, compute_eigen_vectors_);
}
return true;
}
@@ -46,36 +46,29 @@ class EighCPUKernel : public CPUKernel {

 private:
size_t m_{1};
bool compute_eigen_vectors{false};
bool compute_eigen_vectors_{false};
bool lower_{true};
TypeId dtype_{kNumberTypeFloat32};
};

MS_REG_CPU_KERNEL_T(Eigh,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeFloat32)
                      .AddInputAttr(kNumberTypeBool)
                      .AddOutputAttr(kNumberTypeFloat32)
                      .AddOutputAttr(kNumberTypeFloat32),
                    EighCPUKernel, float);
MS_REG_CPU_KERNEL_T(Eigh,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeFloat64)
                      .AddInputAttr(kNumberTypeBool)
                      .AddOutputAttr(kNumberTypeFloat64)
                      .AddOutputAttr(kNumberTypeFloat64),
                    EighCPUKernel, double);
MS_REG_CPU_KERNEL_T(
  Eigh,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  EighCPUKernel, float);
MS_REG_CPU_KERNEL_T(
  Eigh,
  KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
  EighCPUKernel, double);

MS_REG_CPU_KERNEL_T(Eigh,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeComplex64)
                      .AddInputAttr(kNumberTypeBool)
                      .AddOutputAttr(kNumberTypeComplex64)
                      .AddOutputAttr(kNumberTypeComplex64),
                    EighCPUKernel, float_complex);
MS_REG_CPU_KERNEL_T(Eigh,
                    KernelAttr()
                      .AddInputAttr(kNumberTypeComplex128)
                      .AddInputAttr(kNumberTypeBool)
                      .AddOutputAttr(kNumberTypeComplex128)
                      .AddOutputAttr(kNumberTypeComplex128),
                    EighCPUKernel, double_complex);
@@ -18,6 +18,10 @@

#include "transpose_impl.cuh"
#include "runtime/device/gpu/cuda_common.h"
#include "utils/complex.h"

template <typename T>
using Complex = mindspore::utils::Complex<T>;

template <typename T>
__global__ void Transpose(const size_t size, const T *input, const size_t *input_shape, const size_t *input_axis,

@@ -74,3 +78,9 @@ template void CalTranspose<int>(const size_t size, const int *input, const size_
template void CalTranspose<int64_t>(const size_t size, const int64_t *input, const size_t *input_shape,
                                    const size_t *input_axis, const size_t shape_size, int64_t *output,
                                    cudaStream_t cuda_stream);
template void CalTranspose<Complex<float>>(const size_t size, const Complex<float> *input, const size_t *input_shape,
                                           const size_t *input_axis, const size_t shape_size, Complex<float> *output,
                                           cudaStream_t cuda_stream);
template void CalTranspose<Complex<double>>(const size_t size, const Complex<double> *input, const size_t *input_shape,
                                            const size_t *input_axis, const size_t shape_size, Complex<double> *output,
                                            cudaStream_t cuda_stream);
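For context on the new Complex instantiations: CalTranspose permutes the data according to input_axis, and with input_shape = {m, m} and input_axis = {1, 0} (as the Eigh GPU kernels below use it) it is a plain 2-D transpose, which converts between MindSpore's row-major layout and the column-major layout cuSOLVER expects. A host-side sketch of the equivalent operation, assuming NumPy:

import numpy as np

def cal_transpose(x, axes):
    # Host equivalent of CalTranspose: reorder the axes of x.
    return np.transpose(x, axes).copy()

m = 3
a = np.arange(m * m, dtype=np.float32).reshape(m, m)
# shape {m, m}, axis {1, 0} -> ordinary matrix transpose
assert np.array_equal(cal_transpose(a, (1, 0)), a.T)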
@@ -21,14 +21,12 @@ namespace kernel {
MS_REG_GPU_KERNEL_ONE(Eigh,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeComplex64)
                        .AddInputAttr(kNumberTypeBool)
                        .AddOutputAttr(kNumberTypeComplex64)
                        .AddOutputAttr(kNumberTypeComplex64),
                      EighcGpuKernel, Complex<float>)
MS_REG_GPU_KERNEL_ONE(Eigh,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeComplex128)
                        .AddInputAttr(kNumberTypeBool)
                        .AddOutputAttr(kNumberTypeComplex128)
                        .AddOutputAttr(kNumberTypeComplex128),
                      EighcGpuKernel, Complex<double>);
@@ -32,10 +32,12 @@
#include "utils/convert_utils.h"
#include "utils/complex.h"
#include "backend/kernel_compiler/gpu/cuda_impl/real_to_complex_impl.cuh"
#include "backend/kernel_compiler/gpu/cuda_impl/transpose_impl.cuh"

namespace mindspore {
namespace kernel {
constexpr char C_EIEH_VECTOR[] = "compute_eigenvectors";
constexpr char LOWER[] = "lower";
template <typename T>
using Complex = mindspore::utils::Complex<T>;

@@ -61,6 +63,7 @@ class EighcGpuKernel : public GpuKernel {
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
auto A_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
compute_eigen_vectors_ = static_cast<bool>(GetAttr<bool>(kernel_node, C_EIEH_VECTOR));
lower_ = static_cast<bool>(GetAttr<bool>(kernel_node, LOWER));
if (compute_eigen_vectors_) {
jobz_ = CUSOLVER_EIG_MODE_VECTOR;
} else {

@@ -84,13 +87,7 @@ class EighcGpuKernel : public GpuKernel {
const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
// matrix A, input or output(eigenvector)
auto inout_A_addr = GetDeviceAddress<T>(inputs, 0);
auto lower = GetDeviceAddress<bool>(inputs, 1);
bool h_lower{true};
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
                           cudaMemcpyAsync(&h_lower, lower, sizeof(bool), cudaMemcpyDeviceToHost,
                                           reinterpret_cast<cudaStream_t>(stream_ptr)),
                           "copy lower do device failed");
if (h_lower) {
if (lower_) {
  uplo_ = CUBLAS_FILL_MODE_LOWER;
} else {
  uplo_ = CUBLAS_FILL_MODE_UPPER;

@@ -105,24 +102,39 @@ class EighcGpuKernel : public GpuKernel {
// temp output eigenvalues real scalar
auto w_w_addr = GetDeviceAddress<D>(workspace, 0);
auto w_w_c_addr = GetDeviceAddress<T>(workspace, 1);
// temp eigenvector before transpose
auto w_v_addr = GetDeviceAddress<T>(workspace, 2);
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
                           cudaMemcpyAsync(output_v_addr, inout_A_addr, m_ * m_ * sizeof(T),
                                           cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                           "copy input matrix failed");
size_t input_shape[kShape2dDims] = {m_, m_};
size_t input_axis[kShape2dDims] = {1, 0};
size_t *dev_input_shape = nullptr;
cudaMalloc(reinterpret_cast<void **>(&dev_input_shape), kShape2dDims * sizeof(size_t));
size_t *dev_input_axis = nullptr;
cudaMalloc(reinterpret_cast<void **>(&dev_input_axis), kShape2dDims * sizeof(size_t));
cudaMemcpyAsync(dev_input_shape, input_shape, kShape2dDims * sizeof(size_t), cudaMemcpyHostToDevice,
                reinterpret_cast<cudaStream_t>(stream_ptr));
cudaMemcpyAsync(dev_input_axis, input_axis, kShape2dDims * sizeof(size_t), cudaMemcpyHostToDevice,
                reinterpret_cast<cudaStream_t>(stream_ptr));
CalTranspose(m_ * m_, output_v_addr, dev_input_shape, dev_input_axis, kShape2dDims, w_v_addr,
             reinterpret_cast<cudaStream_t>(stream_ptr));

int lwork = 0;
if constexpr (std::is_same_v<T, Complex<float>>) {
  cusolverDnCheevd_bufferSize(cusolver_handle_, jobz_, uplo_, m_, reinterpret_cast<cuComplex *>(output_v_addr),
                              lda_, w_w_addr, &lwork);
  CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaMalloc(reinterpret_cast<void **>(&d_work), sizeof(T) * lwork),
                             "cal eigenvalues workspace failed");
  cusolverDnCheevd(cusolver_handle_, jobz_, uplo_, m_, reinterpret_cast<cuComplex *>(output_v_addr), lda_, w_w_addr,
  cusolverDnCheevd(cusolver_handle_, jobz_, uplo_, m_, reinterpret_cast<cuComplex *>(w_v_addr), lda_, w_w_addr,
                   reinterpret_cast<cuComplex *>(d_work), lwork, devInfo);
} else {
  cusolverDnZheevd_bufferSize(cusolver_handle_, jobz_, uplo_, m_,
                              reinterpret_cast<cuDoubleComplex *>(output_v_addr), lda_, w_w_addr, &lwork);
  CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_, cudaMalloc(reinterpret_cast<void **>(&d_work), sizeof(T) * lwork),
                             "cal eigenvalues workspace failed");
  cusolverDnZheevd(cusolver_handle_, jobz_, uplo_, m_, reinterpret_cast<cuDoubleComplex *>(output_v_addr), lda_,
  cusolverDnZheevd(cusolver_handle_, jobz_, uplo_, m_, reinterpret_cast<cuDoubleComplex *>(w_v_addr), lda_,
                   w_w_addr, reinterpret_cast<cuDoubleComplex *>(d_work), lwork, devInfo);
}
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
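A note on the eigenvalue path here: cusolverDn{C,Z}heevd writes the (always real) eigenvalues of a Hermitian matrix into the real-typed scratch buffer w_w_addr, and the next hunk widens them to the op's complex output dtype with RealToComplex. A host-side sketch of that widening step, assuming NumPy and hypothetical eigenvalues:

import numpy as np

# Hypothetical eigenvalues as heevd would produce them (real float32).
w_real = np.array([0.5, 1.25, 3.0], dtype=np.float32)

# What the Eigh op returns for a complex64 input: same values, complex dtype,
# zero imaginary part (this is what RealToComplex does on the device).
w_out = w_real.astype(np.complex64)
assert np.array_equal(w_out.imag, np.zeros_like(w_real))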
@@ -131,6 +143,8 @@ class EighcGpuKernel : public GpuKernel {
                           "copy eigenvalue from workspace to host failed");
RealToComplex(m_, reinterpret_cast<D *>(w_w_c_addr), reinterpret_cast<D *>(output_w_addr),
              reinterpret_cast<cudaStream_t>(stream_ptr));
CalTranspose(m_ * m_, w_v_addr, dev_input_shape, dev_input_axis, kShape2dDims, output_v_addr,
             reinterpret_cast<cudaStream_t>(stream_ptr));
// convert real scalar to complex
if (d_work) {
  cudaFree(d_work);

@@ -153,8 +167,6 @@ class EighcGpuKernel : public GpuKernel {
void InitSizeLists() override {
  // in/out matrix, eigenvector
  input_size_list_.push_back(m_ * m_ * sizeof(T));
  // uplo
  input_size_list_.push_back(sizeof(bool));
  // eigenvalues: cuda outputs a real scalar, which is converted to complex<float32/float64>
  output_size_list_.push_back(m_ * sizeof(T));
  output_size_list_.push_back(m_ * m_ * sizeof(T));

@@ -162,6 +174,7 @@ class EighcGpuKernel : public GpuKernel {
  workspace_size_list_.push_back(m_ * sizeof(D));
  // temp pre-transpose complex matrix
  workspace_size_list_.push_back(m_ * sizeof(T));
  workspace_size_list_.push_back(m_ * m_ * sizeof(T));
}

size_t m_{1};

@@ -171,6 +184,7 @@ class EighcGpuKernel : public GpuKernel {
cublasFillMode_t uplo_ = CUBLAS_FILL_MODE_UPPER;
cusolverEigMode_t jobz_ = CUSOLVER_EIG_MODE_NOVECTOR;
bool compute_eigen_vectors_{false};
bool lower_{true};
std::vector<T *> h_array_{};
std::vector<size_t> input_size_list_{};
std::vector<size_t> output_size_list_{};
@@ -18,19 +18,13 @@

namespace mindspore {
namespace kernel {
MS_REG_GPU_KERNEL_ONE(Eigh,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat32)
                        .AddInputAttr(kNumberTypeBool)
                        .AddOutputAttr(kNumberTypeFloat32)
                        .AddOutputAttr(kNumberTypeFloat32),
                      EighGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(Eigh,
                      KernelAttr()
                        .AddInputAttr(kNumberTypeFloat64)
                        .AddInputAttr(kNumberTypeBool)
                        .AddOutputAttr(kNumberTypeFloat64)
                        .AddOutputAttr(kNumberTypeFloat64),
                      EighGpuKernel, double);
MS_REG_GPU_KERNEL_ONE(
  Eigh,
  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
  EighGpuKernel, float)
MS_REG_GPU_KERNEL_ONE(
  Eigh,
  KernelAttr().AddInputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64).AddOutputAttr(kNumberTypeFloat64),
  EighGpuKernel, double);
} // namespace kernel
} // namespace mindspore
@@ -30,10 +30,12 @@
#include "backend/kernel_compiler/gpu/gpu_kernel_factory.h"
#include "backend/kernel_compiler/gpu/kernel_constants.h"
#include "utils/convert_utils.h"
#include "backend/kernel_compiler/gpu/cuda_impl/transpose_impl.cuh"

namespace mindspore {
namespace kernel {
constexpr char C_EIEH_VECTOR[] = "compute_eigenvectors";
constexpr char LOWER[] = "lower";
template <typename T>
class EighGpuKernel : public GpuKernel {
 public:

@@ -47,6 +49,7 @@ class EighGpuKernel : public GpuKernel {
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
auto A_shape = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
compute_eigen_vectors_ = static_cast<bool>(GetAttr<bool>(kernel_node, C_EIEH_VECTOR));
lower_ = static_cast<bool>(GetAttr<bool>(kernel_node, LOWER));
if (compute_eigen_vectors_) {
  jobz_ = CUSOLVER_EIG_MODE_VECTOR;
} else {

@@ -69,26 +72,23 @@ class EighGpuKernel : public GpuKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
            const std::vector<AddressPtr> &outputs, void *stream_ptr) override {
// matrix A, input or output(eigenvector)
auto inout_A_addr = GetDeviceAddress<T>(inputs, 0);
auto lower = GetDeviceAddress<bool>(inputs, 1);
bool h_lower{true};
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
                           cudaMemcpyAsync(&h_lower, lower, sizeof(bool), cudaMemcpyDeviceToHost,
                                           reinterpret_cast<cudaStream_t>(stream_ptr)),
                           "copy to lower to device failed");
if (h_lower) {
  uplo_ = CUBLAS_FILL_MODE_LOWER;
} else {
auto inout_A_addr = GetDeviceAddress<T>(inputs, kDim0);
// Notice: this is important. The matrix is stored row major on the host, but cuSOLVER reads it
// column major, so a host-side lower triangle is an upper triangle in GPU memory (and vice versa);
// the fill mode is therefore inverted here. For a real symmetric matrix upper equals lower,
// which differs from the complex (Hermitian) case.
if (lower_) {
  uplo_ = CUBLAS_FILL_MODE_UPPER;
} else {
  uplo_ = CUBLAS_FILL_MODE_LOWER;
}
auto output_addr = GetDeviceAddress<T>(outputs, 0);    // output eigenvalues
auto output_v_addr = GetDeviceAddress<T>(outputs, 1);  // output eigenvectors
auto output_addr = GetDeviceAddress<T>(outputs, kDim0);    // output eigenvalues
auto output_v_addr = GetDeviceAddress<T>(outputs, kDim1);  // output eigenvectors
auto w_v_addr = GetDeviceAddress<T>(workspace, kDim0);  // temp eigenvector before transpose
CHECK_CUDA_RET_WITH_EXCEPT(kernel_node_,
                           cudaMemcpyAsync(output_v_addr, inout_A_addr, m_ * m_ * sizeof(T),
                                           cudaMemcpyDeviceToDevice, reinterpret_cast<cudaStream_t>(stream_ptr)),
                           cudaMemcpyAsync(w_v_addr, inout_A_addr, m_ * m_ * sizeof(T), cudaMemcpyDeviceToDevice,
                                           reinterpret_cast<cudaStream_t>(stream_ptr)),
                           "copy input matrix failed");
size_t lda_ = m_;

int lwork = 0;
if constexpr (std::is_same_v<T, float>) {
  cusolverDnSsyevd_bufferSize(cusolver_handle_, jobz_, uplo_, m_, inout_A_addr, lda_, output_addr, &lwork);
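The inverted fill mode above can be checked with a small layout experiment: reading a row-major buffer as column-major yields the transpose, so cuSOLVER touching its "upper" triangle reads exactly the host-side lower triangle. A NumPy sketch of that argument (illustrative values only):

import numpy as np

m = 3
a = np.arange(m * m, dtype=np.float64).reshape(m, m)   # row-major, as MindSpore stores it
flat = a.ravel(order="C")                               # the raw device buffer

# cuSOLVER interprets the same buffer column-major, i.e. it sees A.T:
seen_by_cusolver = flat.reshape((m, m), order="F")
assert np.array_equal(seen_by_cusolver, a.T)

# Its "upper" triangle holds exactly the host-side lower-triangle values:
assert np.array_equal(np.triu(seen_by_cusolver), np.tril(a).T)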
@@ -100,10 +100,22 @@ class EighGpuKernel : public GpuKernel {
T *d_work = nullptr;
cudaMalloc(reinterpret_cast<void **>(&d_work), sizeof(T) * lwork);
if constexpr (std::is_same_v<T, float>) {
  cusolverDnSsyevd(cusolver_handle_, jobz_, uplo_, m_, output_v_addr, lda_, output_addr, d_work, lwork, devInfo);
  cusolverDnSsyevd(cusolver_handle_, jobz_, uplo_, m_, w_v_addr, lda_, output_addr, d_work, lwork, devInfo);
} else if constexpr (std::is_same_v<T, double>) {
  cusolverDnDsyevd(cusolver_handle_, jobz_, uplo_, m_, output_v_addr, lda_, output_addr, d_work, lwork, devInfo);
  cusolverDnDsyevd(cusolver_handle_, jobz_, uplo_, m_, w_v_addr, lda_, output_addr, d_work, lwork, devInfo);
}
size_t input_shape[kShape2dDims] = {m_, m_};
size_t input_axis[kShape2dDims] = {1, 0};
size_t *dev_input_shape = nullptr;
cudaMalloc(reinterpret_cast<void **>(&dev_input_shape), kShape2dDims * sizeof(size_t));
size_t *dev_input_axis = nullptr;
cudaMalloc(reinterpret_cast<void **>(&dev_input_axis), kShape2dDims * sizeof(size_t));
cudaMemcpyAsync(dev_input_shape, input_shape, kShape2dDims * sizeof(size_t), cudaMemcpyHostToDevice,
                reinterpret_cast<cudaStream_t>(stream_ptr));
cudaMemcpyAsync(dev_input_axis, input_axis, kShape2dDims * sizeof(size_t), cudaMemcpyHostToDevice,
                reinterpret_cast<cudaStream_t>(stream_ptr));
CalTranspose(m_ * m_, w_v_addr, dev_input_shape, dev_input_axis, kShape2dDims, output_v_addr,
             reinterpret_cast<cudaStream_t>(stream_ptr));
if (d_work) {
  cudaFree(d_work);
}

@@ -125,12 +137,11 @@ class EighGpuKernel : public GpuKernel {
void InitSizeLists() override {
  // in/out matrix, eigenvector
  input_size_list_.push_back(m_ * m_ * sizeof(T));
  // uplo
  input_size_list_.push_back(sizeof(bool));
  // eigenvalues
  output_size_list_.push_back(m_ * sizeof(T));
  // eigenvector
  output_size_list_.push_back(m_ * m_ * sizeof(T));
  workspace_size_list_.push_back(m_ * m_ * sizeof(T));
}

size_t m_{1};

@@ -139,6 +150,7 @@ class EighGpuKernel : public GpuKernel {
cublasFillMode_t uplo_ = CUBLAS_FILL_MODE_UPPER;
cusolverEigMode_t jobz_ = CUSOLVER_EIG_MODE_NOVECTOR;
bool compute_eigen_vectors_{false};
bool lower_{true};
std::vector<T *> h_array_{};
std::vector<size_t> input_size_list_{};
std::vector<size_t> output_size_list_{};
@@ -18,9 +18,10 @@ from .. import ops
from .ops import SolveTriangular
from .ops import CholeskySolver
from .ops import Cholesky
from .ops import EighNet
from ..ops import operations as P

__all__ = ['block_diag', 'solve_triangular', 'inv', 'cho_factor', 'cholesky', 'cho_solve']
__all__ = ['block_diag', 'solve_triangular', 'inv', 'cho_factor', 'cholesky', 'cho_solve', 'eigh']


def block_diag(*arrs):

@@ -318,3 +319,84 @@ def cho_solve(c_and_lower, b, overwrite_b=False, check_finite=True):
    cholesky_solver_net = CholeskySolver(lower=lower)
    x = cholesky_solver_net(c, b)
    return x


def eigh(a, b=None, lower=True, eigvals_only=False, overwrite_a=False,
         overwrite_b=False, turbo=True, eigvals=None, _type=1,
         check_finite=True):
    """
    Solve a standard or generalized eigenvalue problem for a complex
    Hermitian or real symmetric matrix.

    Find eigenvalues Tensor ``w`` and optionally eigenvectors Tensor ``v`` of
    Tensor ``a``, where ``b`` is positive definite, such that every
    eigenvalue λ (i-th entry of ``w``) and its eigenvector ``vi`` (i-th column of
    ``v``) satisfy::

        a @ vi = λ * b @ vi
        vi.conj().T @ a @ vi = λ
        vi.conj().T @ b @ vi = 1

    In the standard problem, ``b`` is assumed to be the identity matrix.

    Args:
        a (Tensor): (M, M) Tensor
            A complex Hermitian or real symmetric matrix whose eigenvalues and
            eigenvectors will be computed.
        b (Tensor, optional): (M, M) Tensor
            A complex Hermitian or real symmetric positive definite matrix.
            If omitted, the identity matrix is assumed.
        lower (bool, optional): Whether the pertinent Tensor data is taken from
            the lower or upper triangle of ``a`` and, if applicable, ``b``. (Default: lower)
        eigvals_only (bool, optional): Whether to calculate only eigenvalues
            and no eigenvectors. (Default: both are calculated)
        _type (int, optional): For the generalized problem, this keyword specifies
            the problem type to be solved for ``w`` and ``v`` (only takes 1, 2, 3 as possible
            inputs)::

                1 => a @ v = w @ b @ v
                2 => a @ b @ v = w @ v
                3 => b @ a @ v = w @ v

            This keyword is ignored for standard problems.
        overwrite_a (bool, optional): Whether to overwrite data in ``a``
            (may improve performance). Default is False.
        overwrite_b (bool, optional): Whether to overwrite data in ``b``
            (may improve performance). Default is False.
        check_finite (bool, optional): Whether to check that the input matrices
            contain only finite numbers.
            Disabling may give a performance gain, but may result in problems
            (crashes, non-termination) if the inputs do contain infinities or NaNs.
        turbo (bool, optional): Use the divide-and-conquer algorithm (faster but
            more memory intensive; only for the generalized eigenvalue problem and
            when the full set of eigenvalues is requested). Has no significant
            effect if eigenvectors are not requested.
        eigvals (tuple, optional): Indices of the smallest and largest (in ascending order)
            eigenvalues and corresponding eigenvectors to be returned: 0 <= lo <= hi <= M-1.
            If omitted, all eigenvalues and eigenvectors are returned.

    Returns:
        w (Tensor): (N,) Tensor, the N (1<=N<=M) selected eigenvalues, in ascending order,
            each repeated according to its multiplicity.
        v (Tensor): (M, N) Tensor, (if ``eigvals_only == False``)

    Raises:
        LinAlgError: If the eigenvalue computation does not converge, an error occurred, or
            the ``b`` matrix is not positive definite. Note that if the input matrices are
            not symmetric or Hermitian, no error will be reported but the results will
            be wrong.

    Supported Platforms:
        ``CPU`` ``GPU``

    Examples:
        >>> import numpy as onp
        >>> from mindspore.common import Tensor
        >>> from mindspore.scipy.linalg import eigh
        >>> A = Tensor(onp.array([[6, 3, 1, 5], [3, 0, 5, 1], [1, 5, 6, 2], [5, 1, 2, 2]]))
        >>> w, v = eigh(A)
        >>> onp.allclose(A @ v - v @ onp.diag(w), onp.zeros((4, 4)))
        True
    """
    eigh_net = EighNet(not eigvals_only, lower=lower)
    return eigh_net(a)
@@ -194,42 +194,21 @@ class Eigh(PrimitiveWithInfer):
    """

    @prim_attr_register
    def __init__(self, compute_eigenvectors):
    def __init__(self, compute_eigenvectors=True, lower=True):
        super().__init__(name="Eigh")
        self.init_prim_io_names(inputs=['A', 's'], outputs=['output', 'output_v'])
        self.init_prim_io_names(inputs=['A'], outputs=['output_w', 'output_v'])
        self.compute_eigenvectors = validator.check_value_type(
            "compute_eigenvectors", compute_eigenvectors, [bool], self.name)
        self.lower = validator.check_value_type("lower", lower, [bool], self.name)
        self.add_prim_attr('lower', self.lower)
        self.add_prim_attr('compute_eigenvectors', self.compute_eigenvectors)

    def __infer__(self, A, s):
    def __infer__(self, A):
        shape = {
            'shape': ((A['shape'][0],), (A['shape'][0], A['shape'][0])),
            'dtype': (A['dtype'], A['dtype']),
            'value': None
        }
        if A['dtype'] == mstype.tensor_type(mstype.float32):
            shape = {
                'shape': ((A['shape'][0],), (A['shape'][0], A['shape'][0])),
                'dtype': (mstype.float32, mstype.float32),
                'value': None
            }
        elif A['dtype'] == mstype.tensor_type(mstype.float64):
            shape = {
                'shape': ((A['shape'][0],), (A['shape'][0], A['shape'][0])),
                'dtype': (mstype.float64, mstype.float64),
                'value': None
            }
        elif A['dtype'] == mstype.tensor_type(mstype.complex64):
            shape = {
                'shape': ((A['shape'][0],), (A['shape'][0], A['shape'][0])),
                'dtype': (A['dtype'], A['dtype']),
                'value': None
            }
        elif A['dtype'] == mstype.tensor_type(mstype.complex128):
            shape = {
                'shape': ((A['shape'][0],), (A['shape'][0], A['shape'][0])),
                'dtype': (mstype.complex128, mstype.complex128),
                'value': None
            }
        return shape


@@ -238,16 +217,17 @@ class EighNet(nn.Cell):
    Eigenvalue/eigenvector solver for a symmetric/Hermitian matrix,
    Ax = lambda * x
    """
    def __init__(self, b):
        super(EighNet, self).__init__()
        self.b = b
        self.eigh = Eigh(b)

    def construct(self, A, s=True):
        r = self.eigh(A, s)
        if self.b:
    def __init__(self, bv=True, lower=True):
        super(EighNet, self).__init__()
        self.bv = bv
        self.eigh = Eigh(bv, lower)

    def construct(self, A):
        r = self.eigh(A)
        if self.bv:
            return (r[0], r[1])
        return (r[0],)
        return r[0]


class Eig(PrimitiveWithInfer):
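With this change, compute_eigenvectors and lower are primitive attributes instead of an extra runtime input, so the behaviour is fixed when the cell is built. A small sketch of the new calling convention (hypothetical random data, shown only to illustrate the constructor arguments):

import numpy as np
from mindspore.common import Tensor
from mindspore.scipy.ops import EighNet

a = Tensor(np.random.rand(4, 4).astype(np.float32))

# eigenvalues and eigenvectors, reading the lower triangle of `a`
w, v = EighNet(bv=True, lower=True)(a)

# eigenvalues only, reading the upper triangle
w_only = EighNet(bv=False, lower=False)(a)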
@@ -257,7 +237,7 @@ class Eig(PrimitiveWithInfer):
    """

    @prim_attr_register
    def __init__(self, compute_eigenvectors):
    def __init__(self, compute_eigenvectors=True):
        super().__init__(name="Eig")
        self.init_prim_io_names(inputs=['A'], outputs=['output', 'output_v'])
        self.compute_eigenvectors = validator.check_value_type(

@@ -285,13 +265,14 @@ class EigNet(nn.Cell):
    Eigenvalue/eigenvector solver for a generic matrix,
    Ax = lambda * x
    """
    def __init__(self, b):

    def __init__(self, bv=True):
        super(EigNet, self).__init__()
        self.b = b
        self.eig = Eig(b)
        self.bv = bv
        self.eig = Eig(bv)

    def construct(self, A):
        r = self.eig(A)
        if self.b:
        if self.bv:
            return (r[0], r[1])
        return (r[0],)
        return r[0]
@@ -31,11 +31,6 @@ def match(v, v_, error=0):
    np.testing.assert_equal(v, v_)


def create_sym_pos_matrix(m, n, dtype):
    a = (np.random.random((m, n)) + np.eye(m, n)).astype(dtype)
    return np.dot(a, a.T)


@pytest.mark.parametrize('n', [4, 6, 9, 10])
@pytest.mark.platform_x86_cpu
def test_eig_net(n: int):
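The local create_sym_pos_matrix helper above is removed; the reworked tests either generate a random square matrix directly or rely on a shared helper (the GPU/ST tests below call create_sym_pos_matrix([n, n], dtype)). A sketch of what such a helper computes, based on the removed definition (the shared version's exact signature is an assumption):

import numpy as np

def create_sym_pos_matrix(shape, dtype):
    # Random symmetric positive-definite matrix: B @ B.T with a boosted diagonal,
    # mirroring the removed local helper above.
    m, n = shape
    b = (np.random.random((m, n)) + np.eye(m, n)).astype(dtype)
    return np.dot(b, b.T)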
@@ -48,13 +43,13 @@ def test_eig_net(n: int):
    rtol = 1e-3
    atol = 1e-4
    msp_eig = EigNet(True)
    A = create_sym_pos_matrix(n, n, np.float32)
    A = np.array(np.random.rand(n, n), dtype=np.float32)
    tensor_a = Tensor(np.array(A).astype(np.float32))
    msp_w, msp_v = msp_eig(tensor_a)
    assert np.allclose(A @ msp_v.asnumpy() - msp_v.asnumpy() @ np.diag(msp_w.asnumpy()), np.zeros((n, n)), rtol, atol)

    # test case for real scalar double 64
    A = np.random.rand(n, n)
    A = np.array(np.random.rand(n, n), dtype=np.float64)
    rtol = 1e-5
    atol = 1e-8
    msp_eig = EigNet(True)

@@ -98,6 +93,7 @@ def test_eig_net(n: int):
    # Compare with scipy, scipy passed
    # sp_w, sp_v = sp.linalg.eig(A.astype(np.complex128))
    # assert np.allclose(A @ sp_v - sp_v @ np.diag(sp_w), np.zeros((n, n)), rtol, atol)

    # print(A @ msp_v.asnumpy() - msp_v.asnumpy() @ np.diag(msp_w.asnumpy()))
    assert np.allclose(A @ msp_v.asnumpy() - msp_v.asnumpy() @ np.diag(msp_w.asnumpy()), np.zeros((n, n)), rtol, atol)
    msp_eig = EigNet(False)
    msp_w0 = msp_eig(Tensor(np.array(A).astype(np.complex128)))
    assert np.allclose(msp_w0.asnumpy() - msp_w.asnumpy(), np.zeros((n, n)), rtol, atol)
@@ -47,10 +47,12 @@ def test_eigh_net(n: int):
    # test for real scalar float 32
    rtol = 1e-3
    atol = 1e-4
    msp_eigh = EighNet(True)

    A = create_sym_pos_matrix(n, n, np.float32)
    msp_wl, msp_vl = msp_eigh(Tensor(np.array(A).astype(np.float32)), True)
    msp_wu, msp_vu = msp_eigh(Tensor(np.array(A).astype(np.float32)), False)
    msp_eigh = EighNet(True, True)
    msp_wl, msp_vl = msp_eigh(Tensor(np.array(A).astype(np.float32)))
    msp_eigh = EighNet(True, False)
    msp_wu, msp_vu = msp_eigh(Tensor(np.array(A).astype(np.float32)))
    sym_Al = (np.tril((np.tril(A) - np.tril(A).T)) + np.tril(A).T)
    sym_Au = (np.triu((np.triu(A) - np.triu(A).T)) + np.triu(A).T)
    assert np.allclose(sym_Al @ msp_vl.asnumpy() - msp_vl.asnumpy() @ np.diag(msp_wl.asnumpy()), np.zeros((n, n)), rtol,
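The sym_Al / sym_Au expressions introduced here (and reused in the hunks below) rebuild the full symmetric matrix that the op effectively diagonalizes when it reads only one triangle of A; the complex cases use .conj().T instead of .T to obtain a Hermitian matrix. A short NumPy sketch of what the lower-triangle variant produces:

import numpy as np

a = np.random.rand(5, 5)

# Keep the lower triangle of `a` and mirror it above the diagonal.
sym_al = np.tril(np.tril(a) - np.tril(a).T) + np.tril(a).T

assert np.allclose(sym_al, sym_al.T)             # symmetric
assert np.allclose(np.tril(sym_al), np.tril(a))  # lower triangle taken from `a`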
@@ -62,19 +64,23 @@ def test_eigh_net(n: int):
    A = np.random.rand(n, n)
    rtol = 1e-5
    atol = 1e-8
    msp_eigh = EighNet(True)
    msp_wl, msp_vl = msp_eigh(Tensor(np.array(A).astype(np.float64)), True)
    msp_wu, msp_vu = msp_eigh(Tensor(np.array(A).astype(np.float64)), False)

    # Compare with scipy
    # sp_wl, sp_vl = sp.linalg.eigh(np.tril(A).astype(np.float64), lower=True, eigvals_only=False)
    # sp_wu, sp_vu = sp.linalg.eigh(A.astype(np.float64), lower=False, eigvals_only=False)
    msp_eigh = EighNet(True, True)
    msp_wl, msp_vl = msp_eigh(Tensor(np.array(A).astype(np.float64)))
    msp_eigh = EighNet(True, False)
    msp_wu, msp_vu = msp_eigh(Tensor(np.array(A).astype(np.float64)))
    sym_Al = (np.tril((np.tril(A) - np.tril(A).T)) + np.tril(A).T)
    sym_Au = (np.triu((np.triu(A) - np.triu(A).T)) + np.triu(A).T)
    assert np.allclose(sym_Al @ msp_vl.asnumpy() - msp_vl.asnumpy() @ np.diag(msp_wl.asnumpy()), np.zeros((n, n)), rtol,
                       atol)
    assert np.allclose(sym_Au @ msp_vu.asnumpy() - msp_vu.asnumpy() @ np.diag(msp_wu.asnumpy()), np.zeros((n, n)), rtol,
                       atol)
    # test for real scalar float64 no vector
    msp_eigh = EighNet(False, True)
    msp_wl0 = msp_eigh(Tensor(np.array(A).astype(np.float64)))
    msp_eigh = EighNet(False, False)
    msp_wu0 = msp_eigh(Tensor(np.array(A).astype(np.float64)))
    assert np.allclose(msp_wl.asnumpy() - msp_wl0.asnumpy(), np.zeros((n, n)), rtol, atol)
    assert np.allclose(msp_wu.asnumpy() - msp_wu0.asnumpy(), np.zeros((n, n)), rtol, atol)

    # test case for complex64
    rtol = 1e-3

@@ -86,18 +92,12 @@ def test_eigh_net(n: int):
                A[i][j] = complex(np.random.rand(1, 1), 0)
            else:
                A[i][j] = complex(np.random.rand(1, 1), np.random.rand(1, 1))
    msp_eigh = EighNet(True)
    sym_Al = (np.tril((np.tril(A) - np.tril(A).T)) + np.tril(A).conj().T)
    sym_Au = (np.triu((np.triu(A) - np.triu(A).T)) + np.triu(A).conj().T)
    msp_wl, msp_vl = msp_eigh(Tensor(np.array(A).astype(np.complex64)), True)
    msp_wu, msp_vu = msp_eigh(Tensor(np.array(A).astype(np.complex64)), False)
    # Compare with scipy, scipy passed
    # sp_wl, sp_vl = sp.linalg.eigh(np.tril(A).astype(np.complex128), lower=True, eigvals_only=False)
    # sp_wu, sp_vu = sp.linalg.eigh(A.astype(np.complex128), lower=False, eigvals_only=False)
    # assert np.allclose(sym_Al @ sp_vl - sp_vl @ np.diag(sp_wl), np.zeros((n, n)), rtol, atol)
    # assert np.allclose(sym_Au @ sp_vu - sp_vu @ np.diag(sp_wu), np.zeros((n, n)), rtol, atol)

    # print(A @ msp_v.asnumpy() - msp_v.asnumpy() @ np.diag(msp_w.asnumpy()))
    msp_eigh = EighNet(True, True)
    msp_wl, msp_vl = msp_eigh(Tensor(np.array(A).astype(np.complex64)))
    msp_eigh = EighNet(True, False)
    msp_wu, msp_vu = msp_eigh(Tensor(np.array(A).astype(np.complex64)))
    assert np.allclose(sym_Al @ msp_vl.asnumpy() - msp_vl.asnumpy() @ np.diag(msp_wl.asnumpy()), np.zeros((n, n)), rtol,
                       atol)
    assert np.allclose(sym_Au @ msp_vu.asnumpy() - msp_vu.asnumpy() @ np.diag(msp_wu.asnumpy()), np.zeros((n, n)), rtol,

@@ -113,19 +113,21 @@ def test_eigh_net(n: int):
                A[i][j] = complex(np.random.rand(1, 1), 0)
            else:
                A[i][j] = complex(np.random.rand(1, 1), np.random.rand(1, 1))
    msp_eigh = EighNet(True)
    sym_Al = (np.tril((np.tril(A) - np.tril(A).T)) + np.tril(A).conj().T)
    sym_Au = (np.triu((np.triu(A) - np.triu(A).T)) + np.triu(A).conj().T)
    msp_wl, msp_vl = msp_eigh(Tensor(np.array(A).astype(np.complex128)), True)
    msp_wu, msp_vu = msp_eigh(Tensor(np.array(A).astype(np.complex128)), False)
    # Compare with scipy, scipy passed
    # sp_wl, sp_vl = sp.linalg.eigh(np.tril(A).astype(np.complex128), lower=True, eigvals_only=False)
    # sp_wu, sp_vu = sp.linalg.eigh(A.astype(np.complex128), lower=False, eigvals_only=False)
    # assert np.allclose(sym_Al @ sp_vl - sp_vl @ np.diag(sp_wl), np.zeros((n, n)), rtol, atol)
    # assert np.allclose(sym_Au @ sp_vu - sp_vu @ np.diag(sp_wu), np.zeros((n, n)), rtol, atol)

    # print(A @ msp_v.asnumpy() - msp_v.asnumpy() @ np.diag(msp_w.asnumpy()))
    msp_eigh = EighNet(True, True)
    msp_wl, msp_vl = msp_eigh(Tensor(np.array(A).astype(np.complex128)))
    msp_eigh = EighNet(True, False)
    msp_wu, msp_vu = msp_eigh(Tensor(np.array(A).astype(np.complex128)))
    assert np.allclose(sym_Al @ msp_vl.asnumpy() - msp_vl.asnumpy() @ np.diag(msp_wl.asnumpy()), np.zeros((n, n)), rtol,
                       atol)
    assert np.allclose(sym_Au @ msp_vu.asnumpy() - msp_vu.asnumpy() @ np.diag(msp_wu.asnumpy()), np.zeros((n, n)), rtol,
                       atol)

    # test for complex128 no vector
    msp_eigh = EighNet(False, True)
    msp_wl0 = msp_eigh(Tensor(np.array(A).astype(np.complex128)))
    msp_eigh = EighNet(False, False)
    msp_wu0 = msp_eigh(Tensor(np.array(A).astype(np.complex128)))
    assert np.allclose(msp_wl.asnumpy() - msp_wl0.asnumpy(), np.zeros((n, n)), rtol, atol)
    assert np.allclose(msp_wu.asnumpy() - msp_wu0.asnumpy(), np.zeros((n, n)), rtol, atol)
@@ -47,26 +47,35 @@ def test_eigh_net(n: int):
    # test for real scalar float 32
    rtol = 1e-3
    atol = 1e-4
    msp_eigh = EighNet(True)
    A = create_sym_pos_matrix(n, n, np.float32)
    msp_wl, msp_vl = msp_eigh(Tensor(np.array(A).astype(np.float32)), True)
    msp_wu, msp_vu = msp_eigh(Tensor(np.array(A).astype(np.float32)), False)
    assert np.allclose(A @ msp_vl.T.asnumpy() - msp_vl.T.asnumpy() @ np.diag(msp_wl.asnumpy()), np.zeros((n, n)), rtol,
    msp_eigh = EighNet(True, True)
    msp_wl, msp_vl = msp_eigh(Tensor(np.array(A).astype(np.float32)))
    msp_eigh = EighNet(True, False)
    msp_wu, msp_vu = msp_eigh(Tensor(np.array(A).astype(np.float32)))
    assert np.allclose(A @ msp_vl.asnumpy() - msp_vl.asnumpy() @ np.diag(msp_wl.asnumpy()), np.zeros((n, n)), rtol,
                       atol)
    assert np.allclose(A @ msp_vu.T.asnumpy() - msp_vu.T.asnumpy() @ np.diag(msp_wu.asnumpy()), np.zeros((n, n)), rtol,
    assert np.allclose(A @ msp_vu.asnumpy() - msp_vu.asnumpy() @ np.diag(msp_wu.asnumpy()), np.zeros((n, n)), rtol,
                       atol)

    # test case for real scalar double 64
    A = create_sym_pos_matrix(n, n, np.float64)
    rtol = 1e-5
    atol = 1e-8
    msp_eigh = EighNet(True)
    msp_wl, msp_vl = msp_eigh(Tensor(np.array(A).astype(np.float64)), True)
    msp_wu, msp_vu = msp_eigh(Tensor(np.array(A).astype(np.float64)), False)
    assert np.allclose(A @ msp_vl.T.asnumpy() - msp_vl.T.asnumpy() @ np.diag(msp_wl.asnumpy()), np.zeros((n, n)), rtol,
    msp_eigh = EighNet(True, True)
    msp_wl, msp_vl = msp_eigh(Tensor(np.array(A).astype(np.float64)))
    msp_eigh = EighNet(True, False)
    msp_wu, msp_vu = msp_eigh(Tensor(np.array(A).astype(np.float64)))
    assert np.allclose(A @ msp_vl.asnumpy() - msp_vl.asnumpy() @ np.diag(msp_wl.asnumpy()), np.zeros((n, n)), rtol,
                       atol)
    assert np.allclose(A @ msp_vu.T.asnumpy() - msp_vu.T.asnumpy() @ np.diag(msp_wu.asnumpy()), np.zeros((n, n)), rtol,
    assert np.allclose(A @ msp_vu.asnumpy() - msp_vu.asnumpy() @ np.diag(msp_wu.asnumpy()), np.zeros((n, n)), rtol,
                       atol)
    # test for real scalar float64 no vector
    msp_eigh = EighNet(False, True)
    msp_wl0 = msp_eigh(Tensor(np.array(A).astype(np.float64)))
    msp_eigh = EighNet(False, False)
    msp_wu0 = msp_eigh(Tensor(np.array(A).astype(np.float64)))
    assert np.allclose(msp_wl.asnumpy() - msp_wl0.asnumpy(), np.zeros((n, n)), rtol, atol)
    assert np.allclose(msp_wu.asnumpy() - msp_wu0.asnumpy(), np.zeros((n, n)), rtol, atol)

    # test case for complex64
    rtol = 1e-3

@@ -78,14 +87,15 @@ def test_eigh_net(n: int):
                A[i][j] = complex(np.random.rand(1, 1), 0)
            else:
                A[i][j] = complex(np.random.rand(1, 1), np.random.rand(1, 1))
    msp_eigh = EighNet(True)
    sym_Al = (np.tril((np.tril(A) - np.tril(A).T)) + np.tril(A).conj().T)
    sym_Au = (np.triu((np.triu(A) - np.triu(A).T)) + np.triu(A).conj().T)
    msp_wl, msp_vl = msp_eigh(Tensor(np.array(sym_Al).astype(np.complex64)), True)
    msp_wu, msp_vu = msp_eigh(Tensor(np.array(sym_Au).astype(np.complex64)), False)
    assert np.allclose(sym_Al @ msp_vl.asnumpy().conj().T - msp_vl.asnumpy().conj().T @ np.diag(msp_wl.asnumpy()),
    msp_eigh = EighNet(True, True)
    msp_wl, msp_vl = msp_eigh(Tensor(np.array(A).astype(np.complex64)))
    msp_eigh = EighNet(True, False)
    msp_wu, msp_vu = msp_eigh(Tensor(np.array(A).astype(np.complex64)))
    assert np.allclose(sym_Al @ msp_vl.asnumpy() - msp_vl.asnumpy() @ np.diag(msp_wl.asnumpy()),
                       np.zeros((n, n)), rtol, atol)
    assert np.allclose(sym_Au @ msp_vu.asnumpy().conj().T - msp_vu.asnumpy().conj().T @ np.diag(msp_wu.asnumpy()),
    assert np.allclose(sym_Au @ msp_vu.asnumpy() - msp_vu.asnumpy() @ np.diag(msp_wu.asnumpy()),
                       np.zeros((n, n)), rtol, atol)

    # test for complex128

@@ -94,17 +104,24 @@ def test_eigh_net(n: int):
    A = np.array(np.random.rand(n, n), dtype=np.complex128)
    for i in range(0, n):
        for j in range(0, n):

            if i == j:
                A[i][j] = complex(np.random.rand(1, 1), 0)
            else:
                A[i][j] = complex(np.random.rand(1, 1), np.random.rand(1, 1))
    msp_eigh = EighNet(True)
    sym_Al = (np.tril((np.tril(A) - np.tril(A).T)) + np.tril(A).conj().T)
    sym_Au = (np.triu((np.triu(A) - np.triu(A).T)) + np.triu(A).conj().T)
    msp_wl, msp_vl = msp_eigh(Tensor(np.array(sym_Al).astype(np.complex128)), True)
    msp_wu, msp_vu = msp_eigh(Tensor(np.array(sym_Au).astype(np.complex128)), False)
    assert np.allclose(sym_Al @ msp_vl.asnumpy().conj().T - msp_vl.asnumpy().conj().T @ np.diag(msp_wl.asnumpy()),
    msp_eigh = EighNet(True, True)
    msp_wl, msp_vl = msp_eigh(Tensor(np.array(A).astype(np.complex128)))
    msp_eigh = EighNet(True, False)
    msp_wu, msp_vu = msp_eigh(Tensor(np.array(A).astype(np.complex128)))
    assert np.allclose(sym_Al @ msp_vl.asnumpy() - msp_vl.asnumpy() @ np.diag(msp_wl.asnumpy()),
                       np.zeros((n, n)), rtol, atol)
    assert np.allclose(sym_Au @ msp_vu.asnumpy().conj().T - msp_vu.asnumpy().conj().T @ np.diag(msp_wu.asnumpy()),
    assert np.allclose(sym_Au @ msp_vu.asnumpy() - msp_vu.asnumpy() @ np.diag(msp_wu.asnumpy()),
                       np.zeros((n, n)), rtol, atol)
    # test for complex128 no vector
    msp_eigh = EighNet(False, True)
    msp_wl0 = msp_eigh(Tensor(np.array(A).astype(np.complex128)))
    msp_eigh = EighNet(False, False)
    msp_wu0 = msp_eigh(Tensor(np.array(A).astype(np.complex128)))
    assert np.allclose(msp_wl.asnumpy() - msp_wl0.asnumpy(), np.zeros((n, n)), rtol, atol)
    assert np.allclose(msp_wu.asnumpy() - msp_wu0.asnumpy(), np.zeros((n, n)), rtol, atol)
@@ -139,3 +139,91 @@ def test_cholesky_solver(n: int, lower: bool, dtype):
    # pre tensor_a has been inplace.
    tensor_a = Tensor(a)
    assert onp.allclose(onp.dot(a, osp_x), mnp.dot(tensor_a, msp_x).asnumpy())


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
@pytest.mark.parametrize('n', [4, 6, 9, 20])
def test_eigh_solver(n: int):
    """
    Feature: ALL TO ALL
    Description: test cases for the eigenvalue/eigenvector solver for a symmetric/Hermitian matrix [N, N]
    Expectation: the result matches the expected eigen decomposition (A @ v == v @ diag(w))
    """
    # test for real scalar float 32
    rtol = 1e-3
    atol = 1e-4
    A = create_sym_pos_matrix([n, n], onp.float32)
    msp_wl, msp_vl = msp.linalg.eigh(Tensor(onp.array(A).astype(onp.float32)), lower=True, eigvals_only=False)
    msp_wu, msp_vu = msp.linalg.eigh(Tensor(onp.array(A).astype(onp.float32)), lower=False, eigvals_only=False)
    assert onp.allclose(A @ msp_vl.asnumpy() - msp_vl.asnumpy() @ onp.diag(msp_wl.asnumpy()), onp.zeros((n, n)),
                        rtol,
                        atol)
    assert onp.allclose(A @ msp_vu.asnumpy() - msp_vu.asnumpy() @ onp.diag(msp_wu.asnumpy()), onp.zeros((n, n)),
                        rtol,
                        atol)

    # test case for real scalar double 64
    A = create_sym_pos_matrix([n, n], onp.float64)
    rtol = 1e-5
    atol = 1e-8
    msp_wl, msp_vl = msp.linalg.eigh(Tensor(onp.array(A).astype(onp.float64)), lower=True, eigvals_only=False)
    msp_wu, msp_vu = msp.linalg.eigh(Tensor(onp.array(A).astype(onp.float64)), lower=False, eigvals_only=False)
    assert onp.allclose(A @ msp_vl.asnumpy() - msp_vl.asnumpy() @ onp.diag(msp_wl.asnumpy()), onp.zeros((n, n)),
                        rtol,
                        atol)
    assert onp.allclose(A @ msp_vu.asnumpy() - msp_vu.asnumpy() @ onp.diag(msp_wu.asnumpy()), onp.zeros((n, n)),
                        rtol,
                        atol)
    # test for real scalar float64 no vector
    msp_wl0 = msp.linalg.eigh(Tensor(onp.array(A).astype(onp.float64)), lower=True, eigvals_only=True)
    msp_wu0 = msp.linalg.eigh(Tensor(onp.array(A).astype(onp.float64)), lower=False, eigvals_only=True)
    assert onp.allclose(msp_wl.asnumpy() - msp_wl0.asnumpy(), onp.zeros((n, n)), rtol, atol)
    assert onp.allclose(msp_wu.asnumpy() - msp_wu0.asnumpy(), onp.zeros((n, n)), rtol, atol)

    # test case for complex64
    rtol = 1e-3
    atol = 1e-4
    A = onp.array(onp.random.rand(n, n), dtype=onp.complex64)
    for i in range(0, n):
        for j in range(0, n):
            if i == j:
                A[i][j] = complex(onp.random.rand(1, 1), 0)
            else:
                A[i][j] = complex(onp.random.rand(1, 1), onp.random.rand(1, 1))
    sym_Al = (onp.tril((onp.tril(A) - onp.tril(A).T)) + onp.tril(A).conj().T)
    sym_Au = (onp.triu((onp.triu(A) - onp.triu(A).T)) + onp.triu(A).conj().T)
    msp_wl, msp_vl = msp.linalg.eigh(Tensor(onp.array(sym_Al).astype(onp.complex64)), lower=True, eigvals_only=False)
    msp_wu, msp_vu = msp.linalg.eigh(Tensor(onp.array(sym_Au).astype(onp.complex64)), lower=False, eigvals_only=False)
    assert onp.allclose(sym_Al @ msp_vl.asnumpy() - msp_vl.asnumpy() @ onp.diag(msp_wl.asnumpy()),
                        onp.zeros((n, n)), rtol, atol)
    assert onp.allclose(sym_Au @ msp_vu.asnumpy() - msp_vu.asnumpy() @ onp.diag(msp_wu.asnumpy()),
                        onp.zeros((n, n)), rtol, atol)

    # test for complex128
    rtol = 1e-5
    atol = 1e-8
    A = onp.array(onp.random.rand(n, n), dtype=onp.complex128)
    for i in range(0, n):
        for j in range(0, n):

            if i == j:
                A[i][j] = complex(onp.random.rand(1, 1), 0)
            else:
                A[i][j] = complex(onp.random.rand(1, 1), onp.random.rand(1, 1))
    sym_Al = (onp.tril((onp.tril(A) - onp.tril(A).T)) + onp.tril(A).conj().T)
    sym_Au = (onp.triu((onp.triu(A) - onp.triu(A).T)) + onp.triu(A).conj().T)
    msp_wl, msp_vl = msp.linalg.eigh(Tensor(onp.array(sym_Al).astype(onp.complex128)), lower=True, eigvals_only=False)
    msp_wu, msp_vu = msp.linalg.eigh(Tensor(onp.array(sym_Au).astype(onp.complex128)), lower=False, eigvals_only=False)
    assert onp.allclose(sym_Al @ msp_vl.asnumpy() - msp_vl.asnumpy() @ onp.diag(msp_wl.asnumpy()),
                        onp.zeros((n, n)), rtol, atol)
    assert onp.allclose(sym_Au @ msp_vu.asnumpy() - msp_vu.asnumpy() @ onp.diag(msp_wu.asnumpy()),
                        onp.zeros((n, n)), rtol, atol)

    # test for complex128 no vector
    msp_wl0 = msp.linalg.eigh(Tensor(onp.array(sym_Al).astype(onp.complex128)), lower=True, eigvals_only=True)
    msp_wu0 = msp.linalg.eigh(Tensor(onp.array(sym_Au).astype(onp.complex128)), lower=False, eigvals_only=True)
    assert onp.allclose(msp_wl.asnumpy() - msp_wl0.asnumpy(), onp.zeros((n, n)), rtol, atol)
    assert onp.allclose(msp_wu.asnumpy() - msp_wu0.asnumpy(), onp.zeros((n, n)), rtol, atol)