This commit is contained in:
baihuawei 2020-05-28 09:09:56 +08:00
parent 650a45b233
commit 9bcdf4cbdc
15 changed files with 1187 additions and 39 deletions

View File

@ -0,0 +1,120 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kernel/cpu/mkldnn/lstm_cpu_kernel.h"
#include <string>
#include "common/utils.h"
#include "kernel/cpu/mkldnn/mkl_kernel_engine.h"
#include "device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
void LstmCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional");
input_size_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "input_size");
hidden_size_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "hidden_size");
num_layers_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "num_layers");
batch_size_ = SizeToInt(src_shape[1]);
seq_len_ = SizeToInt(src_shape[0]);
num_directions_ = 1;
if (bidirectional_) {
num_directions_ = 2;
}
int gate_size = 4 * hidden_size_;
for (int i = 0; i < num_layers_; ++i) {
weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_);
weight_h_size_ += gate_size * hidden_size_;
}
weight_size_ = weight_size_ * num_directions_;
weight_h_size_ = weight_h_size_ * num_directions_;
}
bool LstmCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> & /*workspace*/,
const std::vector<kernel::AddressPtr> &outputs) {
using dt = dnnl::memory::data_type;
using tag = dnnl::memory::format_tag;
using dim = dnnl::memory::dims;
auto eng = MKLKernelEngine::Get().engine();
dnnl::stream s(eng);
auto formatted_md = [](dim dimensions, tag layout) { return dnnl::memory::desc{{dimensions}, dt::f32, layout}; };
dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
if (bidirectional_) {
direction = dnnl::rnn_direction::bidirectional_concat;
}
dim src_dims = {seq_len_, batch_size_, input_size_};
dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
dim weights_dims = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
dim weights_h_dims = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
dim bias_dims = {num_layers_, num_directions_, 4, hidden_size_};
dim dst_dims = {seq_len_, batch_size_, hidden_size_ * num_directions_};
dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
dnnl::memory::desc weights_desc = formatted_md(weights_dims, tag::ldigo);
dnnl::memory::desc weights_h_desc = formatted_md(weights_h_dims, tag::ldigo);
dnnl::memory::desc bias_desc = formatted_md(bias_dims, tag::ldgo);
dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
dnnl::lstm_forward::desc desc =
dnnl::lstm_forward::desc(dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc,
weights_desc, weights_h_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
auto prim_desc = dnnl::lstm_forward::primitive_desc(desc, MKLKernelEngine::Get().engine());
auto workspace_memory = dnnl::memory(prim_desc.workspace_desc(), eng);
auto src_memory = dnnl::memory(formatted_md(src_dims, tag::tnc), eng);
write_to_dnnl_memory(inputs[0]->addr, src_memory);
auto src_h_memory = dnnl::memory(prim_desc.src_iter_desc(), eng);
auto src_c_memory = dnnl::memory(prim_desc.src_iter_c_desc(), eng);
write_to_dnnl_memory(inputs[1]->addr, src_h_memory);
write_to_dnnl_memory(inputs[2]->addr, src_c_memory);
auto weights_memory = dnnl::memory(formatted_md(weights_dims, tag::ldigo), eng);
auto weights_h_memory = dnnl::memory(formatted_md(weights_h_dims, tag::ldigo), eng);
auto bias_memory = dnnl::memory(formatted_md(bias_dims, tag::ldgo), eng);
write_to_dnnl_memory(inputs[3]->addr, weights_memory);
write_to_dnnl_memory(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_, weights_h_memory);
write_to_dnnl_memory(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_, bias_memory);
auto dst_memory = dnnl::memory(formatted_md(dst_dims, tag::tnc), eng);
auto dst_h_memory = dnnl::memory(prim_desc.dst_iter_desc(), eng);
auto dst_c_memory = dnnl::memory(prim_desc.dst_iter_c_desc(), eng);
dnnl::lstm_forward fw_layer(prim_desc);
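// Bind the kernel outputs directly as DNNL destinations; outputs[3] ("reserve") backs the DNNL workspace.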
workspace_memory.set_data_handle(outputs[3]->addr);
dst_memory.set_data_handle(outputs[0]->addr);
dst_h_memory.set_data_handle(outputs[1]->addr);
dst_c_memory.set_data_handle(outputs[2]->addr);
fw_layer.execute(s, {{DNNL_ARG_SRC_LAYER, src_memory},
{DNNL_ARG_SRC_ITER, src_h_memory},
{DNNL_ARG_SRC_ITER_C, src_c_memory},
{DNNL_ARG_WEIGHTS_LAYER, weights_memory},
{DNNL_ARG_WEIGHTS_ITER, weights_h_memory},
{DNNL_ARG_BIAS, bias_memory},
{DNNL_ARG_DST_LAYER, dst_memory},
{DNNL_ARG_DST_ITER, dst_h_memory},
{DNNL_ARG_DST_ITER_C, dst_c_memory},
{DNNL_ARG_WORKSPACE, workspace_memory}});
return true;
}
} // namespace kernel
} // namespace mindspore
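A minimal NumPy sketch (illustrative only, not part of the commit; function names are ad hoc) of the flat weight buffer this kernel expects in inputs[3]. Launch() copies three slices of that buffer into DNNL memories using the weight_size_ and weight_h_size_ offsets computed in InitKernel(): layer weights (ldigo), then recurrent weights (ldigo), then bias (ldgo).

import numpy as np

def flat_weight_offsets(input_size, hidden_size, num_layers, bidirectional):
    """Reproduce weight_size_ / weight_h_size_ from LstmCPUKernel::InitKernel."""
    num_directions = 2 if bidirectional else 1
    gate_size = 4 * hidden_size
    weight_size = 0
    weight_h_size = 0
    for layer in range(num_layers):
        in_size = input_size if layer == 0 else hidden_size * num_directions
        weight_size += gate_size * in_size
        weight_h_size += gate_size * hidden_size
    weight_size *= num_directions
    weight_h_size *= num_directions
    bias_size = num_layers * num_directions * gate_size
    return weight_size, weight_h_size, bias_size

def split_flat_weights(flat, input_size, hidden_size, num_layers, bidirectional):
    """Slice a flat float32 buffer the same way Launch() offsets inputs[3]."""
    w, wh, b = flat_weight_offsets(input_size, hidden_size, num_layers, bidirectional)
    return flat[:w], flat[w:w + wh], flat[w + wh:w + wh + b]

# Example: the configuration used by the new CPU test (input_size=3, hidden_size=2,
# one unidirectional layer) gives slices of 24, 16 and 8 floats, 48 in total.
flat = np.zeros(sum(flat_weight_offsets(3, 2, 1, False)), dtype=np.float32)
print([part.size for part in split_flat_weights(flat, 3, 2, 1, False)])  # [24, 16, 8]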

View File

@ -0,0 +1,59 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_CPU_LSTM_CPU_KERNEL_H
#define MINDSPORE_CCSRC_KERNEL_CPU_LSTM_CPU_KERNEL_H
#include <vector>
#include <memory>
#include "kernel/cpu/mkldnn/mkl_cpu_kernel.h"
namespace mindspore {
namespace kernel {
class LstmCPUKernel : public MKLCPUKernel {
public:
LstmCPUKernel() = default;
~LstmCPUKernel() override = default;
void InitKernel(const CNodePtr &kernel_node) override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
private:
int weight_size_ = 0;
int weight_h_size_ = 0;
int input_size_;
int hidden_size_;
int num_layers_;
int batch_size_;
int seq_len_;
int num_directions_;
bool bidirectional_;
};
MS_REG_CPU_KERNEL(LSTM,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
LstmCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_CPU_LSTM_CPU_KERNEL_H

View File

@ -0,0 +1,169 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "kernel/cpu/mkldnn/lstm_grad_cpu_kernel.h"
#include <cstring>
#include <cmath>
#include <numeric>
#include <string>
#include "common/utils.h"
#include "kernel/cpu/mkldnn/mkl_kernel_engine.h"
#include "device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
void LSTMGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
bidirectional_ = AnfAlgo::GetNodeAttr<bool>(kernel_node, "bidirectional");
input_size_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "input_size");
hidden_size_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "hidden_size");
num_layers_ = AnfAlgo::GetNodeAttr<int>(kernel_node, "num_layers");
batch_size_ = SizeToInt(src_shape[1]);
seq_len_ = SizeToInt(src_shape[0]);
num_directions_ = 1;
if (bidirectional_) {
num_directions_ = 2;
}
int gate_size = 4 * hidden_size_;
for (int i = 0; i < num_layers_; ++i) {
weight_size_ += gate_size * (i == 0 ? input_size_ : hidden_size_ * num_directions_);
weight_h_size_ += gate_size * hidden_size_;
}
weight_size_ = weight_size_ * num_directions_;
weight_h_size_ = weight_h_size_ * num_directions_;
}
bool LSTMGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> & /*workspace*/,
const std::vector<kernel::AddressPtr> &outputs) {
using tag = dnnl::memory::format_tag;
using dt = dnnl::memory::data_type;
using dim = dnnl::memory::dims;
auto eng = MKLKernelEngine::Get().engine();
dnnl::stream s(eng);
auto formatted_md = [](dim dimensions, tag layout) { return dnnl::memory::desc{{dimensions}, dt::f32, layout}; };
auto generic_md = [](dim dimensions) { return dnnl::memory::desc{{dimensions}, dt::f32, tag::any}; };
dnnl::rnn_direction direction = dnnl::rnn_direction::unidirectional;
if (bidirectional_) {
direction = dnnl::rnn_direction::bidirectional_concat;
}
dim src_dims = {seq_len_, batch_size_, input_size_};
dim src_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
dim src_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
dim weights_dims = {num_layers_, num_directions_, input_size_, 4, hidden_size_};
dim weights_h_dims = {num_layers_, num_directions_, hidden_size_, 4, hidden_size_};
dim bias_dims = {num_layers_, num_directions_, 4, hidden_size_};
dim dst_dims = {seq_len_, batch_size_, hidden_size_ * num_directions_};
dim dst_h_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
dim dst_c_dims = {num_layers_, num_directions_, batch_size_, hidden_size_};
dnnl::memory::desc src_desc = formatted_md(src_dims, tag::tnc);
dnnl::memory::desc src_h_desc = formatted_md(src_h_dims, tag::ldnc);
dnnl::memory::desc src_c_desc = formatted_md(src_c_dims, tag::ldnc);
dnnl::memory::desc weights_desc = formatted_md(weights_dims, tag::ldigo);
dnnl::memory::desc weights_h_desc = formatted_md(weights_h_dims, tag::ldigo);
dnnl::memory::desc bias_desc = formatted_md(bias_dims, tag::ldgo);
dnnl::memory::desc dst_desc = formatted_md(dst_dims, tag::tnc);
dnnl::memory::desc dst_h_desc = formatted_md(dst_h_dims, tag::ldnc);
dnnl::memory::desc dst_c_desc = formatted_md(dst_c_dims, tag::ldnc);
dnnl::lstm_forward::desc forward_desc =
dnnl::lstm_forward::desc(dnnl::prop_kind::forward_training, direction, src_desc, src_h_desc, src_c_desc,
weights_desc, weights_h_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
auto prim_forward_desc = dnnl::lstm_forward::primitive_desc(forward_desc, eng);
dnnl::lstm_backward::desc backward_desc = dnnl::lstm_backward::desc(
dnnl::prop_kind::backward, direction, src_desc, src_h_desc, src_c_desc, generic_md(weights_dims),
generic_md(weights_h_dims), generic_md(bias_dims), dst_desc, dst_h_desc, dst_c_desc, src_desc, src_h_desc,
src_c_desc, weights_desc, weights_h_desc, bias_desc, dst_desc, dst_h_desc, dst_c_desc);
auto prim_backward_desc = dnnl::lstm_backward::primitive_desc(backward_desc, eng, prim_forward_desc);
// construct fw memory
auto src_memory = dnnl::memory(formatted_md(src_dims, tag::tnc), eng);
write_to_dnnl_memory(inputs[0]->addr, src_memory);
auto src_h_memory = dnnl::memory(prim_forward_desc.src_iter_desc(), eng);
auto src_c_memory = dnnl::memory(prim_forward_desc.src_iter_c_desc(), eng);
write_to_dnnl_memory(inputs[1]->addr, src_h_memory);
write_to_dnnl_memory(inputs[2]->addr, src_c_memory);
auto user_weights_memory = dnnl::memory(formatted_md(weights_dims, tag::ldigo), eng);
auto user_weights_h_memory = dnnl::memory(formatted_md(weights_h_dims, tag::ldigo), eng);
auto user_bias_memory = dnnl::memory(formatted_md(bias_dims, tag::ldgo), eng);
write_to_dnnl_memory(inputs[3]->addr, user_weights_memory);
write_to_dnnl_memory(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_, user_weights_h_memory);
write_to_dnnl_memory(reinterpret_cast<float *>(inputs[3]->addr) + weight_size_ + weight_h_size_, user_bias_memory);
auto weights_memory = dnnl::memory(prim_backward_desc.weights_layer_desc(), eng);
auto weights_h_memory = dnnl::memory(prim_backward_desc.weights_iter_desc(), eng);
auto bias_memory = dnnl::memory(prim_forward_desc.bias_desc(), eng);
dnnl::reorder(user_weights_memory, weights_memory).execute(s, user_weights_memory, weights_memory);
dnnl::reorder(user_weights_h_memory, weights_h_memory).execute(s, user_weights_h_memory, weights_h_memory);
dnnl::reorder(user_bias_memory, bias_memory).execute(s, user_bias_memory, bias_memory);
auto dst_memory = dnnl::memory(formatted_md(dst_dims, tag::tnc), eng);
write_to_dnnl_memory(reinterpret_cast<float *>(inputs[4]->addr), dst_memory);
auto dst_h_memory = dnnl::memory(prim_backward_desc.dst_iter_desc(), eng);
write_to_dnnl_memory(reinterpret_cast<float *>(inputs[5]->addr), dst_h_memory);
auto dst_c_memory = dnnl::memory(prim_backward_desc.dst_iter_c_desc(), eng);
write_to_dnnl_memory(reinterpret_cast<float *>(inputs[6]->addr), dst_c_memory);
auto workspace_memory = dnnl::memory(prim_forward_desc.workspace_desc(), eng);
write_to_dnnl_memory(inputs[10]->addr, workspace_memory);
// construct diff memory
auto diff_src_memory = dnnl::memory(formatted_md(src_dims, tag::tnc), eng);
auto diff_src_h_memory = dnnl::memory(prim_backward_desc.diff_src_iter_desc(), eng);
auto diff_src_c_memory = dnnl::memory(prim_backward_desc.diff_src_iter_c_desc(), eng);
auto diff_weights_memory = dnnl::memory(prim_backward_desc.diff_weights_layer_desc(), eng);
auto diff_weights_h_memory = dnnl::memory(prim_backward_desc.diff_weights_iter_desc(), eng);
auto diff_bias_memory = dnnl::memory(prim_backward_desc.diff_bias_desc(), eng);
auto diff_dst_memory = dnnl::memory(formatted_md(dst_dims, tag::tnc), eng);
write_to_dnnl_memory(reinterpret_cast<float *>(inputs[7]->addr), diff_dst_memory);
auto diff_dst_h_memory = dnnl::memory(prim_backward_desc.diff_dst_iter_desc(), eng);
write_to_dnnl_memory(reinterpret_cast<float *>(inputs[8]->addr), diff_dst_h_memory);
auto diff_dst_c_memory = dnnl::memory(prim_backward_desc.diff_dst_iter_c_desc(), eng);
write_to_dnnl_memory(reinterpret_cast<float *>(inputs[9]->addr), diff_dst_c_memory);
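// Gradients are written straight into the kernel outputs; the layer-weight, recurrent-weight and bias
// gradients share outputs[3] at the same offsets used for the packed forward weights.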
diff_src_memory.set_data_handle(outputs[0]->addr);
diff_src_h_memory.set_data_handle(outputs[1]->addr);
diff_src_c_memory.set_data_handle(outputs[2]->addr);
diff_weights_memory.set_data_handle(outputs[3]->addr);
diff_weights_h_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_);
diff_bias_memory.set_data_handle(reinterpret_cast<float *>(outputs[3]->addr) + weight_size_ + weight_h_size_);
dnnl::lstm_backward bwd_layer(prim_backward_desc);
bwd_layer.execute(s, {{DNNL_ARG_SRC_LAYER, src_memory},
{DNNL_ARG_SRC_ITER, src_h_memory},
{DNNL_ARG_SRC_ITER_C, src_c_memory},
{DNNL_ARG_WEIGHTS_LAYER, weights_memory},
{DNNL_ARG_WEIGHTS_ITER, weights_h_memory},
{DNNL_ARG_BIAS, bias_memory},
{DNNL_ARG_DST_LAYER, dst_memory},
{DNNL_ARG_DST_ITER, dst_h_memory},
{DNNL_ARG_DST_ITER_C, dst_c_memory},
{DNNL_ARG_DIFF_SRC_LAYER, diff_src_memory},
{DNNL_ARG_DIFF_SRC_ITER, diff_src_h_memory},
{DNNL_ARG_DIFF_SRC_ITER_C, diff_src_c_memory},
{DNNL_ARG_DIFF_WEIGHTS_LAYER, diff_weights_memory},
{DNNL_ARG_DIFF_WEIGHTS_ITER, diff_weights_h_memory},
{DNNL_ARG_DIFF_BIAS, diff_bias_memory},
{DNNL_ARG_DIFF_DST_LAYER, diff_dst_memory},
{DNNL_ARG_DIFF_DST_ITER, diff_dst_h_memory},
{DNNL_ARG_DIFF_DST_ITER_C, diff_dst_c_memory},
{DNNL_ARG_WORKSPACE, workspace_memory}});
return true;
}
} // namespace kernel
} // namespace mindspore
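For readability, here is an index map (illustrative only, not part of the kernel) of the buffers the backward kernel touches above; the order matches the bprop_cpu added in grad_nn_ops.py, lstm_grad(x, hx, cx, w, y, hy, cy, dy, dhy, dcy, reserve).

LSTM_GRAD_INPUTS = {
    0: "x (src_layer)",
    1: "hx (src_iter)",
    2: "cx (src_iter_c)",
    3: "w (packed: layer weights | recurrent weights | bias)",
    4: "y (dst_layer)",
    5: "hy (dst_iter)",
    6: "cy (dst_iter_c)",
    7: "dy (diff_dst_layer)",
    8: "dhy (diff_dst_iter)",
    9: "dcy (diff_dst_iter_c)",
    10: "reserve (DNNL workspace produced by the forward kernel)",
}
LSTM_GRAD_OUTPUTS = {
    0: "dx",
    1: "dhx",
    2: "dcx",
    3: "dw (packed: diff layer weights | diff recurrent weights | diff bias)",
}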

View File

@ -0,0 +1,67 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_KERNEL_CPU_LSTM_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_KERNEL_CPU_LSTM_GRAD_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include "kernel/cpu/mkldnn/mkl_cpu_kernel.h"
namespace mindspore {
namespace kernel {
class LSTMGradCPUKernel : public MKLCPUKernel {
public:
LSTMGradCPUKernel() = default;
~LSTMGradCPUKernel() override = default;
void InitKernel(const CNodePtr &kernel_node) override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
private:
int weight_size_ = 0;
int weight_h_size_ = 0;
int input_size_;
int hidden_size_;
int num_layers_;
int batch_size_;
int seq_len_;
int num_directions_;
bool bidirectional_;
};
MS_REG_CPU_KERNEL(LSTMGrad,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
LSTMGradCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_KERNEL_CPU_LSTM_GRAD_CPU_KERNEL_H_

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -98,5 +98,11 @@ void MKLCPUKernel::SetArgumentHandle(int arg_key, void *ptr) {
}
void MKLCPUKernel::ExecutePrimitive() { MKLKernelEngine::Get().Execute(primitive_, arguments_); }
void MKLCPUKernel::write_to_dnnl_memory(void *handle, const dnnl::memory &mem) {
MKLKernelEngine::Get().write_to_dnnl_memory(handle, mem);
}
void MKLCPUKernel::read_from_dnnl_memory(void *handle, const dnnl::memory &mem) {
MKLKernelEngine::Get().read_from_dnnl_memory(handle, mem);
}
} // namespace kernel
} // namespace mindspore

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -39,6 +39,8 @@ class MKLCPUKernel : public CPUKernel {
dnnl::memory::format_tag GetDefaultFormatTag(const dnnl::memory::dims &dims) const;
dnnl::memory::desc GetDefaultMemDesc(const std::vector<size_t> &shape);
void ExecutePrimitive();
void write_to_dnnl_memory(void *handle, const dnnl::memory &mem);
void read_from_dnnl_memory(void *handle, const dnnl::memory &mem);
std::unordered_map<int, dnnl::memory> arguments_;
std::shared_ptr<dnnl::primitive> primitive_{nullptr};
};

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -15,7 +15,10 @@
*/
#ifndef MINDSPORE_MKL_KERNEL_ENGINE_H_
#define MINDSPORE_MKL_KERNEL_ENGINE_H_
#include <cstdlib>
#include <algorithm>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>
#include <memory>
@ -39,6 +42,30 @@ class MKLKernelEngine {
void Execute(const std::shared_ptr<dnnl::primitive> &primitive,
const std::unordered_map<int, dnnl::memory> &arguments);
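// Read from memory, write to handle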
inline void read_from_dnnl_memory(void *handle, const dnnl::memory &mem) {
dnnl::engine eng = mem.get_engine();
size_t bytes = mem.get_desc().get_size();
if (eng.get_kind() == dnnl::engine::kind::cpu) {
auto dst = reinterpret_cast<uint8_t *>(handle);
uint8_t *src = reinterpret_cast<uint8_t *>(mem.get_data_handle());
for (size_t i = 0; i < bytes; ++i) {
dst[i] = src[i];
}
}
}
// Read from handle, write to memory
inline void write_to_dnnl_memory(void *handle, const dnnl::memory &mem) {
dnnl::engine eng = mem.get_engine();
size_t bytes = mem.get_desc().get_size();
if (eng.get_kind() == dnnl::engine::kind::cpu) {
auto src = reinterpret_cast<uint8_t *>(handle);
uint8_t *dst = reinterpret_cast<uint8_t *>(mem.get_data_handle());
for (size_t i = 0; i < bytes; ++i) {
dst[i] = src[i];
}
}
}
private:
MKLKernelEngine() : engine_(dnnl::engine::kind::cpu, 0), stream_(engine_) {}
~MKLKernelEngine() = default;

View File

@ -18,8 +18,13 @@ from mindspore.nn.cell import Cell
from mindspore.common.parameter import Parameter
from mindspore.common.initializer import initializer
from mindspore._checkparam import Validator as validator
from mindspore import context
import mindspore.nn as nn
from mindspore.common.tensor import Tensor
import numpy as np
__all__ = ['LSTM', 'LSTMCell']
__all__ = ['LSTM']
class LSTM(Cell):
r"""
@ -102,6 +107,7 @@ class LSTM(Cell):
>>> c0 = Tensor(np.ones([1 * 2, 3, 12]).astype(np.float32))
>>> output, (hn, cn) = net(input, h0, c0)
"""
def __init__(self,
input_size,
hidden_size,
@ -118,39 +124,198 @@ class LSTM(Cell):
self.batch_first = validator.check_value_type("batch_first", batch_first, [bool], self.cls_name)
self.dropout = float(dropout)
self.bidirectional = bidirectional
if self.batch_first:
self.transpose1 = P.Transpose()
self.transpose2 = P.Transpose()
self.lstm = P.LSTM(input_size=self.input_size,
hidden_size=self.hidden_size,
num_layers=self.num_layers,
has_bias=self.has_bias,
bidirectional=self.bidirectional,
dropout=self.dropout)
num_directions = 2 if self.bidirectional else 1
weight_size = 0
gate_size = 4 * self.hidden_size
for layer in range(self.num_layers):
input_layer_size = self.input_size if layer == 0 else self.hidden_size * num_directions
increment_size = gate_size * input_layer_size
increment_size += gate_size * self.hidden_size
if self.has_bias:
increment_size += 2 * gate_size
weight_size += increment_size * num_directions
self.weight = Parameter(initializer(0.0, [weight_size, 1, 1]), name='weight')
self.cpu_target = False
if context.get_context("device_target") == "CPU":
self.cpu_target = True
if not self.cpu_target:
self.lstm = P.LSTM(input_size=self.input_size,
hidden_size=self.hidden_size,
num_layers=self.num_layers,
has_bias=self.has_bias,
bidirectional=self.bidirectional,
dropout=self.dropout)
weight_size = 0
gate_size = 4 * self.hidden_size
for layer in range(self.num_layers):
input_layer_size = self.input_size if layer == 0 else self.hidden_size * num_directions
increment_size = gate_size * input_layer_size
increment_size += gate_size * self.hidden_size
if self.has_bias:
increment_size += 2 * gate_size
weight_size += increment_size * num_directions
self.weight = Parameter(initializer(0.0, [weight_size, 1, 1]), name='weight')
else:
layer = []
layer.append(nn.LSTMCell(input_size=self.input_size,
hidden_size=self.hidden_size,
layer_index=0,
has_bias=self.has_bias,
bidirectional=self.bidirectional,
dropout=self.dropout))
for i in range(num_layers - 1):
layer.append(nn.LSTMCell(input_size=self.hidden_size * num_directions,
hidden_size=self.hidden_size,
layer_index=i + 1,
has_bias=self.has_bias,
bidirectional=self.bidirectional,
dropout=self.dropout))
self.lstms = layer
self.fill = P.Fill()
self.shape = P.Shape()
def construct(self, x, hx):
if self.batch_first:
x = self.transpose1(x, (1, 0, 2))
h0, c0 = hx
output, hn, cn, _, _ = self.lstm(x, h0, c0, self.weight)
if not self.cpu_target:
h, c = hx
output, h, c, _, _ = self.lstm(x, h, c, self.weight)
if self.batch_first:
output = self.transpose2(output, (1, 0, 2))
return (output, (h, c))
h, c = hx
output, hn, cn, _, _ = self.lstms[0](x, h[0], c[0])
for i in range(1, self.num_layers):
output, hn, cn, _, _ = self.lstms[i](output, h[i], c[i])
if self.batch_first:
output = self.transpose2(output, (1, 0, 2))
return (output, (hn, cn))
return output, hn, cn, _, _
class LSTMCell(Cell):
r"""
LSTM (Long Short-Term Memory) layer.
Applies an LSTM layer to the input.
There are two pipelines connecting two consecutive cells in an LSTM model: one is the cell state pipeline
and the other is the hidden state pipeline. Denote two consecutive time steps as :math:`t-1` and :math:`t`.
Given an input :math:`x_t` at time :math:`t`, a hidden state :math:`h_{t-1}` and a cell
state :math:`c_{t-1}` of the layer at time :math:`{t-1}`, the cell state and hidden state at
time :math:`t` are computed using a gating mechanism. Input gate :math:`i_t` is designed to protect the cell
from perturbation by irrelevant inputs. Forget gate :math:`f_t` affords protection of the cell by forgetting
some information in the past, which is stored in :math:`h_{t-1}`. Output gate :math:`o_t` protects other
units from perturbation by currently irrelevant memory contents. Candidate cell state :math:`\tilde{c}_t` is
calculated with the current input, on which the input gate will be applied. Finally, current cell state
:math:`c_{t}` and hidden state :math:`h_{t}` are computed with the calculated gates and cell states. The complete
formulation is as follows.
.. math::
\begin{array}{ll} \\
i_t = \sigma(W_{ix} x_t + b_{ix} + W_{ih} h_{(t-1)} + b_{ih}) \\
f_t = \sigma(W_{fx} x_t + b_{fx} + W_{fh} h_{(t-1)} + b_{fh}) \\
\tilde{c}_t = \tanh(W_{cx} x_t + b_{cx} + W_{ch} h_{(t-1)} + b_{ch}) \\
o_t = \sigma(W_{ox} x_t + b_{ox} + W_{oh} h_{(t-1)} + b_{oh}) \\
c_t = f_t * c_{(t-1)} + i_t * \tilde{c}_t \\
h_t = o_t * \tanh(c_t) \\
\end{array}
Here :math:`\sigma` is the sigmoid function, and :math:`*` is the Hadamard product. :math:`W, b`
are learnable weights between the output and the input in the formula. For instance,
:math:`W_{ix}, b_{ix}` are the weight and bias used to transform from input :math:`x` to :math:`i`.
Details can be found in paper `LONG SHORT-TERM MEMORY
<https://www.bioinf.jku.at/publications/older/2604.pdf>`_ and
`Long Short-Term Memory Recurrent Neural Network Architectures for Large Scale Acoustic Modeling
<https://static.googleusercontent.com/media/research.google.com/zh-CN//pubs/archive/43905.pdf>`_.
Args:
input_size (int): Number of features of input.
hidden_size (int): Number of features of hidden layer.
layer_index (int): Index of the current layer in the stacked LSTM. Default: 0.
has_bias (bool): Specifies whether has bias `b_ih` and `b_hh`. Default: True.
batch_first (bool): Specifies whether the first dimension of input is batch_size. Default: False.
dropout (float, int): If not 0, appends a `Dropout` layer on the outputs of each
LSTM layer except the last layer. Default: 0. The range of dropout is [0.0, 1.0].
bidirectional (bool): Specifies whether this is a bidirectional LSTM. If True,
the number of directions is 2; otherwise it is 1. Default: False.
Inputs:
- **input** (Tensor) - Tensor of shape (seq_len, batch_size, `input_size`).
- **h** (Tensor) - Tensor of data type mindspore.float32 or
mindspore.float16 and shape (num_directions * `num_layers`, batch_size, `hidden_size`).
- **c** (Tensor) - Tensor of data type mindspore.float32 or
mindspore.float16 and shape (num_directions * `num_layers`, batch_size, `hidden_size`).
The data type of `h` and `c` should be the same as that of `input`.
Outputs:
Tuple of five tensors: `output`, `h_n`, `c_n`, `reserve`, `state`.
- **output** (Tensor) - Tensor of shape (seq_len, batch_size, num_directions * `hidden_size`).
- **h_n** (Tensor) - Tensor of shape (num_directions * `num_layers`, batch_size, `hidden_size`).
- **c_n** (Tensor) - Tensor of shape (num_directions * `num_layers`, batch_size, `hidden_size`).
- **reserve** (Tensor) - Reserved.
- **state** (Tensor) - Reserved.
Examples:
>>> class LstmNet(nn.Cell):
>>> def __init__(self, input_size, hidden_size, layer_index, has_bias, batch_first, bidirectional):
>>> super(LstmNet, self).__init__()
>>> self.lstm = nn.LSTMCell(input_size=input_size,
>>> hidden_size=hidden_size,
>>> layer_index=layer_index,
>>> has_bias=has_bias,
>>> batch_first=batch_first,
>>> bidirectional=bidirectional,
>>> dropout=0.0)
>>>
>>> def construct(self, inp, h0, c0):
>>> return self.lstm(inp, (h0, c0))
>>>
>>> net = LstmNet(10, 12, 2, has_bias=True, batch_first=True, bidirectional=False)
>>> input = Tensor(np.ones([3, 5, 10]).astype(np.float32))
>>> h0 = Tensor(np.ones([1 * 2, 3, 12]).astype(np.float32))
>>> c0 = Tensor(np.ones([1 * 2, 3, 12]).astype(np.float32))
>>> output, hn, cn, _, _ = net(input, h0, c0)
"""
def __init__(self,
input_size,
hidden_size,
layer_index=0,
has_bias=True,
batch_first=False,
dropout=0,
bidirectional=False):
super(LSTMCell, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = 1
self.layer_index = layer_index
self.has_bias = has_bias
self.batch_first = validator.check_value_type("batch_first", batch_first, [bool], self.cls_name)
self.dropout = float(dropout)
self.bidirectional = bidirectional
self.num_directions = 1
if self.bidirectional:
self.num_directions = 2
if self.batch_first:
self.transpose1 = P.Transpose()
self.transpose2 = P.Transpose()
w_np = np.ones([(self.input_size + self.hidden_size) * self.num_directions * self.hidden_size * 4, 1]).astype(np.float32) * 0.01
if has_bias:
b_np = np.ones([self.num_directions * self.hidden_size * 4, 1]).astype(np.float32) * 0.01
else:
b_np = np.zeros([self.num_directions * self.hidden_size * 4, 1]).astype(np.float32)
wb_np = np.concatenate((w_np, b_np), axis=0).reshape([-1, 1, 1])
self.w = Parameter(initializer(Tensor(wb_np), wb_np.shape), name='w' + str(self.layer_index))
self.lstm = P.LSTM(input_size=self.input_size,
hidden_size=self.hidden_size,
num_layers=1,
has_bias=self.has_bias,
bidirectional=self.bidirectional,
dropout=self.dropout)
def construct(self, x, h, c):
if self.batch_first:
x = self.transpose1(x, (1, 0, 2))
output, hn, cn, _, _ = self.lstm(x, h, c, self.w)
if self.batch_first:
output = self.transpose2(output, (1, 0, 2))
return output, hn, cn, _, _
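Below is a NumPy reference of one LSTM time step following the gate equations in the LSTMCell docstring above. It is illustrative only: the names, weight shapes and gate stacking are assumptions for the sketch, not the packed layout the DNNL kernel uses internally.

import numpy as np

def _sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def lstm_step(x_t, h_prev, c_prev, w_ih, w_hh, b_ih, b_hh):
    """One step for a single direction.
    x_t: (batch, input_size); h_prev, c_prev: (batch, hidden_size);
    w_ih: (4*hidden, input_size); w_hh: (4*hidden, hidden);
    gates stacked in the order they appear in the equations above: i, f, c~ (g), o."""
    gates = x_t @ w_ih.T + b_ih + h_prev @ w_hh.T + b_hh
    i, f, g, o = np.split(gates, 4, axis=1)
    i, f, o = _sigmoid(i), _sigmoid(f), _sigmoid(o)
    g = np.tanh(g)
    c_t = f * c_prev + i * g
    h_t = o * np.tanh(c_t)
    return h_t, c_t

# Tiny smoke test with zero-initialized states and random weights.
rng = np.random.default_rng(0)
B, I, H = 2, 3, 2
h = c = np.zeros((B, H), dtype=np.float32)
h, c = lstm_step(rng.standard_normal((B, I)), h, c,
                 rng.standard_normal((4 * H, I)), rng.standard_normal((4 * H, H)),
                 np.zeros(4 * H), np.zeros(4 * H))
print(h.shape, c.shape)  # (2, 2) (2, 2)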

View File

@ -49,6 +49,7 @@ def get_bprop_dtype(self):
def bprop(x, out, dout):
return (zeros_like(x),)
return bprop
@ -61,6 +62,7 @@ def get_bprop_cast(self):
def bprop(x, t, out, dout):
dx = cast(dout, get_dtype(x))
return dx, zeros_like(t)
return bprop
@ -70,6 +72,7 @@ def get_bprop_shape(self):
def bprop(x, out, dout):
return (zeros_like(x),)
return bprop
@ -82,6 +85,7 @@ def get_bprop_split(self):
concat_op = P.Concat(axis)
dx = concat_op(dout)
return (dx,)
return bprop
@ -91,6 +95,7 @@ def get_bprop_rank(self):
def bprop(x, out, dout):
return (zeros_like(x),)
return bprop
@ -101,6 +106,7 @@ def get_bprop_reshape(self):
def bprop(x, shp, out, dout):
shapex = shape_op(x)
return reshape(dout, shapex), zeros_like(shp)
return bprop
@ -111,6 +117,7 @@ def get_bprop_expand_dims(self):
def bprop(x, axis, out, dout):
shapex = shape_op(x)
return reshape(dout, shapex), zeros_like(axis)
return bprop
@ -121,6 +128,7 @@ def get_bprop_squeeze(self):
def bprop(x, out, dout):
shapex = shape_op(x)
return (reshape(dout, shapex),)
return bprop
@ -132,6 +140,7 @@ def get_bprop_flatten(self):
def bprop(x, out, dout):
dx = flatten_grad(dout, shape_op(x))
return (dx,)
return bprop
@ -166,6 +175,7 @@ def _tile_shape(multiples, shapex):
@bprop_getters.register(P.Tile)
def get_bprop_tile(self):
"""Generate bprop for Tile"""
def bprop(x, multiples, out, dout):
shapex = shape_op(x)
r_shape = _tile_shape(multiples, shapex)
@ -174,6 +184,7 @@ def get_bprop_tile(self):
dx = reduce_sum(reshape(dout, r_shape), axis)
dx = reshape(dx, shapex)
return dx, zeros_like(multiples)
return bprop
@ -183,6 +194,7 @@ def get_bprop_transpose(self):
def bprop(x, perm, out, dout):
return transpose(dout, invert_permutation(perm)), zeros_like(perm)
return bprop
@ -198,6 +210,7 @@ def get_bprop_concat(self):
slice_out = P.Slice()(dout, out_offset[i], shape_op(x[i]))
dx = dx + (slice_out,)
return (dx,)
return bprop
@ -215,12 +228,12 @@ def get_bprop_slice(self):
dx = P.Pad(_slice_grad_pad(begin, size, shape_op(x)))(dout)
return (dx, zeros_like(begin), zeros_like(size))
def bprop_gpu(x, begin, size, out, dout):
def bprop_grad(x, begin, size, out, dout):
dx = G.SliceGrad()(dout, x, begin, size)
return (dx, zeros_like(begin), zeros_like(size))
if context.get_context('device_target') == "GPU":
return bprop_gpu
if context.get_context('device_target') == "GPU" or context.get_context('device_target') == "CPU":
return bprop_grad
return bprop
@ -249,6 +262,7 @@ def _generate_inverse_index(x_shape, axis):
@bprop_getters.register(P.GatherV2)
def get_bprop_gather_v2(self):
"""Generate bprop for GatherV2"""
def bprop(x, indices, axis, out, dout):
if F.rank(dout) == 0:
dout = P.ExpandDims()(dout, -1)
@ -265,6 +279,7 @@ def get_bprop_gather_v2(self):
perm_2 = _generate_inverse_index(x_shp, axis)
params_grad = transpose(params_grad, perm_2)
return params_grad, zeros_like(indices), zeros_like(axis)
return bprop
@ -286,6 +301,7 @@ def get_bprop_pack(self):
pack_grad = P.Unpack(axis)
out = pack_grad(dout)
return (out,)
return bprop
@ -298,6 +314,7 @@ def get_bprop_unpack(self):
unpack_grad = P.Pack(axis)
out = unpack_grad(dout)
return (out,)
return bprop
@ -313,6 +330,7 @@ def get_bprop_strided_slice(self):
def bprop(x, begin, end, strides, out, dout):
dx = input_grad(dout, shape_op(x), begin, end, strides)
return dx, zeros_like(begin), zeros_like(end), zeros_like(strides)
return bprop
@ -322,6 +340,7 @@ def get_bprop_eye(self):
def bprop(n, m, t, out, dout):
return zeros_like(n), zeros_like(m), zeros_like(t)
return bprop
@ -332,6 +351,7 @@ def get_bprop_select(self):
def bprop(cond, x, y, out, dout):
return zeros_like(cond), select(cond, dout, zeros_like(x)), select(cond, zeros_like(y), dout)
return bprop
@ -522,9 +542,11 @@ def get_bprop_unsorted_segment_min(self):
def get_bprop_space_to_batch(self):
"""Generate bprop for SpaceToBatch"""
space_to_batch_grad = P.BatchToSpace(self.block_size, self.paddings)
def bprop(x, out, dout):
dx = space_to_batch_grad(dout)
return (dx,)
return bprop
@ -532,7 +554,9 @@ def get_bprop_space_to_batch(self):
def get_bprop_batch_to_space(self):
"""Generate bprop for BatchToSpace"""
batch_to_space_grad = P.SpaceToBatch(self.block_size, self.crops)
def bprop(x, out, dout):
dx = batch_to_space_grad(dout)
return (dx,)
return bprop

View File

@ -15,7 +15,6 @@
"""Define the grad rules of math related operations."""
from functools import reduce
import numpy as np
from .. import functional as F
@ -26,7 +25,6 @@ from ..functional import broadcast_gradient_args, reduced_shape, tuple_div
from .grad_base import bprop_getters
from ..primitive import constexpr
shape_op = P.Shape()
reduce_sum = P.ReduceSum()
reshape = P.Reshape()
@ -129,6 +127,7 @@ def bprop_matmul(self):
else:
dw = mul2(x, dout)
return dx, dw
return bprop
@ -152,6 +151,7 @@ def bprop_batchmatmul(self):
else:
dw = mul2(x, dout)
return dx, dw
return bprop
@ -161,6 +161,7 @@ def get_bprop_tensor_add(self):
def bprop(x, y, out, dout):
return binop_grad_common(x, y, dout, dout)
return bprop
@ -172,6 +173,7 @@ def get_bprop_neg(self):
def bprop(x, out, dout):
dx = neg_grad(dout)
return (dx,)
return bprop
@ -182,6 +184,7 @@ def get_bprop_sub(self):
def bprop(x, y, out, dout):
return binop_grad_common(x, y, dout, neg_func(dout))
return bprop
@ -194,6 +197,7 @@ def get_bprop_mul(self):
bc_dx = mul_func(dout, y)
bc_dy = mul_func(dout, x)
return binop_grad_common(x, y, bc_dx, bc_dy)
return bprop
@ -208,6 +212,7 @@ def get_bprop_real_div(self):
bc_x = div_op(dout, y)
bc_y = neg(mul_op(bc_x, out))
return binop_grad_common(x, y, bc_x, bc_y)
return bprop
@ -222,6 +227,7 @@ def get_bprop_div(self):
bc_x = div_op(dout, y)
bc_y = neg(mul_op(bc_x, out))
return binop_grad_common(x, y, bc_x, bc_y)
return bprop
@ -235,6 +241,7 @@ def get_bprop_floor(self):
def bprop(x, out, dout):
bc_x = fill_(dtype_(x), shape_(x), 0.)
return (bc_x,)
return bprop
@ -249,6 +256,7 @@ def get_bprop_floordiv(self):
bc_x = div_op(dout, y)
bc_y = neg(mul_op(bc_x, out))
return binop_grad_common(x, y, bc_x, bc_y)
return bprop
@ -260,6 +268,7 @@ def get_bprop_floormod(self):
bc_x = dout
bc_y = -dout * (x // y)
return binop_grad_common(x, y, bc_x, bc_y)
return bprop
@ -274,6 +283,7 @@ def get_bprop_square(self):
temp = mul_func(dout, x)
dx = mul_func(fill_func(dtype(temp), shape_op(x), 2.0), temp)
return (dx,)
return bprop
@ -290,6 +300,7 @@ def get_bprop_sqrt(self):
temp = div_op(fill_func(dtype(x), shape_op(x), 0.5), sqrt(x))
dx = mul_func(dout, temp)
return (dx,)
return bprop
@ -298,9 +309,10 @@ def get_bprop_rsqrt(self):
"""Grad definition for `Rsqrt` operation."""
def bprop(x, out, dout):
grad = F.fill(F.dtype(x), F.shape(x), -0.5) / (F.sqrt(x)*x)
grad = F.fill(F.dtype(x), F.shape(x), -0.5) / (F.sqrt(x) * x)
dx = dout * grad
return (dx,)
return bprop
@ -316,6 +328,7 @@ def get_bprop_reciprocal(self):
g = neg(reciprocal(square(x)))
dx = mul(dout, g)
return (dx,)
return bprop
@ -328,6 +341,7 @@ def get_bprop_log(self):
g = reciprocal(x)
dx = g * dout
return dx, 0
return bprop
@ -341,6 +355,7 @@ def get_bprop_log1p(self):
g = reciprocal(x_1p)
dx = g * dout
return dx, 0
return bprop
@ -358,6 +373,7 @@ def get_bprop_erf(self):
x_square = square(x)
dx = dout * half_root_pi * exp(-x_square)
return (dx,)
return bprop
@ -388,6 +404,7 @@ def get_bprop_pow(self):
bc_dx = power * pow_op(x, power - 1.0) * dout
bc_dpower = out * ln(x) * dout
return binop_grad_common(x, power, bc_dx, bc_dpower)
return bprop
@ -400,6 +417,7 @@ def get_bprop_exp(self):
g = exp_(x)
dx = g * dout
return (dx,)
return bprop
@ -411,6 +429,7 @@ def get_bprop_minimum(self):
def bprop(x, y, out, dout):
dx, dy = input_grad(x, y, dout)
return dx, dy
return bprop
@ -422,6 +441,7 @@ def get_bprop_maximum(self):
def bprop(x, y, out, dout):
dx, dy = input_grad(x, y, dout)
return dx, dy
return bprop
@ -432,6 +452,7 @@ def get_bprop_reducesum(self):
def bprop(x, axis, out, dout):
dx = _sum_grad(x, axis, dout)
return dx, zeros_like(axis)
return bprop
@ -442,6 +463,7 @@ def get_bprop_cumsum(self):
def bprop(x, axis, out, dout):
return cumsum(dout, axis), zeros_like(axis)
return bprop
@ -500,6 +522,7 @@ def get_bprop_reduceprod(self):
out = transpose(y, _invert_permutation(perm)) * grad
dx = reshape(out, input_shape)
return dx, zeros_like(axis)
return bprop
@ -515,6 +538,7 @@ def get_bprop_cumprod(self):
prod = cumprod(x, axis)
out = cumsum(prod * dout, axis)
return out / x, zeros_like(axis)
return bprop
@ -524,6 +548,7 @@ def get_bprop_reduceall(self):
def bprop(x, axis, out, dout):
return zeros_like(x), zeros_like(axis)
return bprop
@ -534,6 +559,7 @@ def get_bprop_reducemax(self):
def bprop(x, axis, out, dout):
dx = _min_or_max_grad(x, axis, out, dout)
return (dx, zeros_like(axis))
return bprop
@ -547,6 +573,7 @@ def get_bprop_argmaxwithvalue(self):
def bprop(x, out, dout):
dx = _argmin_or_argmax_grad(x, axis, keep_dims, op, out, dout)
return (dx,)
return bprop
@ -557,6 +584,7 @@ def get_bprop_reducemin(self):
def bprop(x, axis, out, dout):
dx = _min_or_max_grad(x, axis, out, dout)
return (dx, zeros_like(axis))
return bprop
@ -570,6 +598,7 @@ def get_bprop_argminwithvalue(self):
def bprop(x, out, dout):
dx = _argmin_or_argmax_grad(x, axis, keep_dims, op, out, dout)
return (dx,)
return bprop
@ -585,6 +614,7 @@ def get_bprop_reduce_mean(self):
div_shape = F.shape_mul(shape_op(x)) / F.shape_mul(shape_op(out))
dx = div_op(grad, cast(F.scalar_to_array(div_shape), dtype(grad)))
return dx, zeros_like(axis)
return bprop
@ -604,6 +634,7 @@ def get_bprop_not_equal(self):
def bprop(x, y, out, dout):
return zeros_like(x), zeros_like(y)
return bprop
@ -613,6 +644,7 @@ def get_bprop_greater(self):
def bprop(x, y, out, dout):
return zeros_like(x), zeros_like(y)
return bprop
@ -622,6 +654,7 @@ def get_bprop_greater_equal(self):
def bprop(x, y, out, dout):
return zeros_like(x), zeros_like(y)
return bprop
@ -631,6 +664,7 @@ def get_bprop_less(self):
def bprop(x, y, out, dout):
return zeros_like(x), zeros_like(y)
return bprop
@ -640,6 +674,7 @@ def get_bprop_less_equal(self):
def bprop(x, y, out, dout):
return zeros_like(x), zeros_like(y)
return bprop
@ -649,6 +684,7 @@ def get_bprop_logical_not(self):
def bprop(x, out, dout):
return (zeros_like(x),)
return bprop
@ -658,6 +694,7 @@ def get_bprop_logical_and(self):
def bprop(x, y, out, dout):
return zeros_like(x), zeros_like(y)
return bprop
@ -667,6 +704,7 @@ def get_bprop_logical_or(self):
def bprop(x, y, out, dout):
return zeros_like(x), zeros_like(y)
return bprop
@ -676,6 +714,7 @@ def get_bprop_npu_alloc_float_status(self):
def bprop(out, dout):
return ()
return bprop
@ -685,6 +724,7 @@ def get_bprop_npu_get_float_status(self):
def bprop(x, out, dout):
return (zeros_like(x),)
return bprop
@ -694,6 +734,7 @@ def get_bprop_npu_clear_float_status(self):
def bprop(x, out, dout):
return (zeros_like(x),)
return bprop
@ -703,6 +744,7 @@ def get_bprop_assign_add(self):
def bprop(x, y, out, dout):
return zeros_like(x), zeros_like(y)
return bprop
@ -712,6 +754,7 @@ def get_bprop_assign_sub(self):
def bprop(x, y, out, dout):
return zeros_like(x), zeros_like(y)
return bprop
@ -721,8 +764,9 @@ def get_bprop_sin(self):
cos = P.Cos()
def bprop(x, out, dout):
dx = dout*cos(x)
dx = dout * cos(x)
return (dx,)
return bprop
@ -733,8 +777,9 @@ def get_bprop_cos(self):
neg = P.Neg()
def bprop(x, out, dout):
dx = dout*neg(sin(x))
dx = dout * neg(sin(x))
return (dx,)
return bprop
@ -746,6 +791,7 @@ def get_bprop_acos(self):
def bprop(x, out, dout):
dx = input_grad(x, dout)
return (dx,)
return bprop
@ -757,6 +803,7 @@ def get_bprop_acosh(self):
def bprop(x, out, dout):
dx = input_grad(out, dout)
return (dx,)
return bprop
@ -768,6 +815,7 @@ def get_bprop_abs(self):
def bprop(x, out, dout):
dx = abs_grad(x, dout)
return (dx,)
return bprop
@ -777,6 +825,7 @@ def get_bprop_scalar_cast(self):
def bprop(x, t, out, dout):
return F.scalar_cast(dout, F.typeof(x)), zeros_like(t)
return bprop
@ -789,6 +838,7 @@ def get_bprop_scalar_addn(self):
for _ in range(len(x)):
dx = dx + (dout,)
return dx
return bprop
@ -798,6 +848,7 @@ def get_bprop_sign(self):
def bprop(x, out, dout):
return (zeros_like(x),)
return bprop
@ -807,6 +858,7 @@ def get_bprop_round(self):
def bprop(x, out, dout):
return (zeros_like(x),)
return bprop
@ -821,4 +873,5 @@ def get_bprop_atan2(self):
bc_dx = tmp * y
bc_dy = tmp * (-x)
return binop_grad_common(x, y, bc_dx, bc_dy)
return bprop

View File

@ -21,6 +21,7 @@ from ..operations import _grad_ops as G
from ..operations import _inner_ops as inner
from ..composite.multitype_ops.zeros_like_impl import zeros_like
from .grad_base import bprop_getters
from ... import context
@bprop_getters.register(P.BiasAdd)
@ -551,6 +552,14 @@ def get_bprop_lstm(self):
bidirectional=self.bidirectional,
dropout=self.dropout
)
lstm_grad = G.LSTMGrad(
input_size=self.input_size,
hidden_size=self.hidden_size,
num_layers=self.num_layers,
has_bias=self.has_bias,
bidirectional=self.bidirectional,
dropout=self.dropout
)
def bprop(x, hx, cx, w, out, dout):
y, _, _, reserve, state = out
@ -559,6 +568,16 @@ def get_bprop_lstm(self):
dw = lstm_grad_weight(F.depend(x, dx), hx, y, reserve, state)
return dx, dhx, dcx, dw
#
def bprop_cpu(x, hx, cx, w, out, dout):
y, hy, cy, reserve, _ = out
dy, dhy, dcy, _, _ = dout
dx, dhx, dcx, dw = lstm_grad(x, hx, cx, w, y, hy, cy, dy, dhy, dcy, reserve)
return dx, dhx, dcx, dw
if context.get_context('device_target') == "CPU":
return bprop_cpu
return bprop

View File

@ -107,6 +107,7 @@ class BiasAddGrad(Primitive):
class BinaryCrossEntropyGrad(PrimitiveWithInfer):
"""Computes gradients for `BinaryCrossEntropy` operation."""
@prim_attr_register
def __init__(self, reduction='mean'):
self.reduction = validator.check_string('reduction', reduction, ['none', 'mean', 'sum'], self.name)
@ -665,6 +666,62 @@ class LSTMGradWeight(PrimitiveWithInfer):
return hx_dtype
class LSTMGrad(PrimitiveWithInfer):
"""Computes the data and weight gradients of LSTM."""
@prim_attr_register
def __init__(self, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout):
self.input_size = validator.check_integer('input_size', input_size, 0, Rel.GT, self.name)
self.hidden_size = validator.check_integer('hidden_size', hidden_size, 0, Rel.GT, self.name)
self.num_layers = validator.check_integer('num_layers', num_layers, 0, Rel.GT, self.name)
self.has_bias = validator.check_value_type('has_bias', has_bias, (bool,), self.name)
self.bidirectional = validator.check_value_type('bidirectional', bidirectional, (bool,), self.name)
self.dropout = validator.check_value_type("dropout", dropout, [float], self.name)
self.dropout = validator.check_number_range('dropout', dropout, 0, 1, Rel.INC_BOTH, self.name)
if bidirectional:
self.num_directions = 2
else:
self.num_directions = 1
def infer_shape(self, x_shape, hx_shape, cx_shape, w_shape, y_shape, hy_shape, cy_shape, dy_shape, dhy_shape,
dcy_shape, reserve_shape):
# dhy and dcy should have the same shape
validator.check_integer("h_shape", len(dhy_shape), 3, Rel.EQ, self.name)
validator.check_integer("h_shape", len(dhy_shape), len(dcy_shape), Rel.EQ, self.name)
validator.check_integer("h_shape[0]", dhy_shape[0], dcy_shape[0], Rel.EQ, self.name)
validator.check_integer("h_shape[1]", dhy_shape[1], dcy_shape[1], Rel.EQ, self.name)
validator.check_integer("h_shape[2]", dhy_shape[2], dcy_shape[2], Rel.EQ, self.name)
validator.check_integer("h_shape[0]", dhy_shape[0], self.num_layers * self.num_directions, Rel.EQ, self.name)
validator.check_integer("h_shape[2]", dhy_shape[2], self.hidden_size, Rel.EQ, self.name)
# dy: (seq_len, batch_size, hidden_size * num_directions)
validator.check_integer("dy_shape", len(dy_shape), 3, Rel.EQ, self.name)
validator.check_integer("dy[1]", dy_shape[1], dhy_shape[1], Rel.EQ, self.name)
validator.check_integer("dy[2]", dy_shape[2], self.hidden_size * self.num_directions, Rel.EQ, self.name)
# (seq_len, batch_size, input_size)
dx_shape = (y_shape[0], y_shape[1], self.input_size)
dhx_shape = dhy_shape
dcx_shape = dcy_shape
weight_size = 0
gate_size = 4 * self.hidden_size
for layer in range(self.num_layers):
for _ in range(self.num_directions):
input_layer_size = self.input_size if layer == 0 else self.hidden_size * self.num_directions
weight_size += gate_size * input_layer_size
weight_size += gate_size * self.hidden_size
if self.has_bias:
weight_size += gate_size
return (dx_shape, dhx_shape, dcx_shape, (weight_size, 1, 1))
def infer_dtype(self, x_dtype, hx_dtype, cx_dtype, w_dtype, y_dtype, hy_dtype, cy_dtype, dy_dtype, dhy_dtype,
dcy_dtype, reserve_dtype):
return (dy_dtype, dy_dtype, dy_dtype, hx_dtype)
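As a quick sanity check of infer_shape's weight accounting (illustrative, outside the operator): for the configuration exercised by the new CPU test, the flattened gradient buffer dw packs 4*2*3 + 4*2*2 + 4*2 = 48 floats, matching the [layer weights | recurrent weights | bias] layout the CPU kernels use.

def lstm_grad_weight_rows(input_size, hidden_size, num_layers, num_directions, has_bias):
    """Reproduce the weight_size loop from LSTMGrad.infer_shape above."""
    gate_size = 4 * hidden_size
    weight_size = 0
    for layer in range(num_layers):
        for _ in range(num_directions):
            in_size = input_size if layer == 0 else hidden_size * num_directions
            weight_size += gate_size * in_size + gate_size * hidden_size
            if has_bias:
                weight_size += gate_size
    return weight_size

assert lstm_grad_weight_rows(3, 2, 1, 1, True) == 48  # dw shape (48, 1, 1)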
class PReLUGrad(PrimitiveWithInfer):
r"""
Gradients of PReLU operation.
@ -1051,6 +1108,7 @@ class RefToEmbed(Primitive):
__mindspore_signature__ = (
('variable', sig_rw.RW_REF, sig_kind.KIND_POSITIONAL_KEYWORD),
)
@prim_attr_register
def __init__(self):
pass

View File

@ -35,9 +35,11 @@ def _check_positive_int_or_tuple(arg_name, arg_value, prim_name, allow_four=Fals
"""
Checks whether an argument is a positive int or a tuple with 2 or 4 (when allow_four is True) positive int elements.
"""
def _raise_message():
raise ValueError(f"For '{prim_name}' attr '{arg_name}' should be a positive int number or a tuple of two "
f"{'or four ' if allow_four else ''}positive int numbers, but got {arg_value}")
def _get_return_value():
if isinstance(arg_value, int):
ret = (1, 1, arg_value, arg_value) if ret_four else (arg_value, arg_value)
@ -50,6 +52,7 @@ def _check_positive_int_or_tuple(arg_name, arg_value, prim_name, allow_four=Fals
else:
_raise_message()
return ret
validator.check_value_type(arg_name, arg_value, (int, tuple), prim_name)
ret_value = _get_return_value()
for item in ret_value:
@ -58,6 +61,7 @@ def _check_positive_int_or_tuple(arg_name, arg_value, prim_name, allow_four=Fals
_raise_message()
return ret_value
class Flatten(PrimitiveWithInfer):
r"""
Flattens a tensor without changing its batch size on the 0-th axis.
@ -205,6 +209,7 @@ class Softplus(PrimitiveWithInfer):
>>> softplus(input_x)
[1.3132615, 2.126928, 3.0485873, 4.01815, 5.0067153]
"""
@prim_attr_register
def __init__(self):
"""init Softplus"""
@ -301,6 +306,7 @@ class ReLUV2(PrimitiveWithInfer):
([[[[1., 0.], [0., 4.]], [[0., 6.], [7., 0.]]]],
[[[[1, 0], [2, 0]], [[2, 0], [1, 0]]]])
"""
@prim_attr_register
def __init__(self):
"""init ReLUV2"""
@ -398,6 +404,7 @@ class HSwish(PrimitiveWithInfer):
>>> input_x = Tensor(np.array([-1, -2, 0, 2, 1]), mindspore.float16)
>>> result = hswish(input_x)
"""
@prim_attr_register
def __init__(self):
self.init_prim_io_names(inputs=['x'], outputs=['output'])
@ -1077,6 +1084,7 @@ class MaxPoolWithArgmax(_Pool):
>>> maxpool_arg_op = P.MaxPoolWithArgmax(padding="VALID", ksize=2, strides=1)
>>> output_tensor, argmax = maxpool_arg_op(input_tensor)
"""
def __init__(self, ksize=1, strides=1, padding="valid"):
super(MaxPoolWithArgmax, self).__init__(ksize, strides, padding)
self.is_tbe = context.get_context("device_target") == "Ascend"
@ -1495,6 +1503,7 @@ class ApplyMomentum(PrimitiveWithInfer):
('gradient', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD),
('momentum', sig_rw.RW_READ, sig_kind.KIND_POSITIONAL_KEYWORD)
)
@prim_attr_register
def __init__(self, use_nesterov=False, use_locking=False, gradient_scale=1.0):
self.init_prim_io_names(inputs=['variable', 'accumulation', 'learning_rate', 'gradient', 'momentum'],
@ -1584,6 +1593,7 @@ class L2Loss(PrimitiveWithInfer):
>>> l2_loss(input_x)
7.0
"""
@prim_attr_register
def __init__(self):
"""init L2Loss"""
@ -2326,7 +2336,29 @@ class LSTM(PrimitiveWithInfer):
y_shape = (x_shape[0], x_shape[1], self.hidden_size * self.num_directions)
# set arbitrary shape for reserved space
reserved_shape = (1, 1)
type_size = 4
gates_ws_ld = self.get_good_ld(self.hidden_size * 4, type_size)
states_ws_ld = self.get_good_ld(max(self.hidden_size, self.input_size), type_size)
self.ws_gates_size = self.num_layers * self.num_directions * x_shape[0] * x_shape[1] * gates_ws_ld * type_size
self.ws_states_size = (self.num_layers + 1) * self.num_directions * (x_shape[0] + 1) * x_shape[1] * states_ws_ld * type_size
self.ws_c_states_size = (self.num_layers + 1) * self.num_directions * (x_shape[0] + 1) * x_shape[1] * states_ws_ld * type_size
self.ws_diff_states_size = (self.num_layers + 1) * self.num_directions * (x_shape[0] + 1) * (2 + 1) * x_shape[1] * states_ws_ld * type_size
self.ws_grid_comp_size = 0
self.page_size = 4096
current_offset = 0
current_offset += self.ws_gates_size
current_offset = self.rnd_up(current_offset, self.page_size)
current_offset += self.ws_states_size
current_offset = self.rnd_up(current_offset, self.page_size)
current_offset += self.ws_c_states_size
current_offset = self.rnd_up(current_offset, self.page_size)
current_offset += self.ws_diff_states_size
current_offset = self.rnd_up(current_offset, self.page_size)
current_offset += self.ws_grid_comp_size
reserved_shape = (current_offset, 1)
state_shape = (1, 1)
return (y_shape, h_shape, c_shape, reserved_shape, state_shape)
@ -2335,6 +2367,15 @@ class LSTM(PrimitiveWithInfer):
validator.check_tensor_type_same(args, (mstype.float32, mstype.float16), self.name)
return (x_dtype, x_dtype, x_dtype, x_dtype, x_dtype)
def rnd_up(self, current_offset, page_size):
return ((current_offset + page_size - 1) // page_size) * page_size
def get_good_ld(self, dim, type_size):
ld = self.rnd_up(dim, 64 // type_size)
if ld % 256 == 0:
return ld + 64 // type_size
return ld
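The reserved-space computation above mirrors the way DNNL sizes its LSTM workspace (gates, states, cell states and diff states, each rounded up to a 4 KiB page). A standalone sketch, with ad hoc names and assuming the modulo form of get_good_ld, for checking the arithmetic on a concrete configuration:

def lstm_reserved_rows(seq_len, batch_size, input_size, hidden_size,
                       num_layers, num_directions, type_size=4, page_size=4096):
    def rnd_up(offset, multiple):
        return ((offset + multiple - 1) // multiple) * multiple

    def get_good_ld(dim):
        ld = rnd_up(dim, 64 // type_size)
        return ld + 64 // type_size if ld % 256 == 0 else ld

    gates_ld = get_good_ld(hidden_size * 4)
    states_ld = get_good_ld(max(hidden_size, input_size))
    ws_gates = num_layers * num_directions * seq_len * batch_size * gates_ld * type_size
    ws_states = (num_layers + 1) * num_directions * (seq_len + 1) * batch_size * states_ld * type_size
    ws_diff_states = (num_layers + 1) * num_directions * (seq_len + 1) * 3 * batch_size * states_ld * type_size
    offset = 0
    for size in (ws_gates, ws_states, ws_states, ws_diff_states):  # c_states has the same size as states
        offset = rnd_up(offset + size, page_size)
    return offset  # ws_grid_comp_size is 0 above

# seq_len=5, batch=2, input=3, hidden=2, one unidirectional layer, as in test_lstm
print(lstm_reserved_rows(5, 2, 3, 2, 1, 1))  # 20480, i.e. reserved_shape (20480, 1)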
class SigmoidCrossEntropyWithLogits(PrimitiveWithInfer):
r"""
@ -3000,6 +3041,7 @@ class Dropout(PrimitiveWithInfer):
>>> x = Tensor((20, 16, 50, 50))
>>> out = dropout(x)
"""
@prim_attr_register
def __init__(self, drop_prob=0):
self.drop_prob = validator.check_number_range("drop_prob", drop_prob, 0, 1, Rel.INC_BOTH, self.name)
@ -3034,6 +3076,7 @@ class DropoutGrad(PrimitiveWithInfer):
>>> x = Tensor((20, 16, 50, 50))
>>> out = dropout_grad(x)
"""
@prim_attr_register
def __init__(self, drop_prob=0):
self.drop_prob = validator.check_number_range("drop_prob", drop_prob, 0, 1, Rel.INC_BOTH, self.name)
@ -3084,6 +3127,7 @@ class CTCLoss(PrimitiveWithInfer):
>>> ctc_loss = P.CTCLoss()
>>> output = ctc_loss(inputs, labels_indices, labels_values, sequence_length)
"""
@prim_attr_register
def __init__(self, preprocess_collapse_repeated=False, ctc_merge_repeated=False,
ignore_longer_outputs_than_inputs=False):

View File

@ -0,0 +1,335 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import pytest
import mindspore.nn as nn
from mindspore.common.api import ms_function
import numpy as np
import mindspore.context as context
from mindspore.common.initializer import initializer
from mindspore.ops import composite as C
from mindspore.ops import operations as P
from mindspore.common.tensor import Tensor
from mindspore.common.parameter import ParameterTuple, Parameter
context.set_context(device_target='CPU')
class LstmNet(nn.Cell):
def __init__(self, seq_len, batch_size, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout):
super(LstmNet, self).__init__()
num_directions = 1
if bidirectional:
num_directions = 2
self.lstm = P.LSTM(input_size, hidden_size, num_layers, has_bias, bidirectional, dropout)
input_np = np.array([[[0.6755, -1.6607, 0.1367], [0.4276, -0.7850, -0.3758]],
[[-0.6424, -0.6095, 0.6639], [0.7918, 0.4147, -0.5089]],
[[-1.5612, 0.0120, -0.7289], [-0.6656, -0.6626, -0.5883]],
[[-0.9667, -0.6296, -0.7310], [0.1026, -0.6821, -0.4387]],
[[-0.4710, 0.6558, -0.3144], [-0.8449, -0.2184, -0.1806]]
]).astype(np.float32)
self.x = Parameter(initializer(Tensor(input_np), [seq_len, batch_size, input_size]), name='x')
self.h = Parameter(initializer(
Tensor(
np.array([0.1, 0.1, 0.1, 0.1]).reshape((num_layers * num_directions, batch_size, hidden_size)).astype(
np.float32)),
[num_layers * num_directions, batch_size, hidden_size]), name='h')
self.c = Parameter(initializer(
Tensor(
np.array([0.2, 0.2, 0.2, 0.2]).reshape((num_layers * num_directions, batch_size, hidden_size)).astype(
np.float32)),
[num_layers * num_directions, batch_size, hidden_size]), name='c')
wih = np.array([[3.4021e-01, -4.6622e-01, 4.5117e-01],
[-6.4257e-02, -2.4807e-01, 1.3550e-02], # i
[-3.2140e-01, 5.5578e-01, 6.3589e-01],
[1.6547e-01, -7.9030e-02, -2.0045e-01],
[-6.9863e-01, 5.9773e-01, -3.9062e-01],
[-3.0253e-01, -1.9464e-01, 7.0591e-01],
[-4.0835e-01, 3.6751e-01, 4.7989e-01],
[-5.6894e-01, -5.0359e-01, 4.7491e-01]]).astype(np.float32) # .reshape([1,-1])
whh = np.array([[-0.4820, -0.2350],
[-0.1195, 0.0519],
[0.2162, -0.1178],
[0.6237, 0.0711],
[0.4511, -0.3961],
[-0.5962, 0.0906],
[0.1867, -0.1225],
[0.1831, 0.0850]]).astype(np.float32) # .reshape([1,-1])
wih = wih.transpose((1, 0))
whh = whh.transpose((1, 0))
bih = np.zeros((1, 8)).astype(np.float32)
w_np = np.concatenate((wih, whh, bih), axis=0).reshape([-1, 1, 1])
self.w = Parameter(initializer(Tensor(w_np), w_np.shape), name='w')
@ms_function
def construct(self):
return self.lstm(self.x, self.h, self.c, self.w)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_lstm():
seq_len = 5
batch_size = 2
input_size = 3
hidden_size = 2
num_layers = 1
has_bias = True
bidirectional = False
dropout = 0.0
num_directions = 1
if bidirectional:
num_directions = 2
net = LstmNet(seq_len, batch_size, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout)
y, h, c, _, _ = net()
print(y)
print(c)
print(h)
expect_y = np.array([[[-0.16709016, 0.13125697],
[-0.08438572, -0.01969833]],
[[-0.2746155, 0.32764038],
[-0.06504016, -0.07770399]],
[[-0.00140004, 0.17706314],
[0.03244496, -0.10135599]],
[[0.08328028, 0.06437367],
[-0.04133911, -0.11072896]],
[[0.19004421, -0.02852732],
[0.09138509, -0.00344161]]]
)
error = np.ones([seq_len, batch_size, hidden_size * num_directions]) * 1.0e-4
diff = y.asnumpy() - expect_y
assert np.all(diff < error)
assert np.all(-diff < error)
#
expect_h = np.array([[[0.19004421, -0.02852732],
[0.09138509, -0.00344161]]])
error = np.ones((num_layers * num_directions, batch_size, hidden_size)) * 1.0e-4
diff = h.asnumpy() - expect_h
assert np.all(diff < error)
assert np.all(-diff < error)
#
expect_c = np.array([[[0.34533143, -0.06313794],
[0.169008, -0.00555446]]])
error = np.ones((num_layers * num_directions, batch_size, hidden_size)) * 1.0e-4
diff = c.asnumpy() - expect_c
assert np.all(diff < error)
assert np.all(-diff < error)
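# MultiLayerBiLstmNet goes through the nn.LSTM wrapper (two layers,
# bidirectional) with all-ones initial states. test_multi_layer_bilstm is a
# smoke test: it only checks that the graph compiles and runs, and prints the
# results without asserting on them.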
class MultiLayerBiLstmNet(nn.Cell):
def __init__(self, seq_len, batch_size, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout):
super(MultiLayerBiLstmNet, self).__init__()
num_directions = 1
if bidirectional:
num_directions = 2
self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, has_bias=has_bias,
bidirectional=bidirectional, dropout=dropout)
input_np = np.array([[[-0.1887, -0.4144, -0.0235, 0.7489, 0.7522, 0.5969, 0.3342, 1.2198, 0.6786, -0.9404],
[-0.8643, -1.6835, -2.4965, 2.8093, 0.1741, 0.2707, 0.7387, -0.0939, -1.7990, 0.4765]],
[[-0.5963, -1.2598, -0.7226, 1.1365, -1.7320, -0.7302, 0.1221, -0.2111, -1.6173, -0.0706],
[0.8964, 0.1737, -1.0077, -0.1389, 0.4889, 0.4391, 0.7911, 0.3614, -1.9533, -0.9936]],
[[0.3260, -1.3312, 0.0601, 1.0726, -1.6010, -1.8733, -1.5775, 1.1579, -0.8801, -0.5742],
[-2.2998, -0.6344, -0.5409, -0.9221, -0.6500, 0.1206, 1.5215, 0.7517, 1.3691, 2.0021]],
[[-0.1245, -0.3690, 2.1193, 1.3852, -0.1841, -0.8899, -0.3646, -0.8575, -0.3131, 0.2026],
[1.0218, -1.4331, 0.1744, 0.5442, -0.7808, 0.2527, 0.1566, 1.1484, -0.7766, -0.6747]],
[[-0.6752, 0.9906, -0.4973, 0.3471, -0.1202, -0.4213, 2.0213, 0.0441, 0.9016, 1.0365],
[1.2223, -1.3248, 0.1207, -0.8256, 0.1816, 0.7057, -0.3105, 0.5713, 0.2804,
-1.0685]]]).astype(np.float32)
self.x = Parameter(initializer(Tensor(input_np), [seq_len, batch_size, input_size]), name='x')
self.h0 = Parameter(initializer(
Tensor(np.ones((num_directions, batch_size, hidden_size)).astype(np.float32)),
[num_directions, batch_size, hidden_size]), name='h0')
self.c0 = Parameter(initializer(
Tensor(np.ones((num_directions, batch_size, hidden_size)).astype(np.float32)),
[num_directions, batch_size, hidden_size]), name='c0')
self.h1 = Parameter(initializer(
Tensor(np.ones((num_directions, batch_size, hidden_size)).astype(np.float32)),
[num_directions, batch_size, hidden_size]), name='h1')
self.c1 = Parameter(initializer(
Tensor(np.ones((num_directions, batch_size, hidden_size)).astype(np.float32)),
[num_directions, batch_size, hidden_size]), name='c1')
self.h = ParameterTuple((self.h0, self.h1))
self.c = ParameterTuple((self.c0, self.c1))
@ms_function
def construct(self):
return self.lstm(self.x, (self.h, self.c))
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_multi_layer_bilstm():
seq_len = 5
batch_size = 2
input_size = 10
hidden_size = 2
num_layers = 2
has_bias = True
bidirectional = True
dropout = 0.0
num_directions = 1
if bidirectional:
num_directions = 2
net = MultiLayerBiLstmNet(seq_len, batch_size, input_size, hidden_size, num_layers, has_bias, bidirectional,
dropout)
y, h, c, _, _ = net()
print(y)
print(h)
print(c)
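# Grad wraps a network and returns the gradients of its trainable parameters:
# get_by_list=True differentiates with respect to the weight list, and
# sens_param=True means the output sensitivity (dy) is supplied by the caller.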
class Grad(nn.Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.network = network
self.weights = ParameterTuple(network.trainable_params())
self.grad = C.GradOperation('grad',
get_by_list=True,
sens_param=True)
@ms_function
def construct(self, output_grad):
weights = self.weights
grads = self.grad(self.network, weights)(output_grad)
return grads
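# Net is the forward graph for the gradient test: a single-layer,
# unidirectional P.LSTM with explicit weights. construct returns only the
# output sequence, so a single sensitivity tensor can be fed through Grad.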
class Net(nn.Cell):
def __init__(self, seq_len, batch_size, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout):
super(Net, self).__init__()
num_directions = 1
if bidirectional:
num_directions = 2
input_np = np.array([[[-0.5907, 1.0557, 1.7283, 0.6706, -1.2550, -0.5298, -0.2290, -0.6735, 0.8555, 1.4836],
[-1.7070, -0.5347, -0.9105, -0.2598, 0.0588, 1.5496, 1.0757, 0.3760, -1.2020, -0.2868]],
[[0.0151, 0.2126, 0.8090, -0.5292, -2.5590, 0.4279, -0.3081, -1.4706, -0.0498, 1.2301],
[0.4165, -0.5391, -0.0996, 0.1928, -0.4909, -0.1255, 0.4444, -1.3687, 1.3096, 0.6553]],
[[-0.7802, -0.2083, -0.6388, 1.3757, 0.4293, 0.5363, 0.3202, -0.6687, -1.3864, -0.2953],
[1.0799, -0.7204, 0.1130, -0.5857, -0.4855, -1.1068, 1.0126, 0.8716, 1.5460, -0.7392]],
[[2.2645, -0.6586, -0.2227, 1.4290, -0.5006, -1.6576, -0.1793, 0.5319, 0.1360, 0.2707],
[-0.4071, 0.1575, 1.4199, -0.9156, 0.1855, 0.4947, 1.0460, -0.6365, 0.1191, -0.6374]],
[[0.2468, 1.0815, -0.4893, 0.0664, 0.6405, -2.2967, 0.7612, 0.8759, 0.5685, -1.0999],
[-0.7272, -1.7750, -0.1164, -0.7159, 0.0061, -0.7839, -1.8329, 0.3434, -0.5634,
0.5384]]]).astype(np.float32)
self.x = Parameter(initializer(Tensor(input_np), [seq_len, batch_size, input_size]), name='x')
self.h0 = Parameter(initializer(
Tensor(np.ones((num_directions, batch_size, hidden_size)).astype(np.float32)),
[num_directions, batch_size, hidden_size]), name='h0')
self.c0 = Parameter(initializer(
Tensor(np.ones((num_directions, batch_size, hidden_size)).astype(np.float32)),
[num_directions, batch_size, hidden_size]), name='c0')
wih_l0 = np.array([[0.2300, 0.6668, 0.4703, 0.0425, 0.0464, 0.6825, 0.2249, -0.4315, -0.2449, 0.2964],
[-0.2811, -0.3444, 0.2557, -0.5137, -0.5518, 0.1652, -0.6720, 0.1066, 0.3586, 0.6299],
[0.5728, -0.1784, 0.5661, 0.4012, 0.3856, -0.1899, 0.3102, 0.3717, -0.5651, 0.1952],
[0.1026, -0.0527, 0.1198, -0.3080, 0.2292, 0.5757, -0.3567, -0.2731, -0.0586, -0.2849],
[0.2194, -0.1622, 0.3219, -0.3008, -0.3713, -0.3034, -0.2385, 0.0412, -0.5205, 0.0280],
[-0.5499, -0.0733, -0.5236, -0.6753, -0.7045, -0.1839, -0.1037, -0.5026, -0.4055, -0.3416],
[0.1573, -0.1301, -0.2882, -0.3464, 0.6643, 0.1980, -0.6804, 0.5359, 0.5996, 0.0124],
[-0.6436, 0.0587, -0.6520, -0.0471, 0.1667, 0.6042, 0.5752, -0.6296, -0.2976,
-0.3757]]).astype(np.float32).reshape([1, -1])
whh_l0 = np.array([[0.3358, 0.2790],
[-0.5355, 0.0989],
[-0.1402, 0.5120],
[0.1335, 0.1653],
[0.3533, -0.3531],
[0.4166, -0.4420],
[-0.5454, -0.1720],
[0.0041, -0.0799]]).astype(np.float32).reshape([1, -1])
bih_l0 = np.array([0.5518, 0.1083, 0.4829, 0.0607, -0.1770, -0.6944, 0.3059, 0.5354]).astype(
np.float32).reshape([1, -1])
bhh_l0 = np.array([0.5025, -0.1261, -0.5405, 0.3220, -0.3441, 0.6488, -0.0284, -0.2334]).astype(
np.float32).reshape([1, -1])
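# Only one bias slot is packed into the flat weight tensor, so the two
# separate biases bih_l0 and bhh_l0 are summed before concatenation.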
w0_np = np.concatenate(
(wih_l0, whh_l0, bih_l0 + bhh_l0),
axis=1).reshape([-1, 1, 1])
self.w0 = Parameter(initializer(Tensor(w0_np), w0_np.shape), name='w0')
self.lstm = P.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers,
has_bias=has_bias, bidirectional=bidirectional, dropout=dropout)
@ms_function
def construct(self):
return self.lstm(self.x, self.h0, self.c0, self.w0)[0]
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_grad():
seq_len = 5
batch_size = 2
input_size = 10
hidden_size = 2
num_layers = 1
has_bias = True
bidirectional = False
dropout = 0.0
num_directions = 1
if bidirectional:
num_directions = 2
net = Grad(Net(seq_len, batch_size, input_size, hidden_size, num_layers, has_bias, bidirectional, dropout))
dy = np.array([[[-3.5471e-01, 7.0540e-01],
[2.7161e-01, 1.0865e+00]],
[[-4.2431e-01, 1.4955e+00],
[-4.0418e-01, -2.3282e-01]],
[[-1.3654e+00, 1.9251e+00],
[-4.6481e-01, 1.3138e+00]],
[[1.2914e+00, -2.3753e-01],
[5.3589e-01, -1.0981e-01]],
[[-1.6032e+00, -1.8818e-01],
[1.0065e-01, 9.2045e-01]]]).astype(np.float32)
dx, dhx, dcx, dw = net(Tensor(dy))
print(dx)
print(dhx)
print(dcx)
print(dw)
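# To run locally, a CPU build of MindSpore plus pytest should be enough, e.g.
# `pytest -sv <path_to_this_test_file>` (the exact path depends on where the
# file sits in the repository; it is not shown in this diff).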