Add some CPU operators

zhaoting 2020-10-14 16:39:40 +08:00
parent af78c12a73
commit f2e9d9cfc7
44 changed files with 1946 additions and 311 deletions

View File

@ -13,9 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/arithmetic_cpu_kernel.h"
#include <thread>
#include <cmath>
#include <string>
#include <thread>
#include "backend/kernel_compiler/cpu/arithmetic_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
@ -52,13 +53,35 @@ void ArithmeticCPUKernel::Mul(const T *input1, const T *input2, T *out, size_t s
}
template <typename T>
void ArithmeticCPUKernel::Div(const T *input1, const T *input2, T *out, size_t start, size_t end) {
void ArithmeticCPUKernel::RealDiv(const T *input1, const T *input2, T *out, size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
auto div_number = input2[i];
std::vector<size_t> idx;
GenIndex(i, &idx);
auto div_number = input2[idx[1]];
if (div_number == 0) {
MS_LOG(EXCEPTION) << "Cannot divided by 0!";
}
out[i] = input1[i] / div_number;
out[i] = input1[idx[0]] / div_number;
}
}
template <typename T>
void ArithmeticCPUKernel::Pow(const T *input1, const T *input2, T *out, size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
std::vector<size_t> idx;
GenIndex(i, &idx);
auto x = static_cast<double>(input1[idx[0]]);
auto y = static_cast<double>(input2[idx[1]]);
out[i] = static_cast<T>(std::pow(x, y));
}
}
template <typename T>
void ArithmeticCPUKernel::Less(const T *input1, const T *input2, bool *out, size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
std::vector<size_t> idx;
GenIndex(i, &idx);
out[i] = input1[idx[0]] < input2[idx[1]];
}
}
@ -71,10 +94,16 @@ void ArithmeticCPUKernel::InitKernel(const CNodePtr &kernel_node) {
operate_type_ = SUB;
} else if (kernel_name == prim::kPrimMul->name()) {
operate_type_ = MUL;
} else if (kernel_name == "Div") {
operate_type_ = DIV;
} else if (kernel_name == prim::kPrimRealDiv->name()) {
operate_type_ = REALDIV;
} else if (kernel_name == prim::kPrimPow->name()) {
operate_type_ = POW;
} else if (kernel_name == prim::kPrimLess->name()) {
operate_type_ = LESS;
} else if (kernel_name == prim::kPrimAssignAdd->name()) {
operate_type_ = ASSIGNADD;
} else {
MS_LOG(EXCEPTION) << "Not support " << kernel_name;
}
input_shape0_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
@ -145,14 +174,45 @@ void ArithmeticCPUKernel::GenIndex(size_t num, std::vector<size_t> *idx) {
idx->push_back(idx0);
idx->push_back(idx1);
}
template <typename T>
void ArithmeticCPUKernel::LaunchLess(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
T *input1 = reinterpret_cast<T *>(inputs[0]->addr);
T *input2 = reinterpret_cast<T *>(inputs[1]->addr);
bool *output = reinterpret_cast<bool *>(outputs[0]->addr);
size_t lens = outputs[0]->size > 0 ? static_cast<size_t>(outputs[0]->size / sizeof(bool)) : 1;
auto max_thread_num = std::thread::hardware_concurrency();
size_t thread_num = lens < 128 * max_thread_num ? std::ceil(lens / 128.0) : max_thread_num;
MS_LOG(INFO) << "Lens=" << lens << "; use thread_num=" << thread_num << "; max_thread_num: " << max_thread_num;
std::vector<std::thread> threads;
threads.reserve(thread_num);
size_t start = 0;
size_t once_compute_size = (lens + thread_num - 1) / thread_num;
while (start < lens) {
size_t end = (start + once_compute_size) > lens ? lens : (start + once_compute_size);
threads.emplace_back(std::thread(&ArithmeticCPUKernel::Less<T>, this, input1, input2, output, start, end));
start += once_compute_size;
}
for (size_t i = 0; i < threads.size(); ++i) {
threads[i].join();
}
}
template <typename T>
void ArithmeticCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
if (operate_type_ == LESS) {
LaunchLess<T>(inputs, outputs);
return;
}
T *input1 = reinterpret_cast<T *>(inputs[0]->addr);
T *input2 = reinterpret_cast<T *>(inputs[1]->addr);
T *output = reinterpret_cast<T *>(outputs[0]->addr);
auto lens = outputs[0]->size / sizeof(T);
size_t thread_num = lens < 128 * 24 ? std::ceil(lens / 128.0) : 24;
MS_LOG(INFO) << "lens=" << lens << "; use thread_num=" << thread_num;
size_t lens = outputs[0]->size > 0 ? static_cast<size_t>(outputs[0]->size / sizeof(T)) : 1;
auto max_thread_num = std::thread::hardware_concurrency();
size_t thread_num = lens < 128 * max_thread_num ? std::ceil(lens / 128.0) : max_thread_num;
MS_LOG(INFO) << "Lens=" << lens << "; use thread_num=" << thread_num << "; max_thread_num: " << max_thread_num;
std::vector<std::thread> threads;
threads.reserve(thread_num);
size_t start = 0;
@ -165,10 +225,14 @@ void ArithmeticCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, co
threads.emplace_back(std::thread(&ArithmeticCPUKernel::Sub<T>, this, input1, input2, output, start, end));
} else if (operate_type_ == MUL) {
threads.emplace_back(std::thread(&ArithmeticCPUKernel::Mul<T>, this, input1, input2, output, start, end));
} else if (operate_type_ == DIV) {
threads.emplace_back(std::thread(&ArithmeticCPUKernel::Div<T>, this, input1, input2, output, start, end));
} else if (operate_type_ == REALDIV) {
threads.emplace_back(std::thread(&ArithmeticCPUKernel::RealDiv<T>, this, input1, input2, output, start, end));
} else if (operate_type_ == POW) {
threads.emplace_back(std::thread(&ArithmeticCPUKernel::Pow<T>, this, input1, input2, output, start, end));
} else if (operate_type_ == ASSIGNADD) {
threads.emplace_back(std::thread(&ArithmeticCPUKernel::AssignAdd<T>, this, input1, input2, output, start, end));
} else {
MS_LOG(EXCEPTION) << "Not support " << operate_type_;
}
start += once_compute_size;
}
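Note: LaunchLess and LaunchKernel above both split the flat output range into chunks of roughly 128 elements and hand each chunk to its own std::thread, capped at std::thread::hardware_concurrency(). A minimal standalone sketch of that chunking scheme (ParallelFor and the surrounding names are illustrative, not part of this commit):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <thread>
#include <vector>

// Illustrative helper: run task(start, end) over [0, lens) in ~128-element chunks.
template <typename Func>
void ParallelFor(size_t lens, Func task) {
  size_t max_thread_num = std::thread::hardware_concurrency();
  size_t thread_num =
      lens < 128 * max_thread_num ? static_cast<size_t>(std::ceil(lens / 128.0)) : max_thread_num;
  thread_num = std::max<size_t>(thread_num, 1);
  std::vector<std::thread> threads;
  threads.reserve(thread_num);
  size_t once_compute_size = (lens + thread_num - 1) / thread_num;
  for (size_t start = 0; start < lens; start += once_compute_size) {
    size_t end = std::min(start + once_compute_size, lens);
    threads.emplace_back(task, start, end);  // each thread owns [start, end)
  }
  for (auto &t : threads) {
    t.join();
  }
}

int main() {
  std::vector<float> in(1000, 3.0f), out(1000);
  ParallelFor(in.size(), [&](size_t start, size_t end) {
    for (size_t i = start; i < end; ++i) out[i] = in[i] * in[i];
  });
  return 0;
}

With the 128-element heuristic, lens = 300 on an 8-core machine uses ceil(300 / 128) = 3 threads, while lens = 4096 (>= 128 * 8) saturates all 8.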

View File

@ -15,8 +15,8 @@
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
@ -31,7 +31,8 @@ class ArithmeticCPUKernel : public CPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
template <typename T>
void LaunchLess(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
@ -44,9 +45,13 @@ class ArithmeticCPUKernel : public CPUKernel {
template <typename T>
void Mul(const T *input1, const T *input2, T *out, size_t start, size_t end);
template <typename T>
void Div(const T *input1, const T *input2, T *out, size_t start, size_t end);
void RealDiv(const T *input1, const T *input2, T *out, size_t start, size_t end);
template <typename T>
void Pow(const T *input1, const T *input2, T *out, size_t start, size_t end);
template <typename T>
void AssignAdd(T *input1, const T *input2, T *out, size_t start, size_t end);
template <typename T>
void Less(const T *input1, const T *input2, bool *out, size_t start, size_t end);
std::vector<size_t> input_shape0_;
std::vector<size_t> input_shape1_;
std::vector<size_t> input_element_num0_;
@ -66,6 +71,34 @@ MS_REG_CPU_KERNEL(
MS_REG_CPU_KERNEL(
Sub, KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
ArithmeticCPUKernel);
MS_REG_CPU_KERNEL(
Pow, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
ArithmeticCPUKernel);
MS_REG_CPU_KERNEL(
Pow, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
ArithmeticCPUKernel);
MS_REG_CPU_KERNEL(
Pow, KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
ArithmeticCPUKernel);
MS_REG_CPU_KERNEL(
RealDiv, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
ArithmeticCPUKernel);
MS_REG_CPU_KERNEL(
RealDiv,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
ArithmeticCPUKernel);
MS_REG_CPU_KERNEL(
RealDiv, KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
ArithmeticCPUKernel);
MS_REG_CPU_KERNEL(
Less, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool),
ArithmeticCPUKernel);
MS_REG_CPU_KERNEL(
Less, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeBool),
ArithmeticCPUKernel);
MS_REG_CPU_KERNEL(
Less, KernelAttr().AddInputAttr(kNumberTypeInt64).AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeBool),
ArithmeticCPUKernel);
MS_REG_CPU_KERNEL(
AssignAdd, KernelAttr().AddInputAttr(kNumberTypeInt32).AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
ArithmeticCPUKernel);

View File

@ -13,10 +13,10 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h"
#include <cmath>
#include <thread>
#include <string>
#include <thread>
#include "backend/kernel_compiler/cpu/arithmetic_self_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
@ -30,9 +30,9 @@ void Square(const T *in, T *out, size_t start, size_t end) {
}
template <typename T>
void Sqrt(const T *in, T *out, size_t start, size_t end) {
void Neg(const T *in, T *out, size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
out[i] = sqrtf(in[i]);
out[i] = -in[i];
}
}
} // namespace
@ -42,8 +42,8 @@ void ArithmeticSelfCPUKernel::InitKernel(const CNodePtr &kernel_node) {
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
if (kernel_name == prim::kPrimSquare->name()) {
operate_type_ = SQUARE;
} else if (kernel_name == prim::kPrimSqrt->name()) {
operate_type_ = SQRT;
} else if (kernel_name == prim::kPrimNeg->name()) {
operate_type_ = NEG;
}
dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
}
@ -66,10 +66,11 @@ void ArithmeticSelfCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs
const std::vector<AddressPtr> &outputs) {
T *input = reinterpret_cast<T *>(inputs[0]->addr);
T *output = reinterpret_cast<T *>(outputs[0]->addr);
auto lens = inputs[0]->size / sizeof(T);
MS_LOG(INFO) << "lens=" << lens;
size_t lens = outputs[0]->size > 0 ? static_cast<size_t>(outputs[0]->size / sizeof(T)) : 1;
const size_t thread_num = 24;
auto max_thread_num = std::thread::hardware_concurrency();
size_t thread_num = lens < 128 * max_thread_num ? std::ceil(lens / 128.0) : max_thread_num;
MS_LOG(INFO) << "Lens=" << lens << "; use thread_num=" << thread_num << "; max_thread_num: " << max_thread_num;
std::vector<std::thread> threads;
threads.reserve(thread_num);
size_t start = 0;
@ -78,8 +79,8 @@ void ArithmeticSelfCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs
size_t end = (start + once_compute_size) > lens ? lens : (start + once_compute_size);
if (operate_type_ == SQUARE) {
threads.emplace_back(std::thread(Square<T>, input, output, start, end));
} else if (operate_type_ == SQRT) {
threads.emplace_back(std::thread(Sqrt<T>, input, output, start, end));
} else if (operate_type_ == NEG) {
threads.emplace_back(std::thread(Neg<T>, input, output, start, end));
}
start += once_compute_size;
}

View File

@ -15,8 +15,8 @@
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_SELF_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ARITHMETIC_SELF_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
@ -40,10 +40,12 @@ class ArithmeticSelfCPUKernel : public CPUKernel {
TypeId dtype_{kTypeUnknown};
};
MS_REG_CPU_KERNEL(Square, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
ArithmeticSelfCPUKernel);
MS_REG_CPU_KERNEL(Square, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
ArithmeticSelfCPUKernel);
MS_REG_CPU_KERNEL(Neg, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
ArithmeticSelfCPUKernel);
MS_REG_CPU_KERNEL(Neg, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32),
ArithmeticSelfCPUKernel);
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,82 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cmath>
#include <map>
#include <string>
#include <thread>
#include "backend/kernel_compiler/cpu/cast_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
template <typename S, typename T>
void Cast(const S *in, T *out, size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
out[i] = static_cast<T>(in[i]);
}
}
template <typename S, typename T>
void LaunchCast(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &outputs) {
S *input = reinterpret_cast<S *>(inputs[0]->addr);
T *output = reinterpret_cast<T *>(outputs[0]->addr);
MS_LOG(DEBUG) << "Type source: " << typeid(S).name() << "; target: " << typeid(T).name();
size_t lens = outputs[0]->size > 0 ? static_cast<size_t>(outputs[0]->size / sizeof(T)) : 1;
auto max_thread_num = std::thread::hardware_concurrency();
size_t thread_num = lens < 128 * max_thread_num ? std::ceil(lens / 128.0) : max_thread_num;
MS_LOG(INFO) << "Lens=" << lens << "; use thread_num=" << thread_num << "; max_thread_num: " << max_thread_num;
std::vector<std::thread> threads;
threads.reserve(thread_num);
size_t start = 0;
size_t once_compute_size = (lens + thread_num - 1) / thread_num;
while (start < lens) {
size_t end = (start + once_compute_size) > lens ? lens : (start + once_compute_size);
threads.emplace_back(std::thread(Cast<S, T>, input, output, start, end));
start += once_compute_size;
}
for (size_t i = 0; i < threads.size(); ++i) {
threads[i].join();
}
}
void CastCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
source_dtype = AnfAlgo::GetPrevNodeOutputDeviceDataType(kernel_node, 0);
target_dtype = AnfAlgo::GetOutputInferDataType(kernel_node, 0);
}
bool CastCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> & /*workspace*/,
const std::vector<kernel::AddressPtr> &outputs) {
using TypePair =
std::function<void(const std::vector<kernel::AddressPtr> &, const std::vector<kernel::AddressPtr> &)>;
std::map<TypeId, std::map<TypeId, TypePair>> mode_map;
mode_map[kNumberTypeFloat32][kNumberTypeFloat32] = LaunchCast<float, float>;
mode_map[kNumberTypeFloat32][kNumberTypeInt32] = LaunchCast<float, int>;
mode_map[kNumberTypeFloat32][kNumberTypeBool] = LaunchCast<float, bool>;
mode_map[kNumberTypeInt32][kNumberTypeFloat32] = LaunchCast<int, float>;
mode_map[kNumberTypeInt32][kNumberTypeInt32] = LaunchCast<int, int>;
mode_map[kNumberTypeInt32][kNumberTypeBool] = LaunchCast<int, bool>;
mode_map[kNumberTypeBool][kNumberTypeFloat32] = LaunchCast<bool, float>;
mode_map[kNumberTypeBool][kNumberTypeBool] = LaunchCast<bool, bool>;
mode_map[kNumberTypeBool][kNumberTypeInt32] = LaunchCast<bool, int>;
mode_map[source_dtype][target_dtype](inputs, outputs);
return true;
}
} // namespace kernel
} // namespace mindspore
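One note on the dispatch table above: operator[] on the nested map default-constructs an empty std::function for a (source, target) pair that was never registered, and invoking it throws std::bad_function_call rather than a MindSpore exception. A small self-contained sketch of a guarded lookup (names and the simplified signature are illustrative only):

#include <functional>
#include <iostream>
#include <map>
#include <string>

using LaunchFunc = std::function<void(int)>;  // stand-in for the real launch signature

int main() {
  std::map<std::string, std::map<std::string, LaunchFunc>> mode_map;
  mode_map["float32"]["int32"] = [](int n) { std::cout << "cast " << n << " elements\n"; };

  const std::string src = "float32", dst = "bool";
  auto src_it = mode_map.find(src);
  if (src_it == mode_map.end() || src_it->second.find(dst) == src_it->second.end()) {
    std::cout << "unsupported cast " << src << " -> " << dst << '\n';  // kernel code would raise an exception here
  } else {
    src_it->second.at(dst)(16);
  }
  return 0;
}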

View File

@ -0,0 +1,54 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CAST_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CAST_CPU_KERNEL_H_
#include <functional>
#include <memory>
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
class CastCPUKernel : public CPUKernel {
public:
CastCPUKernel() = default;
~CastCPUKernel() override = default;
void InitKernel(const CNodePtr &kernel_node) override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
private:
TypeId source_dtype{kTypeUnknown};
TypeId target_dtype{kTypeUnknown};
};
MS_REG_CPU_KERNEL(Cast, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), CastCPUKernel);
MS_REG_CPU_KERNEL(Cast, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeInt32), CastCPUKernel);
MS_REG_CPU_KERNEL(Cast, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeBool), CastCPUKernel);
MS_REG_CPU_KERNEL(Cast, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), CastCPUKernel);
MS_REG_CPU_KERNEL(Cast, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeFloat32), CastCPUKernel);
MS_REG_CPU_KERNEL(Cast, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeBool), CastCPUKernel);
MS_REG_CPU_KERNEL(Cast, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool), CastCPUKernel);
MS_REG_CPU_KERNEL(Cast, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeInt32), CastCPUKernel);
MS_REG_CPU_KERNEL(Cast, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeFloat32), CastCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CAST_CPU_KERNEL_H_

View File

@ -15,15 +15,14 @@
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CPU_KERNEL_H_
#include <string>
#include <vector>
#include <functional>
#include <memory>
#include <numeric>
#include <functional>
#include <string>
#include <vector>
#include "backend/kernel_compiler/kernel.h"
#include "ir/anf.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "ir/anf.h"
using mindspore::kernel::Address;
using mindspore::kernel::AddressPtr;
@ -52,7 +51,26 @@ const char END[] = "end";
const char SIZE[] = "size";
const char USE_NESTEROV[] = "use_nesterov";
const char GROUP[] = "group";
enum OperateType { ADD = 0, SUB, MUL, DIV, SQUARE, SQRT, ASSIGNADD };
enum OperateType {
ADD = 0,
SUB,
MUL,
DIV,
SQUARE,
SQRT,
POW,
REALDIV,
NEG,
LESS,
ASSIGNADD,
RELUGRAD,
RELU6GRAD,
ABSGRAD,
TANHGRAD,
SQRTGRAD,
SIGMOIDGRAD
};
class CPUKernel : public kernel::KernelMod {
public:

View File

@ -0,0 +1,177 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cmath>
#include <string>
#include <thread>
#include "backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
template <typename T>
void EltWiseGradCPUKernel::ReluGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
if (input2[i] > 0) {
out[i] = input1[i];
} else {
out[i] = 0;
}
}
}
template <typename T>
void EltWiseGradCPUKernel::ReLU6Grad(const T *input1, const T *input2, T *out, size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
if (input2[i] > 0 && input2[i] <= 6) {
out[i] = input1[i];
} else {
out[i] = 0;
}
}
}
template <typename T>
void EltWiseGradCPUKernel::AbsGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
if (input1[i] > 0) {
out[i] = input2[i];
} else if (input1[i] < 0) {
out[i] = -input2[i];
} else {
out[i] = 0;
}
}
}
template <typename T>
void EltWiseGradCPUKernel::SigmoidGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
out[i] = input2[i] * input1[i] * (1 - input1[i]);
}
}
template <typename T>
void EltWiseGradCPUKernel::SqrtGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
out[i] = input2[i] / (input1[i] * 2);
}
}
template <typename T>
void EltWiseGradCPUKernel::TanhGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) {
for (size_t i = start; i < end; i++) {
T tmp = input1[i] * input1[i];
out[i] = input2[i] * (1 - tmp);
}
}
void EltWiseGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
if (kernel_name == "ReluGrad") {
operate_type_ = RELUGRAD;
} else if (kernel_name == "ReLU6Grad") {
operate_type_ = RELU6GRAD;
} else if (kernel_name == "SigmoidGrad") {
operate_type_ = SIGMOIDGRAD;
} else if (kernel_name == "AbsGrad") {
operate_type_ = ABSGRAD;
} else if (kernel_name == "TanhGrad") {
operate_type_ = TANHGRAD;
} else if (kernel_name == "SqrtGrad") {
operate_type_ = SQRTGRAD;
} else {
MS_LOG(EXCEPTION) << "Not support " << kernel_name;
}
input_shape0_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0);
input_shape1_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 1);
output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);
if (output_shape_.size() == 0) {
output_shape_.insert(output_shape_.begin(), 1);
}
size_t l = input_shape0_.size();
for (size_t i = 0; i < output_shape_.size() - l; ++i) {
input_shape0_.insert(input_shape0_.begin(), 1);
}
l = input_shape1_.size();
for (size_t i = 0; i < output_shape_.size() - l; ++i) {
input_shape1_.insert(input_shape1_.begin(), 1);
}
CPUKernelUtils::GetElementNumEveryDim(input_shape0_, &input_element_num0_);
CPUKernelUtils::GetElementNumEveryDim(input_shape1_, &input_element_num1_);
CPUKernelUtils::GetElementNumEveryDim(output_shape_, &output_element_num_);
dtype_ = AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 0);
if (dtype_ != AnfAlgo::GetPrevNodeOutputInferDataType(kernel_node, 1)) {
MS_LOG(EXCEPTION) << "Input0 and input1 must has the same data type";
}
}
bool EltWiseGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> & /*workspace*/,
const std::vector<kernel::AddressPtr> &outputs) {
if (dtype_ == kNumberTypeInt32) {
LaunchKernel<int>(inputs, outputs);
} else if (dtype_ == kNumberTypeFloat32) {
LaunchKernel<float>(inputs, outputs);
} else if (dtype_ == kNumberTypeInt64) {
LaunchKernel<int64_t>(inputs, outputs);
} else {
MS_LOG(EXCEPTION) << "Only support int32, float32, but actual data type is " << TypeIdLabel(dtype_);
}
return true;
}
template <typename T>
void EltWiseGradCPUKernel::LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs) {
T *input1 = reinterpret_cast<T *>(inputs[0]->addr);
T *input2 = reinterpret_cast<T *>(inputs[1]->addr);
T *output = reinterpret_cast<T *>(outputs[0]->addr);
size_t lens = outputs[0]->size > 0 ? static_cast<size_t>(outputs[0]->size / sizeof(T)) : 1;
auto max_thread_num = std::thread::hardware_concurrency();
size_t thread_num = lens < 128 * max_thread_num ? std::ceil(lens / 128.0) : max_thread_num;
MS_LOG(INFO) << "Lens=" << lens << "; use thread_num=" << thread_num << "; max_thread_num: " << max_thread_num;
std::vector<std::thread> threads;
threads.reserve(thread_num);
size_t start = 0;
size_t once_compute_size = (lens + thread_num - 1) / thread_num;
while (start < lens) {
size_t end = (start + once_compute_size) > lens ? lens : (start + once_compute_size);
if (operate_type_ == RELUGRAD) {
threads.emplace_back(std::thread(&EltWiseGradCPUKernel::ReluGrad<T>, this, input1, input2, output, start, end));
} else if (operate_type_ == RELU6GRAD) {
threads.emplace_back(std::thread(&EltWiseGradCPUKernel::ReLU6Grad<T>, this, input1, input2, output, start, end));
} else if (operate_type_ == ABSGRAD) {
threads.emplace_back(std::thread(&EltWiseGradCPUKernel::AbsGrad<T>, this, input1, input2, output, start, end));
} else if (operate_type_ == SIGMOIDGRAD) {
threads.emplace_back(
std::thread(&EltWiseGradCPUKernel::SigmoidGrad<T>, this, input1, input2, output, start, end));
} else if (operate_type_ == TANHGRAD) {
threads.emplace_back(std::thread(&EltWiseGradCPUKernel::TanhGrad<T>, this, input1, input2, output, start, end));
} else if (operate_type_ == SQRTGRAD) {
threads.emplace_back(std::thread(&EltWiseGradCPUKernel::SqrtGrad<T>, this, input1, input2, output, start, end));
} else {
MS_LOG(EXCEPTION) << "Not support " << operate_type_;
}
start += once_compute_size;
}
for (size_t i = 0; i < threads.size(); ++i) {
threads[i].join();
}
}
} // namespace kernel
} // namespace mindspore
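For reference, reading the argument roles out of the bodies above (y is the forward output, x the forward input, dy the incoming gradient), the element-wise gradients computed here are:

ReluGrad:    dx = dy if y > 0, else 0
ReLU6Grad:   dx = dy if 0 < y <= 6, else 0
AbsGrad:     dx = dy * sign(x)
SigmoidGrad: dx = dy * y * (1 - y)
SqrtGrad:    dx = dy / (2 * y), with y = sqrt(x)
TanhGrad:    dx = dy * (1 - y^2), with y = tanh(x)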

View File

@ -0,0 +1,87 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ELTWISE_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ELTWISE_GRAD_CPU_KERNEL_H_
#include <memory>
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
class EltWiseGradCPUKernel : public CPUKernel {
public:
EltWiseGradCPUKernel() = default;
~EltWiseGradCPUKernel() override = default;
void InitKernel(const CNodePtr &kernel_node) override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
template <typename T>
void LaunchKernel(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &outputs);
private:
template <typename T>
void ReluGrad(const T *input1, const T *input2, T *out, size_t start, size_t end);
template <typename T>
void ReLU6Grad(const T *input1, const T *input2, T *out, size_t start, size_t end);
template <typename T>
void AbsGrad(const T *input1, const T *input2, T *out, size_t start, size_t end);
template <typename T>
void SigmoidGrad(const T *input1, const T *input2, T *out, size_t start, size_t end);
template <typename T>
void SqrtGrad(const T *input1, const T *input2, T *out, size_t start, size_t end);
template <typename T>
void TanhGrad(const T *input1, const T *input2, T *out, size_t start, size_t end);
std::vector<size_t> input_shape0_;
std::vector<size_t> input_shape1_;
std::vector<size_t> input_element_num0_;
std::vector<size_t> input_element_num1_;
std::vector<size_t> output_shape_;
std::vector<size_t> output_element_num_;
OperateType operate_type_{RELUGRAD};
TypeId dtype_{kTypeUnknown};
};
MS_REG_CPU_KERNEL(
ReluGrad,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseGradCPUKernel);
MS_REG_CPU_KERNEL(
ReLU6Grad,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseGradCPUKernel);
MS_REG_CPU_KERNEL(
AbsGrad,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseGradCPUKernel);
MS_REG_CPU_KERNEL(
SigmoidGrad,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseGradCPUKernel);
MS_REG_CPU_KERNEL(
SqrtGrad,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseGradCPUKernel);
MS_REG_CPU_KERNEL(
TanhGrad,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseGradCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ELTWISE_GRAD_CPU_KERNEL_H_

View File

@ -0,0 +1,76 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/mkldnn/eltwise_cpu_kernel.h"
#include <string>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"
namespace mindspore {
namespace kernel {
dnnl::eltwise_forward::desc EltWiseCPUKernel::GetForwardEltwiseDesc(const CNodePtr &kernel_node,
dnnl::memory::desc src_desc) {
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
if (kernel_name == "ReLU") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_relu, src_desc, 0.0);
} else if (kernel_name == "ReLU6") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_clip, src_desc, 0.0, 6.0);
} else if (kernel_name == "Abs") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_abs, src_desc);
} else if (kernel_name == "Exp") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_exp, src_desc);
} else if (kernel_name == "Log") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_log, src_desc);
} else if (kernel_name == "Sigmoid") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_logistic, src_desc);
} else if (kernel_name == "Sqrt") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_sqrt, src_desc);
} else if (kernel_name == "Square") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_square, src_desc);
} else if (kernel_name == "Tanh") {
return dnnl::eltwise_forward::desc(DnnlForward, dnnl::algorithm::eltwise_tanh, src_desc);
} else {
MS_LOG(EXCEPTION) << "Eltwise operators don't support " << kernel_name;
}
}
void EltWiseCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
dnnl::memory::desc src_desc = GetDefaultMemDesc(src_shape);
auto desc = GetForwardEltwiseDesc(kernel_node, src_desc);
auto prim_desc = dnnl::eltwise_forward::primitive_desc(desc, MKLKernelEngine::Get().engine());
primitive_ = std::make_shared<dnnl::eltwise_forward>(prim_desc);
AddArgument(DNNL_ARG_SRC, src_desc);
AddArgument(DNNL_ARG_DST, src_desc);
}
bool EltWiseCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> & /*workspace*/,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.empty() || outputs.empty()) {
MS_LOG(EXCEPTION) << "error input output size!";
}
SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr);
SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
ExecutePrimitive();
return true;
}
} // namespace kernel
} // namespace mindspore

View File

@ -0,0 +1,60 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ELTWISE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ELTWISE_CPU_KERNEL_H_
#include <memory>
#include <vector>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
namespace mindspore {
namespace kernel {
class EltWiseCPUKernel : public MKLCPUKernel {
public:
EltWiseCPUKernel() = default;
~EltWiseCPUKernel() override = default;
void InitKernel(const CNodePtr &kernel_node) override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
private:
dnnl::eltwise_forward::desc GetForwardEltwiseDesc(const CNodePtr &kernel_node, dnnl::memory::desc src_desc);
dnnl::prop_kind DnnlForward = dnnl::prop_kind::forward_training;
};
MS_REG_CPU_KERNEL(ReLU, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseCPUKernel);
MS_REG_CPU_KERNEL(ReLU6, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseCPUKernel);
MS_REG_CPU_KERNEL(Abs, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseCPUKernel);
MS_REG_CPU_KERNEL(Exp, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseCPUKernel);
MS_REG_CPU_KERNEL(Log, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseCPUKernel);
MS_REG_CPU_KERNEL(Sigmoid, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseCPUKernel);
MS_REG_CPU_KERNEL(Sqrt, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseCPUKernel);
MS_REG_CPU_KERNEL(Square, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseCPUKernel);
MS_REG_CPU_KERNEL(Tanh, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
EltWiseCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_ELTWISE_CPU_KERNEL_H_

View File

@ -13,12 +13,11 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string>
#include "backend/kernel_compiler/cpu/mkldnn/fused_batch_norm_cpu_kernel.h"
#include "utils/ms_utils.h"
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"
namespace mindspore {
namespace kernel {

View File

@ -15,9 +15,8 @@
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_FUSED_BATCH_NORM_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_FUSED_BATCH_NORM_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include <vector>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
namespace mindspore {
@ -74,4 +73,4 @@ MS_REG_CPU_KERNEL(BatchNorm,
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_CONV2D_CPU_KERNEL_H_
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_FUSED_BATCH_NORM_CPU_KERNEL_H_

View File

@ -0,0 +1,110 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/mkldnn/fused_batch_norm_gard_cpu_kernel.h"
#include <string>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"
namespace mindspore {
namespace kernel {
void FusedBatchNormGradCPUKernel::InitInputOutputSize(const CNodePtr &kernel_node) {
CPUKernel::InitInputOutputSize(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_node);
size_t type_size = sizeof(float);
std::vector<size_t> shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
size_t tensor_size = shape[1] * 2 * type_size;
// [2, c] to store scale and bias
workspace_size_list_.emplace_back(tensor_size);
// [2, c] to store diff_scale and diff_bias
workspace_size_list_.emplace_back(tensor_size);
}
void FusedBatchNormGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
std::vector<size_t> x_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
if (x_shape.size() != 4) {
MS_LOG(EXCEPTION) << "Fused batchnorm only support nchw input!";
}
batch_size = x_shape[0];
channel = x_shape[1];
hw_size = x_shape[2] * x_shape[3];
nhw_size = x_shape[0] * hw_size;
dnnl::memory::desc x_desc = GetDefaultMemDesc(x_shape);
dnnl::memory::desc scale_bias_desc = GetDefaultMemDesc({2, channel});
auto epsilon = AnfAlgo::GetNodeAttr<float>(kernel_node, "epsilon");
auto prop_kind = dnnl::prop_kind::forward_training;
auto normalization_flags = dnnl::normalization_flags::use_scale_shift;
// fused batch normalization forward description
dnnl::batch_normalization_forward::desc desc =
dnnl::batch_normalization_forward::desc(prop_kind, x_desc, epsilon, normalization_flags);
auto forward_prim_desc = dnnl::batch_normalization_forward::primitive_desc(desc, MKLKernelEngine::Get().engine());
// fused batch normalization backward description
dnnl::batch_normalization_backward::desc backward_desc =
dnnl::batch_normalization_backward::desc(dnnl::prop_kind::backward, x_desc, x_desc, epsilon, normalization_flags);
auto backward_prim_desc = dnnl::batch_normalization_backward::primitive_desc(
backward_desc, MKLKernelEngine::Get().engine(), forward_prim_desc);
primitive_ = std::make_shared<dnnl::batch_normalization_backward>(backward_prim_desc);
AddArgument(DNNL_ARG_SRC, x_desc);
AddArgument(DNNL_ARG_MEAN, forward_prim_desc.mean_desc());
AddArgument(DNNL_ARG_VARIANCE, forward_prim_desc.variance_desc());
AddArgument(DNNL_ARG_SCALE_SHIFT, scale_bias_desc);
AddArgument(DNNL_ARG_WORKSPACE, forward_prim_desc.workspace_desc());
AddArgument(DNNL_ARG_DST, x_desc);
AddArgument(DNNL_ARG_DIFF_DST, x_desc);
AddArgument(DNNL_ARG_DIFF_SRC, x_desc);
AddArgument(DNNL_ARG_DIFF_SCALE_SHIFT, scale_bias_desc);
}
bool FusedBatchNormGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &workspace,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.size() < 6 || outputs.empty()) {
MS_LOG(EXCEPTION) << "Error input output size!";
}
auto wksp_in = reinterpret_cast<float *>(workspace[0]->addr);
auto scale_ret = memcpy_s(wksp_in, workspace[0]->size, inputs[2]->addr, inputs[2]->size);
auto max_size = workspace[0]->size - inputs[2]->size;
auto bias_ret = memcpy_s(wksp_in + (inputs[2]->size / sizeof(float)), max_size, inputs[3]->addr, inputs[3]->size);
if (scale_ret != 0 || bias_ret != 0) {
MS_LOG(EXCEPTION) << "Memcpy_s error.";
return false;
}
SetArgumentHandle(DNNL_ARG_DIFF_DST, inputs[0]->addr);
SetArgumentHandle(DNNL_ARG_SRC, inputs[1]->addr);
SetArgumentHandle(DNNL_ARG_MEAN, inputs[4]->addr);
SetArgumentHandle(DNNL_ARG_VARIANCE, inputs[5]->addr);
SetArgumentHandle(DNNL_ARG_SCALE_SHIFT, workspace[0]->addr);
SetArgumentHandle(DNNL_ARG_DIFF_SRC, outputs[0]->addr);
SetArgumentHandle(DNNL_ARG_DIFF_SCALE_SHIFT, workspace[1]->addr);
ExecutePrimitive();
auto wksp_out = reinterpret_cast<float *>(workspace[1]->addr);
auto diff_scale_ret = memcpy_s(outputs[1]->addr, outputs[1]->size, wksp_out, inputs[2]->size);
auto diff_bias_ret =
memcpy_s(outputs[2]->addr, outputs[2]->size, wksp_out + (outputs[1]->size / sizeof(float)), inputs[3]->size);
if (diff_scale_ret != 0 || diff_bias_ret != 0) {
MS_LOG(EXCEPTION) << "Memcpy_s error.";
return false;
}
return true;
}
} // namespace kernel
} // namespace mindspore
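The workspace trick above packs scale and bias into one contiguous buffer because, as the {2, channel} descriptor suggests, dnnl's use_scale_shift flag expects a single [2, C] SCALE_SHIFT tensor; the backward primitive then writes diff_scale and diff_bias into workspace[1] in the same layout, which is split back into outputs[1] and outputs[2]. A small sketch of that packing (plain std::memcpy stands in here for the memcpy_s calls used by the kernel):

#include <cstring>
#include <iostream>
#include <vector>

int main() {
  const size_t channel = 4;
  std::vector<float> scale(channel, 1.0f), bias(channel, 0.5f);
  // workspace[0]: scale in the first C floats, bias in the next C floats ([2, C]).
  std::vector<float> scale_bias(2 * channel);
  std::memcpy(scale_bias.data(), scale.data(), channel * sizeof(float));
  std::memcpy(scale_bias.data() + channel, bias.data(), channel * sizeof(float));
  // After ExecutePrimitive, workspace[1] would hold diff_scale then diff_bias
  // in the same layout, to be copied out into outputs[1] and outputs[2].
  for (float v : scale_bias) std::cout << v << ' ';
  std::cout << '\n';
  return 0;
}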

View File

@ -0,0 +1,61 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_FUSED_BATCH_NORM_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_FUSED_BATCH_NORM_GRAD_CPU_KERNEL_H_
#include <memory>
#include <vector>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
namespace mindspore {
namespace kernel {
class FusedBatchNormGradCPUKernel : public MKLCPUKernel {
public:
FusedBatchNormGradCPUKernel() = default;
~FusedBatchNormGradCPUKernel() override = default;
void InitKernel(const CNodePtr &kernel_node) override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
protected:
void InitInputOutputSize(const CNodePtr &kernel_node) override;
private:
float momentum{0.9};
size_t batch_size{0};
size_t channel{0};
size_t hw_size{0};
size_t nhw_size{0};
};
MS_REG_CPU_KERNEL(FusedBatchNormGradCPU,
KernelAttr()
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddInputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32)
.AddOutputAttr(kNumberTypeFloat32),
FusedBatchNormGradCPUKernel)
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_FUSED_BATCH_NORM_GRAD_CPU_KERNEL_H_

View File

@ -25,24 +25,53 @@ void MulCPUKernel::InitKernel(const CNodePtr &kernel_node) {
std::vector<size_t> src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
std::vector<size_t> src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
if (src0_shape.size() != src1_shape.size() && src1_shape.size() > 1) {
MS_LOG(EXCEPTION) << "mul only support same dim input or tensor * scalar " << src0_shape.size() << " vs "
<< src1_shape.size();
if (src1_shape.size() != src0_shape.size()) {
if (src0_shape.size() == 0) {
need_swap_ = true;
for (size_t i = 0; i < src1_shape.size(); ++i) {
src0_shape.emplace_back(1);
}
if (src1_shape.size() < src0_shape.size()) {
for (size_t i = src1_shape.size(); i < src0_shape.size(); ++i) {
} else if (src1_shape.size() == 0) {
for (size_t i = 0; i < src0_shape.size(); ++i) {
src1_shape.emplace_back(1);
}
} else {
MS_LOG(EXCEPTION) << "Invalid broadcast! " << src0_shape << " vs " << src1_shape;
}
dnnl::memory::desc src0_mem_desc = GetDefaultMemDesc(src0_shape);
dnnl::memory::desc src1_mem_desc = GetDefaultMemDesc(src1_shape);
dnnl::memory::desc dst_mem_desc = GetDefaultMemDesc(dst_shape);
dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_mul, src0_mem_desc, src1_mem_desc, dst_mem_desc);
} else {
bool visit_src0 = false;
bool visit_src1 = false;
for (size_t i = 0; i < src0_shape.size(); ++i) {
if (src0_shape[i] != src1_shape[i]) {
if (src0_shape[i] == 1 && !visit_src1) {
need_swap_ = true;
visit_src0 = true;
} else if (src1_shape[i] == 1 && !visit_src0) {
need_swap_ = false;
visit_src1 = true;
} else {
MS_LOG(EXCEPTION) << "Invalid broadcast! " << src0_shape << " vs " << src1_shape;
}
}
}
}
dnnl::memory::desc src0_desc;
dnnl::memory::desc src1_desc;
if (need_swap_) {
src0_desc = GetDefaultMemDesc(src1_shape);
src1_desc = GetDefaultMemDesc(src0_shape);
} else {
src0_desc = GetDefaultMemDesc(src0_shape);
src1_desc = GetDefaultMemDesc(src1_shape);
}
dnnl::memory::desc dst_desc = GetDefaultMemDesc(dst_shape);
dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_mul, src0_desc, src1_desc, dst_desc);
auto prim_desc = dnnl::binary::primitive_desc(desc, MKLKernelEngine::Get().engine());
primitive_ = std::make_shared<dnnl::binary>(prim_desc);
AddArgument(DNNL_ARG_SRC_0, src0_mem_desc);
AddArgument(DNNL_ARG_SRC_1, src1_mem_desc);
AddArgument(DNNL_ARG_DST, dst_mem_desc);
AddArgument(DNNL_ARG_SRC_0, src0_desc);
AddArgument(DNNL_ARG_SRC_1, src1_desc);
AddArgument(DNNL_ARG_DST, dst_desc);
}
bool MulCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
@ -51,8 +80,13 @@ bool MulCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
if (inputs.size() < 2 || outputs.empty()) {
MS_LOG(EXCEPTION) << "mul error input output size!";
}
if (need_swap_) {
SetArgumentHandle(DNNL_ARG_SRC_0, inputs[1]->addr);
SetArgumentHandle(DNNL_ARG_SRC_1, inputs[0]->addr);
} else {
SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr);
SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr);
}
SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
ExecutePrimitive();
return true;
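The need_swap_ logic added above (and mirrored in TensorAddCPUKernel further down) exists because, as far as I can tell, dnnl::binary only broadcasts its second source: every dimension of SRC_1 must equal the matching SRC_0 dimension or be 1. Whenever the first input is the one carrying the size-1 dimensions (or is a scalar), the kernel therefore swaps the operands when building the memory descriptors and again when binding addresses in Launch. A standalone sketch of the decision, with illustrative names:

#include <iostream>
#include <stdexcept>
#include <vector>

// Decide whether the operands must be swapped so the broadcast tensor becomes SRC_1.
bool NeedSwap(std::vector<size_t> s0, std::vector<size_t> s1) {
  if (s0.empty()) return true;    // scalar lhs: pad to 1s and swap
  if (s1.empty()) return false;   // scalar rhs: pad to 1s, no swap
  if (s0.size() != s1.size()) throw std::runtime_error("invalid broadcast");
  bool visit_src0 = false, visit_src1 = false, need_swap = false;
  for (size_t i = 0; i < s0.size(); ++i) {
    if (s0[i] == s1[i]) continue;
    if (s0[i] == 1 && !visit_src1) {
      need_swap = true;
      visit_src0 = true;
    } else if (s1[i] == 1 && !visit_src0) {
      visit_src1 = true;
    } else {
      throw std::runtime_error("invalid broadcast");
    }
  }
  return need_swap;
}

int main() {
  std::cout << NeedSwap({4, 3}, {1, 3}) << '\n';  // 0: [1, 3] already sits on the SRC_1 side
  std::cout << NeedSwap({1, 3}, {4, 3}) << '\n';  // 1: broadcast tensor is the first input, so swap
  return 0;
}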

View File

@ -31,6 +31,9 @@ class MulCPUKernel : public MKLCPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
private:
bool need_swap_{false};
};
MS_REG_CPU_KERNEL(

View File

@ -1,59 +0,0 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <string>
#include "backend/kernel_compiler/cpu/mkldnn/relu_cpu_kernel.h"
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"
namespace mindspore {
namespace kernel {
void ReluCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
if (src_shape.size() != 4 && src_shape.size() != 2) {
MS_LOG(EXCEPTION) << "relu kernel dims invalid " << src_shape.size();
}
dnnl::memory::desc src_desc = GetDefaultMemDesc(src_shape);
dnnl::eltwise_forward::desc desc =
dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_training, dnnl::algorithm::eltwise_relu, src_desc, 0.0);
std::string kernel_name = AnfAlgo::GetCNodeName(kernel_node);
if (kernel_name == "ReLU6") {
desc =
dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_training, dnnl::algorithm::eltwise_clip, src_desc, 0.0, 6.0);
}
auto prim_desc = dnnl::eltwise_forward::primitive_desc(desc, MKLKernelEngine::Get().engine());
primitive_ = std::make_shared<dnnl::eltwise_forward>(prim_desc);
AddArgument(DNNL_ARG_SRC, src_desc);
AddArgument(DNNL_ARG_DST, src_desc);
}
bool ReluCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> & /*workspace*/,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.empty() || outputs.empty()) {
MS_LOG(EXCEPTION) << "error input output size!";
}
SetArgumentHandle(DNNL_ARG_SRC, inputs[0]->addr);
SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
ExecutePrimitive();
return true;
}
} // namespace kernel
} // namespace mindspore

View File

@ -1,42 +0,0 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_RELU_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_RELU_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
namespace mindspore {
namespace kernel {
class ReluCPUKernel : public MKLCPUKernel {
public:
ReluCPUKernel() = default;
~ReluCPUKernel() override = default;
void InitKernel(const CNodePtr &kernel_node) override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
};
MS_REG_CPU_KERNEL(ReLU, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), ReluCPUKernel);
MS_REG_CPU_KERNEL(ReLU6, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
ReluCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_RELU_CPU_KERNEL_H_

View File

@ -1,69 +0,0 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/kernel_compiler/cpu/mkldnn/relu_grad_cpu_kernel.h"
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"
namespace mindspore {
namespace kernel {
void ReluGradCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
std::vector<size_t> src_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
if (src_shape.size() != 4 && src_shape.size() != 2) {
MS_LOG(EXCEPTION) << "relu grad kernel dims invalid " << src_shape.size();
}
dnnl::memory::desc src_desc = GetDefaultMemDesc(src_shape);
dnnl::eltwise_forward::desc forward_desc =
dnnl::eltwise_forward::desc(dnnl::prop_kind::forward_training, dnnl::algorithm::eltwise_relu, src_desc, 0.0);
auto forward_prim_desc = dnnl::eltwise_forward::primitive_desc(forward_desc, MKLKernelEngine::Get().engine());
dnnl::eltwise_backward::desc backward_desc =
dnnl::eltwise_backward::desc(dnnl::algorithm::eltwise_relu, src_desc, src_desc, 0.0, 0.0);
auto backward_prim_desc =
dnnl::eltwise_backward::primitive_desc(backward_desc, MKLKernelEngine::Get().engine(), forward_prim_desc);
primitive_ = std::make_shared<dnnl::eltwise_backward>(backward_prim_desc);
AddArgument(DNNL_ARG_SRC, src_desc);
AddArgument(DNNL_ARG_DIFF_SRC, src_desc);
AddArgument(DNNL_ARG_DIFF_DST, src_desc);
}
bool ReluGradCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> & /*workspace*/,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.size() < 2 || outputs.empty()) {
MS_LOG(EXCEPTION) << "relu grad error input output size!";
}
if (inputs[0]->size != outputs[0]->size) {
MS_LOG(EXCEPTION) << "relu grad error input output data size!";
}
SetArgumentHandle(DNNL_ARG_SRC, inputs[1]->addr);
SetArgumentHandle(DNNL_ARG_DIFF_SRC, inputs[0]->addr);
SetArgumentHandle(DNNL_ARG_DIFF_DST, inputs[0]->addr);
ExecutePrimitive();
size_t mem_bits = outputs[0]->size;
auto ret = memcpy_s(outputs[0]->addr, mem_bits, inputs[0]->addr, mem_bits);
if (ret != 0) {
MS_LOG(EXCEPTION) << "memcpy_s error, errorno " << ret;
return false;
}
return true;
}
} // namespace kernel
} // namespace mindspore

View File

@ -1,43 +0,0 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_RELU_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_RELU_GRAD_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/mkldnn/mkl_cpu_kernel.h"
namespace mindspore {
namespace kernel {
class ReluGradCPUKernel : public MKLCPUKernel {
public:
ReluGradCPUKernel() = default;
~ReluGradCPUKernel() override = default;
void InitKernel(const CNodePtr &kernel_node) override;
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
};
MS_REG_CPU_KERNEL(
ReluGrad,
KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
ReluGradCPUKernel);
} // namespace kernel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_RELU_GRAD_CPU_KERNEL_H_

View File

@ -25,17 +25,45 @@ void TensorAddCPUKernel::InitKernel(const CNodePtr &kernel_node) {
std::vector<size_t> src0_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
std::vector<size_t> src1_shape = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
std::vector<size_t> dst_shape = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
if (src0_shape.size() != src1_shape.size() && src1_shape.size() > 1) {
MS_LOG(EXCEPTION) << "TensorAdd only support same dim input or tensor * scalar " << src0_shape.size() << " vs "
<< src1_shape.size();
if (src1_shape.size() != src0_shape.size()) {
if (src0_shape.size() == 0) {
need_swap_ = true;
for (size_t i = 0; i < src1_shape.size(); ++i) {
src0_shape.emplace_back(1);
}
if (src1_shape.size() < src0_shape.size()) {
for (size_t i = src1_shape.size(); i < src0_shape.size(); ++i) {
} else if (src1_shape.size() == 0) {
for (size_t i = 0; i < src0_shape.size(); ++i) {
src1_shape.emplace_back(1);
}
} else {
MS_LOG(EXCEPTION) << "Invalid broadcast! " << src0_shape << " vs " << src1_shape;
}
} else {
bool visit_src0 = false;
bool visit_src1 = false;
for (size_t i = 0; i < src0_shape.size(); ++i) {
if (src0_shape[i] != src1_shape[i]) {
if (src0_shape[i] == 1 && !visit_src1) {
need_swap_ = true;
visit_src0 = true;
} else if (src1_shape[i] == 1 && !visit_src0) {
need_swap_ = false;
visit_src1 = true;
} else {
MS_LOG(EXCEPTION) << "Invalid broadcast! " << src0_shape << " vs " << src1_shape;
}
}
}
}
dnnl::memory::desc src0_desc;
dnnl::memory::desc src1_desc;
if (need_swap_) {
src0_desc = GetDefaultMemDesc(src1_shape);
src1_desc = GetDefaultMemDesc(src0_shape);
} else {
src0_desc = GetDefaultMemDesc(src0_shape);
src1_desc = GetDefaultMemDesc(src1_shape);
}
dnnl::memory::desc src0_desc = GetDefaultMemDesc(src0_shape);
dnnl::memory::desc src1_desc = GetDefaultMemDesc(src1_shape);
dnnl::memory::desc dst_desc = GetDefaultMemDesc(dst_shape);
dnnl::binary::desc desc = dnnl::binary::desc(dnnl::algorithm::binary_add, src0_desc, src1_desc, dst_desc);
auto prim_desc = dnnl::binary::primitive_desc(desc, MKLKernelEngine::Get().engine());
@ -51,8 +79,13 @@ bool TensorAddCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
if (inputs.size() < 2 || outputs.empty()) {
MS_LOG(EXCEPTION) << "TensorAdd error input output size!";
}
if (need_swap_) {
SetArgumentHandle(DNNL_ARG_SRC_0, inputs[1]->addr);
SetArgumentHandle(DNNL_ARG_SRC_1, inputs[0]->addr);
} else {
SetArgumentHandle(DNNL_ARG_SRC_0, inputs[0]->addr);
SetArgumentHandle(DNNL_ARG_SRC_1, inputs[1]->addr);
}
SetArgumentHandle(DNNL_ARG_DST, outputs[0]->addr);
ExecutePrimitive();
return true;
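The need_swap_ check in InitKernel above decides which operand is handed to dnnl as the broadcast side. A minimal Python sketch of that decision (illustrative names, not part of the commit) may help when reading the C++:

def choose_swap(src0_shape, src1_shape):
    """Mirror of the InitKernel broadcast check: returns (need_swap, padded shapes)."""
    need_swap = False
    if len(src0_shape) != len(src1_shape):
        if not src0_shape:                      # scalar lhs: pad to rhs rank and swap operands
            need_swap = True
            src0_shape = [1] * len(src1_shape)
        elif not src1_shape:                    # scalar rhs: pad only
            src1_shape = [1] * len(src0_shape)
        else:
            raise ValueError("Invalid broadcast! %s vs %s" % (src0_shape, src1_shape))
    else:
        visit_src0 = visit_src1 = False
        for a, b in zip(src0_shape, src1_shape):
            if a == b:
                continue
            if a == 1 and not visit_src1:       # lhs is the broadcast side
                need_swap, visit_src0 = True, True
            elif b == 1 and not visit_src0:     # rhs is the broadcast side
                need_swap, visit_src1 = False, True
            else:
                raise ValueError("Invalid broadcast! %s vs %s" % (src0_shape, src1_shape))
    return need_swap, src0_shape, src1_shape

# choose_swap([2, 3, 4, 4], [2, 1, 4, 4]) -> (False, ...)   rhs broadcasts
# choose_swap([1, 3, 1, 4], [2, 3, 4, 4]) -> (True,  ...)   operands get swapped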

View File

@ -31,6 +31,9 @@ class TensorAddCPUKernel : public MKLCPUKernel {
bool Launch(const std::vector<AddressPtr> &inputs, const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) override;
private:
bool need_swap_{false};
};
MS_REG_CPU_KERNEL(

View File

@ -39,6 +39,7 @@ MS_REG_CPU_KERNEL(Reshape, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutput
ReshapeCPUKernel);
MS_REG_CPU_KERNEL(Reshape, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
ReshapeCPUKernel);
MS_REG_CPU_KERNEL(Reshape, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool), ReshapeCPUKernel);
MS_REG_CPU_KERNEL(Flatten, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
ReshapeCPUKernel);
@ -46,6 +47,7 @@ MS_REG_CPU_KERNEL(Flatten, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutput
ReshapeCPUKernel);
MS_REG_CPU_KERNEL(Flatten, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
ReshapeCPUKernel);
MS_REG_CPU_KERNEL(Flatten, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool), ReshapeCPUKernel);
MS_REG_CPU_KERNEL(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
ReshapeCPUKernel);
@ -53,6 +55,8 @@ MS_REG_CPU_KERNEL(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOut
ReshapeCPUKernel);
MS_REG_CPU_KERNEL(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeInt64).AddOutputAttr(kNumberTypeInt64),
ReshapeCPUKernel);
MS_REG_CPU_KERNEL(ExpandDims, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool),
ReshapeCPUKernel);
} // namespace kernel
} // namespace mindspore
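A quick way to exercise the new bool registrations (illustrative snippet, not part of the commit; it follows the CPU test style used later in this change):

import numpy as np
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P

context.set_context(mode=context.GRAPH_MODE, device_target='CPU')

class BoolReshape(nn.Cell):
    def __init__(self):
        super(BoolReshape, self).__init__()
        self.reshape = P.Reshape()
        self.expand_dims = P.ExpandDims()
    def construct(self, x):
        return self.expand_dims(self.reshape(x, (2, 2)), 0)

mask = Tensor(np.array([True, False, True, False]))
out = BoolReshape()(mask)
assert out.shape == (1, 2, 2)
assert out.asnumpy().dtype == 'bool'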

View File

@ -560,10 +560,16 @@ def get_bprop_gelu(self):
def get_bprop_fused_batch_norm(self):
"""Grad definition for `FusedBatchNorm` operation."""
input_grad = G.FusedBatchNormGrad(self.epsilon, self.momentum)
target_cpu = False
if self.target == "CPU":
input_grad = G.FusedBatchNormGradCPU(self.epsilon, self.momentum)
target_cpu = True
def bprop(x, scale, b, mean, variance, out, dout):
saved_mean = out[3]
saved_variance = out[4]
if target_cpu:
out = input_grad(dout[0], x, scale, b, saved_mean, saved_variance)
else:
out = input_grad(dout[0], x, scale, saved_mean, saved_variance)
dx = out[0]
dscale = out[1]

View File

@ -540,6 +540,22 @@ class FusedBatchNormGrad(Primitive):
raise NotImplementedError
class FusedBatchNormGradCPU(PrimitiveWithInfer):
"""Gradients of FusedBatchNorm operation for CPU."""
@prim_attr_register
def __init__(self, epsilon=0.0, momentum=0.1):
self.init_prim_io_names(inputs=['dy', 'x', 'scale', 'bias', 'save_mean', 'save_inv_variance'],
outputs=['dx', 'bn_scale', 'bn_bias'])
self.add_prim_attr('data_format', "NCHW")
def infer_shape(self, dy_shape, x_shape, scale_shape, bias_shape, save_mean_shape, save_inv_variance_shape):
return (x_shape, scale_shape, bias_shape)
def infer_dtype(self, dy_type, x_type, scale_type, bias_type, save_mean_type, save_inv_variance_type):
return (x_type, scale_type, bias_type)
class FusedBatchNormGradEx(PrimitiveWithInfer):
"""Gradients of FusedBatchNormEx operation."""

View File

@ -640,6 +640,7 @@ class FusedBatchNorm(Primitive):
self.epsilon = validator.check_float_range(epsilon, 0, 1, Rel.INC_RIGHT, 'epsilon', self.name)
self.momentum = validator.check_float_range(momentum, 0, 1, Rel.INC_BOTH, 'momentum', self.name)
self._update_parameter = True
self.target = context.get_context("device_target")
class FusedBatchNormEx(PrimitiveWithInfer):

View File

@ -0,0 +1,60 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.common.api import ms_function
from mindspore.ops import operations as P
from mindspore.ops.composite import GradOperation
context.set_context(mode=context.GRAPH_MODE, device_target='CPU')
class Grad(nn.Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.grad = GradOperation(get_all=True, sens_param=True)
self.network = network
@ms_function
def construct(self, input_, output_grad):
return self.grad(self.network)(input_, output_grad)
class Net(nn.Cell):
def __init__(self):
super(Net, self).__init__()
self.ops = P.Abs()
def construct(self, x):
return self.ops(x)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_net():
x = np.random.randn(2, 3, 3, 4).astype(np.float32)
y_expect = np.abs(x)
net = Net()
out = net(Tensor(x))
assert (out.asnumpy() == y_expect).all()
sens = np.random.randn(2, 3, 3, 4).astype(np.float32)
backward_net = Grad(Net())
output = backward_net(Tensor(x), Tensor(sens))
print(len(output))
print(output[0].asnumpy())

View File

@ -80,3 +80,39 @@ def test_train_forward():
bn_net = Batchnorm_Net(2, Tensor(weight), Tensor(bias), Tensor(moving_mean), Tensor(moving_var_init))
bn_net.set_train(False)
output = bn_net(Tensor(x))
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_train_backward():
x = np.array([[
[[1, 3, 3, 5], [2, 4, 6, 8], [3, 6, 7, 7], [4, 3, 8, 2]],
[[5, 7, 6, 3], [3, 5, 6, 7], [9, 4, 2, 5], [7, 5, 8, 1]]]]).astype(np.float32)
grad = np.array([[
[[1, 2, 7, 1], [4, 2, 1, 3], [1, 6, 5, 2], [2, 4, 3, 2]],
[[9, 4, 3, 5], [1, 3, 7, 6], [5, 7, 9, 9], [1, 4, 6, 8]]]]).astype(np.float32)
expect_output = np.array([[[[-0.69126546, -0.32903028, 1.9651246, -0.88445705],
[0.6369296, -0.37732816, -0.93275493, -0.11168876],
[-0.7878612, 1.3614, 0.8542711, -0.52222186],
[-0.37732816, 0.5886317, -0.11168876, -0.28073236]],
[[1.6447213, -0.38968924, -1.0174079, -0.55067265],
[-2.4305856, -1.1751484, 0.86250514, 0.5502673],
[0.39576983, 0.5470243, 1.1715001, 1.6447213],
[-1.7996241, -0.7051701, 0.7080077, 0.5437813]]]]).astype(np.float32)
weight = Tensor(np.ones(2).astype(np.float32))
bias = Tensor(np.ones(2).astype(np.float32))
moving_mean = Tensor(np.ones(2).astype(np.float32))
moving_var_init = Tensor(np.ones(2).astype(np.float32))
error = np.ones(shape=[1, 2, 4, 4]) * 1.0e-6
context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
bn_net = Batchnorm_Net(2, weight, bias, moving_mean, moving_var_init)
bn_net.set_train()
bn_grad = Grad(bn_net)
output = bn_grad(Tensor(x), Tensor(grad))
diff = output[0].asnumpy() - expect_output
assert np.all(diff < error)
assert np.all(-diff < error)
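The expect_output checked above follows the standard training-mode batch-norm input gradient. A NumPy sketch (illustrative only, not part of the commit; it assumes per-channel statistics over the N/H/W axes and the op's default eps of 1e-5):

import numpy as np

def batchnorm_dx(x, dy, gamma, eps=1e-5):
    axes = (0, 2, 3)                                   # per-channel statistics, NCHW layout
    mean = x.mean(axis=axes, keepdims=True)
    var = x.var(axis=axes, keepdims=True)
    x_hat = (x - mean) / np.sqrt(var + eps)
    g = gamma.reshape(1, -1, 1, 1)
    return (g / np.sqrt(var + eps)) * (dy - dy.mean(axis=axes, keepdims=True)
                                       - x_hat * (dy * x_hat).mean(axis=axes, keepdims=True))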

View File

@ -0,0 +1,76 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.common.dtype as mstype
import mindspore.context as context
from mindspore.common.tensor import Tensor
from mindspore.nn import Cell
from mindspore.ops import operations as P
class Net(Cell):
def __init__(self, dtype):
super(Net, self).__init__()
self.Cast = P.Cast()
self.dtype = dtype
def construct(self, x):
return self.Cast(x, self.dtype)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_cast_int32():
x0 = Tensor(np.random.uniform(-2, 2, (3, 2)).astype(np.float32))
x1 = Tensor(np.random.uniform(-2, 2, (3, 2)).astype(np.int32))
x2 = Tensor(np.random.uniform(-2, 2, (3, 2)).astype(np.bool))
t = mstype.int32
context.set_context(mode=context.GRAPH_MODE, device_target='CPU')
net = Net(t)
output = net(x0)
type0 = output.asnumpy().dtype
assert type0 == 'int32'
output = net(x1)
type1 = output.asnumpy().dtype
assert type1 == 'int32'
output = net(x2)
type2 = output.asnumpy().dtype
assert type2 == 'int32'
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_cast_float32():
x0 = Tensor(np.random.uniform(-2, 2, (3, 2)).astype(np.float32))
x1 = Tensor(np.random.uniform(-2, 2, (3, 2)).astype(np.int32))
x2 = Tensor(np.random.uniform(-2, 2, (3, 2)).astype(np.bool))
t = mstype.float32
context.set_context(mode=context.GRAPH_MODE, device_target='CPU')
net = Net(t)
output = net(x0)
type0 = output.asnumpy().dtype
assert type0 == 'float32'
output = net(x1)
type1 = output.asnumpy().dtype
assert type1 == 'float32'
output = net(x2)
type2 = output.asnumpy().dtype
assert type2 == 'float32'

View File

@ -0,0 +1,56 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
class NetExp(nn.Cell):
def __init__(self):
super(NetExp, self).__init__()
self.exp = P.Exp()
def construct(self, x):
return self.exp(x)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_exp():
x0_np = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(np.float32)
x1_np = np.random.uniform(-2, 2, 1).astype(np.float32)
x0 = Tensor(x0_np)
x1 = Tensor(x1_np)
expect0 = np.exp(x0_np)
expect1 = np.exp(x1_np)
error0 = np.ones(shape=expect0.shape) * 1.0e-5
error1 = np.ones(shape=expect1.shape) * 1.0e-5
context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
exp = NetExp()
output0 = exp(x0)
diff0 = output0.asnumpy() - expect0
assert np.all(diff0 < error0)
assert output0.shape == expect0.shape
output1 = exp(x1)
diff1 = output1.asnumpy() - expect1
assert np.all(diff1 < error1)
assert output1.shape == expect1.shape

View File

@ -0,0 +1,83 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
class Net(nn.Cell):
def __init__(self):
super(Net, self).__init__()
self.ops = P.Less()
def construct(self, x, y):
return self.ops(x, y)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu_training
@pytest.mark.env_onecard
def test_net():
x0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
y0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
x1_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
y1_np = np.random.randint(1, 5, (2, 1, 4, 4)).astype(np.float32)
x2_np = np.random.randint(1, 5, (2, 1, 1, 4)).astype(np.float32)
y2_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
x3_np = np.random.randint(1, 5, 1).astype(np.float32)
y3_np = np.random.randint(1, 5, 1).astype(np.float32)
x4_np = np.array(768).astype(np.float32)
y4_np = np.array(3072.5).astype(np.float32)
x0 = Tensor(x0_np)
y0 = Tensor(y0_np)
x1 = Tensor(x1_np)
y1 = Tensor(y1_np)
x2 = Tensor(x2_np)
y2 = Tensor(y2_np)
x3 = Tensor(x3_np)
y3 = Tensor(y3_np)
x4 = Tensor(x4_np)
y4 = Tensor(y4_np)
context.set_context(mode=context.GRAPH_MODE, device_target='CPU')
net = Net()
out = net(x0, y0).asnumpy()
expect = x0_np < y0_np
assert np.all(out == expect)
assert out.shape == expect.shape
out = net(x1, y1).asnumpy()
expect = x1_np < y1_np
assert np.all(out == expect)
assert out.shape == expect.shape
out = net(x2, y2).asnumpy()
expect = x2_np < y2_np
assert np.all(out == expect)
assert out.shape == expect.shape
out = net(x3, y3).asnumpy()
expect = x3_np < y3_np
assert np.all(out == expect)
assert out.shape == expect.shape
out = net(x4, y4).asnumpy()
expect = x4_np < y4_np
assert np.all(out == expect)
assert out.shape == expect.shape

View File

@ -0,0 +1,56 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
class NetLog(nn.Cell):
def __init__(self):
super(NetLog, self).__init__()
self.log = P.Log()
def construct(self, x):
return self.log(x)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_log():
x0_np = np.random.uniform(1, 2, (2, 3, 4, 4)).astype(np.float32)
x1_np = np.random.uniform(1, 2, 1).astype(np.float32)
x0 = Tensor(x0_np)
x1 = Tensor(x1_np)
expect0 = np.log(x0_np)
expect1 = np.log(x1_np)
error0 = np.ones(shape=expect0.shape) * 1.0e-5
error1 = np.ones(shape=expect1.shape) * 1.0e-5
context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
log = NetLog()
output0 = log(x0)
output1 = log(x1)
diff0 = output0.asnumpy() - expect0
assert np.all(diff0 < error0)
assert output0.shape == expect0.shape
diff1 = output1.asnumpy() - expect1
assert np.all(diff1 < error1)
assert output1.shape == expect1.shape

View File

@ -16,38 +16,53 @@
import numpy as np
import pytest
import mindspore.context as context
import mindspore.common.dtype as mstype
import mindspore.nn as nn
from mindspore import Tensor
from mindspore import Tensor, context
from mindspore.common.api import ms_function
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter
from mindspore.ops import operations as P
x = np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(np.float32)
y = np.random.uniform(-2, 2, (1, 1, 1, 1)).astype(np.float32)
context.set_context(device_target='CPU')
context.set_context(mode=context.GRAPH_MODE, device_target='CPU')
class Net(nn.Cell):
def __init__(self):
super(Net, self).__init__()
self.mul = P.Mul()
self.x = Parameter(initializer(Tensor(x), x.shape), name='x3')
self.y = Parameter(initializer(Tensor(y), y.shape), name='y3')
@ms_function
def construct(self):
return self.mul(self.x, self.y)
def construct(self, x, y):
return self.mul(x, y)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_Mul():
def test_mul():
x0 = Tensor(np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(np.float32))
y0 = Tensor(np.random.uniform(-2, 2, (1, 1, 1, 1)).astype(np.float32))
x1 = Tensor(np.random.uniform(-2, 2, (1, 3, 1, 4)).astype(np.float32))
y1 = Tensor(np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(np.float32))
x2 = Tensor(np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(np.float32))
y2 = Tensor(2, mstype.float32)
mul = Net()
output = mul()
print(x)
print(y)
print(output)
out = mul(x0, y0).asnumpy()
exp = x0.asnumpy() * y0.asnumpy()
diff = np.abs(out - exp)
err = np.ones(shape=exp.shape) * 1.0e-5
assert np.all(diff < err)
assert out.shape == exp.shape
out = mul(x1, y1).asnumpy()
exp = x1.asnumpy() * y1.asnumpy()
diff = np.abs(out - exp)
err = np.ones(shape=exp.shape) * 1.0e-5
assert np.all(diff < err)
assert out.shape == exp.shape
out = mul(x2, y2).asnumpy()
exp = x2.asnumpy() * y2.asnumpy()
diff = np.abs(out - exp)
err = np.ones(shape=exp.shape) * 1.0e-5
assert np.all(diff < err)
assert out.shape == exp.shape

View File

@ -0,0 +1,60 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.common.api import ms_function
from mindspore.ops import operations as P
from mindspore.ops.composite import GradOperation
context.set_context(mode=context.GRAPH_MODE, device_target='CPU')
class Grad(nn.Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.grad = GradOperation(get_all=True, sens_param=True)
self.network = network
@ms_function
def construct(self, input_, output_grad):
return self.grad(self.network)(input_, output_grad)
class Net(nn.Cell):
def __init__(self):
super(Net, self).__init__()
self.ops = P.Neg()
def construct(self, x):
return self.ops(x)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_net():
x = np.random.randn(2, 3, 3, 4).astype(np.float32)
y_expect = -x
net = Net()
out = net(Tensor(x))
assert (out.asnumpy() == y_expect).all()
sens = np.random.randn(2, 3, 3, 4).astype(np.float32)
backward_net = Grad(Net())
output = backward_net(Tensor(x), Tensor(sens))
print(len(output))
print(output[0].asnumpy())

View File

@ -0,0 +1,58 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
class Net(nn.Cell):
def __init__(self):
super(Net, self).__init__()
self.ops = P.Pow()
def construct(self, x, y):
return self.ops(x, y)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu_training
@pytest.mark.env_onecard
def test_net():
x0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
y0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
x1_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
y1_np = np.array(3).astype(np.float32)
x0 = Tensor(x0_np)
y0 = Tensor(y0_np)
x1 = Tensor(x1_np)
y1 = Tensor(y1_np)
context.set_context(mode=context.GRAPH_MODE, device_target='CPU')
net = Net()
out = net(x0, y0).asnumpy()
expect = np.power(x0_np, y0_np)
assert np.all(out == expect)
assert out.shape == expect.shape
out = net(x1, y1).asnumpy()
expect = np.power(x1_np, y1_np)
assert np.all(out == expect)
assert out.shape == expect.shape

View File

@ -0,0 +1,95 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
class NetRealDiv(nn.Cell):
def __init__(self):
super(NetRealDiv, self).__init__()
self.divide = P.RealDiv()
def construct(self, x, y):
return self.divide(x, y)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu_training
@pytest.mark.env_onecard
def test_real_div():
x0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
y0_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
x1_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
y1_np = np.random.randint(1, 5, (2, 1, 4, 4)).astype(np.float32)
x2_np = np.random.randint(1, 5, (2, 1, 1, 4)).astype(np.float32)
y2_np = np.random.randint(1, 5, (2, 3, 4, 4)).astype(np.float32)
x3_np = np.random.randint(1, 5, 1).astype(np.float32)
y3_np = np.random.randint(1, 5, 1).astype(np.float32)
x4_np = np.array(768).astype(np.float32)
y4_np = np.array(3072.5).astype(np.float32)
x0 = Tensor(x0_np)
y0 = Tensor(y0_np)
x1 = Tensor(x1_np)
y1 = Tensor(y1_np)
x2 = Tensor(x2_np)
y2 = Tensor(y2_np)
x3 = Tensor(x3_np)
y3 = Tensor(y3_np)
x4 = Tensor(x4_np)
y4 = Tensor(y4_np)
context.set_context(mode=context.GRAPH_MODE, device_target='CPU')
real_div = NetRealDiv()
output0 = real_div(x0, y0)
expect0 = np.divide(x0_np, y0_np)
diff0 = output0.asnumpy() - expect0
error0 = np.ones(shape=expect0.shape) * 1.0e-5
assert np.all(diff0 < error0)
assert output0.shape == expect0.shape
output1 = real_div(x1, y1)
expect1 = np.divide(x1_np, y1_np)
diff1 = output1.asnumpy() - expect1
error1 = np.ones(shape=expect1.shape) * 1.0e-5
assert np.all(diff1 < error1)
assert output1.shape == expect1.shape
output2 = real_div(x2, y2)
expect2 = np.divide(x2_np, y2_np)
diff2 = output2.asnumpy() - expect2
error2 = np.ones(shape=expect2.shape) * 1.0e-5
assert np.all(diff2 < error2)
assert output2.shape == expect2.shape
output3 = real_div(x3, y3)
expect3 = np.divide(x3_np, y3_np)
diff3 = output3.asnumpy() - expect3
error3 = np.ones(shape=expect3.shape) * 1.0e-5
assert np.all(diff3 < error3)
assert output3.shape == expect3.shape
output4 = real_div(x4, y4)
expect4 = np.divide(x4_np, y4_np)
diff4 = output4.asnumpy() - expect4
error4 = np.ones(shape=expect4.shape) * 1.0e-5
assert np.all(diff4 < error4)
assert output4.shape == expect4.shape

View File

@ -20,7 +20,9 @@ import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
from mindspore.ops.operations import _grad_ops as G
context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
class NetReLU6(nn.Cell):
def __init__(self):
@ -30,6 +32,13 @@ class NetReLU6(nn.Cell):
def construct(self, x):
return self.relu6(x)
class NetReLU6Grad(nn.Cell):
def __init__(self):
super(NetReLU6Grad, self).__init__()
self.relu6_grad = G.ReLU6Grad()
def construct(self, x, dy):
return self.relu6_grad(dy, x)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@ -42,7 +51,26 @@ def test_relu6():
[5.9, 6, 6,],
[6, 1, 0.]]]]).astype(np.float32)
context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
relu6 = NetReLU6()
output = relu6(x)
assert (output.asnumpy() == expect).all()
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_relu6_grad():
x = Tensor(np.array([[[[-1, 1, 10],
[5.9, 6.1, 6],
[10, 1, -1]]]]).astype(np.float32))
dy = Tensor(np.array([[[[1, 1, 1],
[1, 1, 1],
[1, 1, 1]]]]).astype(np.float32))
expect = np.array([[[[0, 1, 0,],
[1, 0, 1,],
[0, 1, 0,]]]]).astype(np.float32)
error = np.ones(shape=[3, 3]) * 1.0e-6
relu6_grad = NetReLU6Grad()
output = relu6_grad(x, dy)
diff = np.abs(output.asnumpy() - expect)
assert np.all(np.abs(diff) < error)
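The expected matrix corresponds to the usual ReLU6 gradient mask; a one-line NumPy reference (not part of the commit; the behaviour at exactly x == 6 is read off the expected values above):

expect_ref = dy.asnumpy() * ((x.asnumpy() > 0) & (x.asnumpy() <= 6)).astype(np.float32)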

View File

@ -49,5 +49,5 @@ def test_relu_grad():
output = relu_grad()
expect = np.array([[[[0, 0, 1,], [0, 0, 0,], [1, 1, 0.]]]]).astype(np.float32)
error = np.ones(shape=[3, 3]) * 1.0e-6
diff = output.asnumpy() - expect
diff = np.abs(output.asnumpy() - expect)
assert np.all(diff < error)

View File

@ -0,0 +1,78 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
from mindspore.ops.operations import _grad_ops as G
context.set_context(mode=context.GRAPH_MODE, device_target='CPU')
class NetSigmoidGrad(nn.Cell):
def __init__(self):
super(NetSigmoidGrad, self).__init__()
self.sigmoid_grad = G.SigmoidGrad()
def construct(self, y, dy):
return self.sigmoid_grad(y, dy)
class Net(nn.Cell):
def __init__(self):
super(Net, self).__init__()
self.ops = P.Sigmoid()
def construct(self, x):
return self.ops(x)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_net():
x = np.random.randn(2, 3, 3, 4).astype(np.float32)
y_expect = 1 / (1 + np.exp(-x))
net = Net()
out = net(Tensor(x))
diff = out.asnumpy() - y_expect
err = np.ones(shape=y_expect.shape) * 1.0e-5
assert np.all(diff < err)
assert out.shape == y_expect.shape
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_sigmoid_grad():
y = Tensor(np.array([[[[-1, 1, 2],
[1, -1, 1],
[2, 1, -1]]]]).astype(np.float32))
dy = Tensor(np.array([[[[-11, 2, 4],
[-1, 1, -1],
[-4, 4, -4]]]]).astype(np.float32))
expect = np.array([[[[22, 0, -8],
[0, -2, 0],
[8, 0, 8]]]]).astype(np.float32)
error = np.ones(shape=[1, 1, 3, 3]) * 1.0e-6
sigmoid_grad = NetSigmoidGrad()
output = sigmoid_grad(y, dy)
diff = np.abs(output.asnumpy() - expect)
assert np.all(abs(diff) < error)
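The expected values here are consistent with the usual sigmoid gradient dout * y * (1 - y); a NumPy check (not part of the commit):

expect_ref = dy.asnumpy() * y.asnumpy() * (1 - y.asnumpy())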

View File

@ -0,0 +1,75 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
from mindspore.ops.operations import _grad_ops as G
context.set_context(mode=context.GRAPH_MODE, device_target='CPU')
class NetSqrtGrad(nn.Cell):
def __init__(self):
super(NetSqrtGrad, self).__init__()
self.sqrt_grad = G.SqrtGrad()
def construct(self, x, dx):
return self.sqrt_grad(x, dx)
class Net(nn.Cell):
def __init__(self):
super(Net, self).__init__()
self.ops = P.Sqrt()
def construct(self, x):
return self.ops(x)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_net():
x = np.abs(np.random.randn(2, 3, 3, 4)).astype(np.float32)
y_expect = np.sqrt(x)
net = Net()
out = net(Tensor(x))
diff = out.asnumpy() - y_expect
err = np.ones(shape=y_expect.shape) * 1.0e-5
assert np.all(diff < err)
assert out.shape == y_expect.shape
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_sqrt_grad():
x = Tensor(np.array([[[[-1, 1, 10],
[5.9, 6.1, 6],
[10, 1, -1]]]]).astype(np.float32))
dx = Tensor(np.array([[[[1, 1, 1],
[2, 2, 2],
[3, 3, 3]]]]).astype(np.float32))
expect = np.array([[[[-0.5, 0.5, 0.05,],
[0.16949153, 0.16393442, 0.16666667,],
[0.15, 1.5, -1.5,]]]]).astype(np.float32)
error = np.ones(shape=[3, 3]) * 1.0e-6
sqrt_grad = NetSqrtGrad()
output = sqrt_grad(x, dx)
diff = np.abs(output.asnumpy() - expect)
assert np.all(np.abs(diff) < error)
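The expected matrix matches dx / (2 * x) element-wise, i.e. the sqrt gradient evaluated at the first input; a NumPy check (not part of the commit):

expect_ref = dx.asnumpy() / (2 * x.asnumpy())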

View File

@ -0,0 +1,63 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.common.api import ms_function
from mindspore.ops import operations as P
from mindspore.ops.composite import GradOperation
context.set_context(mode=context.GRAPH_MODE, device_target='CPU')
class Grad(nn.Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.grad = GradOperation(get_all=True, sens_param=True)
self.network = network
@ms_function
def construct(self, input_, output_grad):
return self.grad(self.network)(input_, output_grad)
class Net(nn.Cell):
def __init__(self):
super(Net, self).__init__()
self.ops = P.Square()
def construct(self, x):
return self.ops(x)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_net():
x = np.random.randn(2, 3, 3, 4).astype(np.float32)
y_expect = x * x
net = Net()
out = net(Tensor(x))
diff = out.asnumpy() - y_expect
err = np.ones(shape=y_expect.shape) * 1.0e-5
assert np.all(diff < err)
assert out.shape == y_expect.shape
sens = np.random.randn(2, 3, 3, 4).astype(np.float32)
backward_net = Grad(Net())
output = backward_net(Tensor(x), Tensor(sens))
print(len(output))
print(output[0].asnumpy())

View File

@ -0,0 +1,63 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.common.api import ms_function
from mindspore.ops import operations as P
from mindspore.ops.composite import GradOperation
context.set_context(mode=context.GRAPH_MODE, device_target='CPU')
class Grad(nn.Cell):
def __init__(self, network):
super(Grad, self).__init__()
self.grad = GradOperation(get_all=True, sens_param=True)
self.network = network
@ms_function
def construct(self, input_, output_grad):
return self.grad(self.network)(input_, output_grad)
class Net(nn.Cell):
def __init__(self):
super(Net, self).__init__()
self.ops = P.Tanh()
def construct(self, x):
return self.ops(x)
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_net():
x = np.random.randn(2, 3, 3, 4).astype(np.float32)
y_expect = np.tanh(x)
net = Net()
out = net(Tensor(x))
diff = out.asnumpy() - y_expect
err = np.ones(shape=y_expect.shape) * 1.0e-5
assert np.all(diff < err)
assert out.shape == y_expect.shape
sens = np.random.randn(2, 3, 3, 4).astype(np.float32)
backward_net = Grad(Net())
output = backward_net(Tensor(x), Tensor(sens))
print(len(output))
print(output[0].asnumpy())

View File

@ -13,12 +13,15 @@
# limitations under the License.
# ============================================================================
import pytest
import numpy as np
from mindspore import Tensor
from mindspore.ops import operations as P
import pytest
import mindspore.common.dtype as mstype
import mindspore.nn as nn
import mindspore.context as context
from mindspore import Tensor, context
from mindspore.ops import operations as P
context.set_context(mode=context.GRAPH_MODE, device_target='CPU')
class TensorAdd(nn.Cell):
def __init__(self):
@ -34,10 +37,30 @@ class TensorAdd(nn.Cell):
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
def test_tensor_add():
x = np.arange(1 * 3 * 3 * 3).reshape(1, 3, 3, 3).astype(np.float32)
y = np.arange(1 * 3 * 3 * 3).reshape(1, 3, 3, 3).astype(np.float32)
context.set_context(mode=context.GRAPH_MODE, device_target='CPU')
x0 = Tensor(np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(np.float32))
y0 = Tensor(np.random.uniform(-2, 2, (1, 1, 1, 1)).astype(np.float32))
x1 = Tensor(np.random.uniform(-2, 2, (1, 3, 1, 4)).astype(np.float32))
y1 = Tensor(np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(np.float32))
x2 = Tensor(np.random.uniform(-2, 2, (2, 3, 4, 4)).astype(np.float32))
y2 = Tensor(2, mstype.float32)
add = TensorAdd()
output = add(Tensor(x), Tensor(y))
assert (output.asnumpy() == x + y).all()
out = add(x0, y0).asnumpy()
exp = x0.asnumpy() + y0.asnumpy()
diff = np.abs(out - exp)
err = np.ones(shape=exp.shape) * 1.0e-5
assert np.all(diff < err)
assert out.shape == exp.shape
out = add(x1, y1).asnumpy()
exp = x1.asnumpy() + y1.asnumpy()
diff = np.abs(out - exp)
err = np.ones(shape=exp.shape) * 1.0e-5
assert np.all(diff < err)
assert out.shape == exp.shape
out = add(x2, y2).asnumpy()
exp = x2.asnumpy() + y2.asnumpy()
diff = np.abs(out - exp)
err = np.ones(shape=exp.shape) * 1.0e-5
assert np.all(diff < err)
assert out.shape == exp.shape