diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/broadcast_to_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/broadcast_to_cpu_kernel.cc index 6d7cb494f74..792e3a3360e 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/broadcast_to_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/broadcast_to_cpu_kernel.cc @@ -15,80 +15,42 @@ */ #include "backend/kernel_compiler/cpu/broadcast_to_cpu_kernel.h" +#include "nnacl/errorcode.h" namespace mindspore { namespace kernel { - template void BroadcastToCPUKernel::InitKernel(const CNodePtr &kernel_node) { MS_EXCEPTION_IF_NULL(kernel_node); input_shape_ = AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, 0); output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); + size_t input_shape_size = input_shape_.size(); + size_t output_shape_size = output_shape_.size(); - size_t offset = output_shape_.size() - input_shape_.size(); - for (size_t i = 0; i < offset; ++i) { - input_shape_.insert(input_shape_.begin(), 1); + if (output_shape_size < input_shape_size) { + MS_LOG(EXCEPTION) << "Cannot broadcast input tensor with shape " << input_shape_ + << " to a smaller dimension shape " << output_shape_ << "."; + } + if (output_shape_size > MAX_SHAPE_SIZE) { + MS_LOG(EXCEPTION) << "Cannot broadcast input tensor with shape " << input_shape_ << " to a shape " << output_shape_ + << " more than 8-D."; + } + size_t offset = output_shape_size - input_shape_size; + for (size_t i = 0; i < input_shape_size; ++i) { + if (input_shape_[i] != output_shape_[i + offset] && input_shape_[i] != 1) { + MS_LOG(EXCEPTION) << "Cannot broadcast input tensor with shape " << input_shape_ << " to a shape " + << output_shape_ << "."; + } } - for (size_t i = 0; i < input_shape_.size(); ++i) { - if (output_shape_[i] < input_shape_[i] || output_shape_[i] % input_shape_[i] != 0) { - MS_LOG(EXCEPTION) << "Cannot broadcast input tensor with shape " << input_shape_ << " to " - << "output tensor with shape " << output_shape_ - << ". 
Output shape must be the integer times of input shape at the " << i << " dim!"; - } + for (size_t i = 0; i < input_shape_size; ++i) { + shape_info_.input_shape_[i] = SizeToInt(input_shape_[i]); } - for (size_t j = 0; j < output_shape_.size(); j++) { - nums_ *= output_shape_[j]; - } - - tmp_ptr_ = reinterpret_cast(malloc(nums_ * sizeof(T))); -} - -// BroadcastTo -template -void BroadcastToCPUKernel::BroadcastToImpl(size_t dim) { - if (dim == output_shape_.size() - 1) { - size_t input_nums = 1; - for (size_t j = 0; j < input_shape_.size() - 1; ++j) { - input_nums *= input_shape_[j]; - } - size_t rate = output_shape_[dim] / input_shape_[dim]; - - for (size_t j = 0; j < input_nums; ++j) { - T *in_ptr = input_ptr_ + input_shape_[dim] * j; - for (size_t i = 0; i < rate; ++i) { - T *out_ptr = tmp_ptr_ + (j * rate + i) * input_shape_[dim]; - memcpy_s(out_ptr, input_shape_[dim] * sizeof(T), in_ptr, input_shape_[dim] * sizeof(T)); - } - } - size_t elems = input_shape_[dim] * rate * input_nums; - memcpy_s(output_ptr_, elems * sizeof(T), tmp_ptr_, elems * sizeof(T)); - return; - } - - BroadcastToImpl(dim + 1); - - size_t rate = output_shape_[dim] / input_shape_[dim]; - if (rate > 1) { - size_t elems_nums = 1; - for (size_t j = output_shape_.size() - 1; j > dim; --j) { - elems_nums *= output_shape_[j]; - } - size_t input_nums = 1; - for (size_t j = 0; j < dim; ++j) { - input_nums *= input_shape_[j]; - } - - for (size_t j = 0; j < input_nums; ++j) { - T *in_ptr = output_ptr_ + elems_nums * j; - for (size_t i = 0; i < rate; ++i) { - T *out_ptr = tmp_ptr_ + (j * rate + i) * elems_nums; - memcpy_s(out_ptr, elems_nums * sizeof(T), in_ptr, elems_nums * sizeof(T)); - } - } - size_t elems = elems_nums * rate * input_nums; - memcpy_s(output_ptr_, elems * sizeof(T), tmp_ptr_, elems * sizeof(T)); + for (size_t i = 0; i < output_shape_size; ++i) { + shape_info_.output_shape_[i] = SizeToInt(output_shape_[i]); } + shape_info_.input_shape_size_ = SizeToInt(input_shape_size); + shape_info_.output_shape_size_ = SizeToInt(output_shape_size); } template @@ -96,25 +58,33 @@ bool BroadcastToCPUKernel::Launch(const std::vector &inputs, cons const std::vector &outputs) { if (inputs.size() != 1 || outputs.size() != 1) { MS_LOG(EXCEPTION) << "Wrong number of inputs or outputs!"; - return false; } - if ((inputs[0] == nullptr) || (inputs[0]->size == 0)) { MS_LOG(EXCEPTION) << "Input data is NULL!"; - return false; } - if ((outputs[0] == nullptr) || (outputs[0]->size == 0)) { MS_LOG(EXCEPTION) << "Output data is NULL!"; - return false; } - input_ptr_ = reinterpret_cast(inputs[0]->addr); - output_ptr_ = reinterpret_cast(outputs[0]->addr); + const auto input_addr = reinterpret_cast(inputs[0]->addr); + auto output_addr = reinterpret_cast(outputs[0]->addr); + int ret = NNACL_ERR; + if constexpr (std::is_same_v) { + ret = BroadcastTo(bool, input_addr, &shape_info_, output_addr); + } else if constexpr (std::is_same_v) { + ret = BroadcastTo(int, input_addr, &shape_info_, output_addr); + } else if constexpr (std::is_same_v) { + ret = BroadcastTo(float, input_addr, &shape_info_, output_addr); + } else { + MS_LOG(EXCEPTION) << "Not supported data type for BroadcastTo."; + } - BroadcastToImpl(0); - - return true; + if (ret == NNACL_OK) { + return true; + } + MS_LOG(ERROR) << "Broadcast tensor with shape " << input_shape_ << " to shape " << output_shape_ + << " execute failed."; + return false; } } // namespace kernel diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/broadcast_to_cpu_kernel.h 
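For reference, the rewritten Launch resolves the element type at compile time and hands the work to one of the per-type nnacl entry points declared in nnacl/base/broadcast_to.h. A minimal standalone sketch of that dispatch, assuming only those three declarations (DispatchBroadcast itself is illustrative, not part of the patch):

#include <type_traits>

struct BroadcastShapeInfo;  // defined in nnacl/broadcast_to_parameter.h

extern "C" {
int broadcast_to_int(const int *input, BroadcastShapeInfo *shape_info, int *output);
int broadcast_to_float(const float *input, BroadcastShapeInfo *shape_info, float *output);
int broadcast_to_bool(const bool *input, BroadcastShapeInfo *shape_info, bool *output);
}

// Illustrative wrapper mirroring the "if constexpr" chain in BroadcastToCPUKernel::Launch:
// the element type picks one of the C entry points at compile time, no runtime branching on type.
template <typename T>
int DispatchBroadcast(const T *input, BroadcastShapeInfo *shape_info, T *output) {
  if constexpr (std::is_same_v<T, bool>) {
    return broadcast_to_bool(input, shape_info, output);
  } else if constexpr (std::is_same_v<T, int>) {
    return broadcast_to_int(input, shape_info, output);
  } else if constexpr (std::is_same_v<T, float>) {
    return broadcast_to_float(input, shape_info, output);
  } else {
    return -1;  // stand-in for NNACL_ERR; the kernel raises an exception for unsupported types
  }
}
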
b/mindspore/ccsrc/backend/kernel_compiler/cpu/broadcast_to_cpu_kernel.h index b535c445b93..6d9c288787c 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/broadcast_to_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/broadcast_to_cpu_kernel.h @@ -21,44 +21,32 @@ #include #include "backend/kernel_compiler/cpu/cpu_kernel.h" #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" +#include "nnacl/base/broadcast_to.h" namespace mindspore { namespace kernel { - template class BroadcastToCPUKernel : public CPUKernel { public: BroadcastToCPUKernel() = default; - ~BroadcastToCPUKernel() override { - if (tmp_ptr_ != nullptr) { - free(tmp_ptr_); - tmp_ptr_ = nullptr; - } - }; + ~BroadcastToCPUKernel() = default; bool Launch(const std::vector &inputs, const std::vector &, const std::vector &outputs) override; void InitKernel(const CNodePtr &kernel_node) override; - void BroadcastToImpl(size_t dim); - - size_t Index(const size_t &index, const size_t &dim) { return dim == 1 ? 0 : index; } - private: std::vector input_shape_; std::vector output_shape_; - size_t nums_{1}; - T *input_ptr_{nullptr}; - T *output_ptr_{nullptr}; - T *tmp_ptr_{nullptr}; + BroadcastShapeInfo shape_info_; }; -MS_REG_CPU_KERNEL(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), - BroadcastToCPUKernel); -MS_REG_CPU_KERNEL(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), - BroadcastToCPUKernel); -MS_REG_CPU_KERNEL(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool), - BroadcastToCPUKernel); +MS_REG_CPU_KERNEL_T(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32), + BroadcastToCPUKernel, float); +MS_REG_CPU_KERNEL_T(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeInt32).AddOutputAttr(kNumberTypeInt32), + BroadcastToCPUKernel, int); +MS_REG_CPU_KERNEL_T(BroadcastTo, KernelAttr().AddInputAttr(kNumberTypeBool).AddOutputAttr(kNumberTypeBool), + BroadcastToCPUKernel, bool); } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.cc index eeecddf5abe..fb9f1d88a09 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.cc @@ -18,28 +18,32 @@ #include "backend/kernel_compiler/cpu/eltwise_grad_cpu_kernel.h" #include "common/thread_pool.h" #include "runtime/device/cpu/cpu_device_address.h" +#include "nnacl/fp32_grad/activation_grad.h" +#include "nnacl/errorcode.h" namespace mindspore { namespace kernel { template void EltWiseGradCPUKernel::ReluGrad(const T *input1, const T *input2, T *out, size_t start, size_t end) { - for (size_t i = start; i < end; i++) { - if (input2[i] > 0) { - out[i] = input1[i]; - } else { - out[i] = 0; + if constexpr (std::is_same_v) { + int ret = ::ReluGrad(input1 + start, input2 + start, end - start, out + start); + if (ret == NNACL_ERR) { + MS_LOG(EXCEPTION) << "ReLUGrad failed."; } + } else { + MS_LOG(EXCEPTION) << "ReLUGrad only support float"; } } template void EltWiseGradCPUKernel::ReLU6Grad(const T *input1, const T *input2, T *out, size_t start, size_t end) { - for (size_t i = start; i < end; i++) { - if (input2[i] > 0 && input2[i] <= 6) { - out[i] = input1[i]; - } else { - out[i] = 0; + if constexpr (std::is_same_v) { + int ret = ::Relu6Grad(input1 + start, input2 + start, end - start, 
out + start); + if (ret == NNACL_ERR) { + MS_LOG(EXCEPTION) << "ReLU6Grad failed."; } + } else { + MS_LOG(EXCEPTION) << "ReLU6Grad only support float"; } } diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/CMakeLists.txt b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/CMakeLists.txt index 7263e16d793..a9b70536809 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/CMakeLists.txt +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/CMakeLists.txt @@ -30,12 +30,9 @@ file(GLOB KERNEL_SRC ${NNACL_DIR}/int8/*.c ${NNACL_DIR}/infer/*.c ${NNACL_DIR}/base/*.c + ${NNACL_DIR}/fp32_grad/*.c ) -if(SUPPORT_TRAIN) - file(GLOB TRAIN_SRC ${NNACL_DIR}/fp32_grad/*.c) -endif() - if(PLATFORM_ARM64) file(GLOB ASSEMBLY_SRC ${NNACL_DIR}/assembly/arm64/*.S) set_property(SOURCE ${ASSEMBLY_SRC} PROPERTY LANGUAGE C) diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/broadcast_to.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/broadcast_to.c new file mode 100644 index 00000000000..cd6eff53856 --- /dev/null +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/broadcast_to.c @@ -0,0 +1,95 @@ +/** + * Copyright 2021 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nnacl/base/broadcast_to.h" +#include +#include "nnacl/op_base.h" +#include "nnacl/errorcode.h" + +size_t accumulate(const int *shape, int start, int end) { + size_t product = 1; + for (int i = start; i <= end; ++i) { + product *= (size_t)shape[i]; + } + return product; +} + +void pad_input_shape(int *input_shape, int input_shape_len, int output_shape_len) { + if (input_shape_len < output_shape_len) { + const int shape_gap = output_shape_len - input_shape_len; + for (int i = input_shape_len - 1; i >= 0; --i) { + input_shape[i + shape_gap] = input_shape[i]; + } + for (int i = 0; i < shape_gap; ++i) { + input_shape[i] = 1; + } + } +} + +#define BROADCAST_TO(type) \ + int broadcast_to_##type(const type *input, BroadcastShapeInfo *shape_info, type *output) { \ + if (shape_info->output_shape_size_ > MAX_SHAPE_SIZE) { \ + return NNACL_ERR; \ + } \ + int *input_shape = shape_info->input_shape_; \ + const int *output_shape = shape_info->output_shape_; \ + const int dim_max = shape_info->output_shape_size_ - 1; \ + const size_t bool_length = 1, number_length = 4; \ + const size_t data_length = strcmp(#type, "bool") ? 
number_length : bool_length; \ + const size_t temp_length = accumulate(output_shape, 0, dim_max); \ + type *data_temp = (type *)malloc(temp_length * data_length); \ + if (data_temp == NULL) { \ + return NNACL_ERR; \ + } \ + pad_input_shape(input_shape, shape_info->input_shape_size_, dim_max + 1); \ + shape_info->input_shape_size_ = dim_max + 1; \ + \ + size_t before_dim_elements_num = accumulate(input_shape, 0, dim_max - 1); \ + size_t after_dim_elements_num = input_shape[dim_max]; \ + size_t dim_broadcast_rate = (size_t)(output_shape[dim_max] / input_shape[dim_max]); \ + for (size_t i = 0; i < before_dim_elements_num; ++i) { \ + const type *in_ptr = input + i * after_dim_elements_num; \ + for (size_t j = 0; j < dim_broadcast_rate; ++j) { \ + type *out_ptr = output + (i * dim_broadcast_rate + j) * after_dim_elements_num; \ + memcpy(out_ptr, in_ptr, after_dim_elements_num *data_length); \ + } \ + } \ + \ + int dim_index = dim_max - 1; \ + while (dim_index >= 0) { \ + dim_broadcast_rate = (size_t)(output_shape[dim_index] / input_shape[dim_index]); \ + if (dim_broadcast_rate > 1) { \ + before_dim_elements_num = accumulate(input_shape, 0, dim_index - 1); \ + after_dim_elements_num = accumulate(output_shape, dim_index + 1, dim_max); \ + for (size_t i = 0; i < before_dim_elements_num; ++i) { \ + type *in_ptr = output + i * after_dim_elements_num; \ + for (size_t j = 0; j < dim_broadcast_rate; ++j) { \ + type *out_ptr = data_temp + (i * dim_broadcast_rate + j) * after_dim_elements_num; \ + memcpy(out_ptr, in_ptr, after_dim_elements_num *data_length); \ + } \ + } \ + size_t elements_total = before_dim_elements_num * dim_broadcast_rate * after_dim_elements_num; \ + memcpy(output, data_temp, elements_total *data_length); \ + } \ + --dim_index; \ + } \ + free(data_temp); \ + return NNACL_OK; \ + } + +BROADCAST_TO(int) +BROADCAST_TO(float) +BROADCAST_TO(bool) diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/broadcast_to_fp32.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/broadcast_to.h similarity index 55% rename from mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/broadcast_to_fp32.h rename to mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/broadcast_to.h index e59c0158c9d..4092bec85f3 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/broadcast_to_fp32.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/base/broadcast_to.h @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2021 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,18 +13,20 @@ * See the License for the specific language governing permissions and * limitations under the License. 
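The BROADCAST_TO(type) macro above expands to one function per element type, all sharing the same strategy: replicate the innermost dimension directly into the output buffer, then walk the remaining dimensions from innermost to outermost, replicating the already-expanded trailing block through a scratch buffer. A minimal C++ sketch of that strategy for the float case, assuming the input shape has already been right-aligned with leading 1s as pad_input_shape does (BroadcastToFloat is illustrative, not part of the patch):

#include <cstring>
#include <vector>

// Expands `input` (shape in_shape, same rank as out_shape, padded with leading 1s) into
// `output` (shape out_shape). Shapes are assumed valid: each in_shape[i] is 1 or equals
// out_shape[i].
void BroadcastToFloat(const float *input, const std::vector<int> &in_shape,
                      const std::vector<int> &out_shape, float *output) {
  const int dim_max = static_cast<int>(out_shape.size()) - 1;
  auto prod = [](const std::vector<int> &s, int start, int end) {
    size_t p = 1;
    for (int i = start; i <= end; ++i) p *= static_cast<size_t>(s[i]);
    return p;
  };
  // Step 1: replicate the innermost dimension straight into the output buffer.
  size_t before = prod(in_shape, 0, dim_max - 1);
  size_t after = static_cast<size_t>(in_shape[dim_max]);
  size_t rate = static_cast<size_t>(out_shape[dim_max]) / in_shape[dim_max];
  for (size_t i = 0; i < before; ++i) {
    for (size_t j = 0; j < rate; ++j) {
      std::memcpy(output + (i * rate + j) * after, input + i * after, after * sizeof(float));
    }
  }
  // Step 2: walk outward; each broadcast dimension replicates the already-expanded trailing
  // block into a scratch buffer, then copies the result back into the output.
  std::vector<float> temp(prod(out_shape, 0, dim_max));
  for (int dim = dim_max - 1; dim >= 0; --dim) {
    rate = static_cast<size_t>(out_shape[dim]) / in_shape[dim];
    if (rate <= 1) continue;
    before = prod(in_shape, 0, dim - 1);
    after = prod(out_shape, dim + 1, dim_max);
    for (size_t i = 0; i < before; ++i) {
      for (size_t j = 0; j < rate; ++j) {
        std::memcpy(temp.data() + (i * rate + j) * after, output + i * after, after * sizeof(float));
      }
    }
    std::memcpy(output, temp.data(), before * rate * after * sizeof(float));
  }
}

The generated C functions perform the same in-place expansion on the raw output buffer, using a malloc'd scratch buffer that is freed on every call and returning NNACL_ERR if the allocation fails.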
*/ -#ifndef MINDSPORE_NNACL_FP32_BROADCAST_TO_FP32_H_ -#define MINDSPORE_NNACL_FP32_BROADCAST_TO_FP32_H_ +#ifndef MINDSPORE_NNACL_FP32_BROADCAST_TO_H_ +#define MINDSPORE_NNACL_FP32_BROADCAST_TO_H_ -#include "nnacl/op_base.h" #include "nnacl/broadcast_to_parameter.h" #ifdef __cplusplus extern "C" { #endif -int BroadcastTo(const float *input, BroadcastShapeInfo *shape_info, float *output); +#define BroadcastTo(type, input, shape_info, output) broadcast_to_##type(input, shape_info, output) +int broadcast_to_int(const int *input, BroadcastShapeInfo *shape_info, int *output); +int broadcast_to_float(const float *input, BroadcastShapeInfo *shape_info, float *output); +int broadcast_to_bool(const bool *input, BroadcastShapeInfo *shape_info, bool *output); #ifdef __cplusplus } #endif -#endif // MINDSPORE_NNACL_FP32_BROADCAST_TO_FP32_H_ +#endif // MINDSPORE_NNACL_FP32_BROADCAST_TO_H_ diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/broadcast_to_parameter.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/broadcast_to_parameter.h index 074dbb9111c..874c246b212 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/broadcast_to_parameter.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/broadcast_to_parameter.h @@ -20,14 +20,14 @@ typedef struct BroadcastToParameter { OpParameter op_parameter_; - int shape_[COMM_SHAPE_SIZE]; + int shape_[MAX_SHAPE_SIZE]; size_t shape_size_; } BroadcastToParameter; typedef struct BroadcastShapeInfo { - int input_shape_[COMM_SHAPE_SIZE]; + int input_shape_[MAX_SHAPE_SIZE]; int input_shape_size_; - int output_shape_[COMM_SHAPE_SIZE]; + int output_shape_[MAX_SHAPE_SIZE]; int output_shape_size_; } BroadcastShapeInfo; diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/broadcast_to_fp32.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/broadcast_to_fp32.c deleted file mode 100644 index 73202f663fe..00000000000 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32/broadcast_to_fp32.c +++ /dev/null @@ -1,103 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "nnacl/fp32/broadcast_to_fp32.h" -#include -#include "nnacl/op_base.h" -#include "nnacl/errorcode.h" - -void PadBroadcastShapeInfo(BroadcastShapeInfo *shape_info) { - if (shape_info->input_shape_size_ < DIMENSION_4D) { - int input_shape_tmp[DIMENSION_4D]; - for (int i = 0; i < shape_info->input_shape_size_; ++i) { - input_shape_tmp[i] = shape_info->input_shape_[i]; - } - int input_shape_index = shape_info->input_shape_size_ - 1; - for (int i = DIMENSION_4D - 1; i >= 0; --i) { - if (input_shape_index >= 0) { - shape_info->input_shape_[i] = input_shape_tmp[input_shape_index--]; - } else { - shape_info->input_shape_[i] = 1; - } - } - } - if (shape_info->output_shape_size_ < DIMENSION_4D) { - int output_shape_tmp[DIMENSION_4D]; - for (int i = 0; i < shape_info->output_shape_size_; ++i) { - output_shape_tmp[i] = shape_info->output_shape_[i]; - } - int output_shape_index = shape_info->output_shape_size_ - 1; - for (int i = DIMENSION_4D - 1; i >= 0; --i) { - if (output_shape_index >= 0) { - shape_info->output_shape_[i] = output_shape_tmp[output_shape_index--]; - } else { - shape_info->output_shape_[i] = 1; - } - } - } -} - -int BroadcastTo(const float *input, BroadcastShapeInfo *shape_info, float *output) { - if (shape_info->input_shape_size_ > DIMENSION_4D || shape_info->output_shape_size_ > DIMENSION_4D) { - return NNACL_ERR; - } - PadBroadcastShapeInfo(shape_info); - size_t input_dim_offset[DIMENSION_4D - 1]; - input_dim_offset[2] = shape_info->input_shape_[3] * 4; - input_dim_offset[1] = input_dim_offset[2] * shape_info->input_shape_[2]; - input_dim_offset[0] = input_dim_offset[1] * shape_info->input_shape_[1]; - size_t output_dim_offset[DIMENSION_4D - 1]; - output_dim_offset[2] = shape_info->output_shape_[3] * 4; - output_dim_offset[1] = output_dim_offset[2] * shape_info->output_shape_[2]; - output_dim_offset[0] = output_dim_offset[1] * shape_info->output_shape_[1]; - uint8_t *in_base = (uint8_t *)input; - uint8_t *out_base = (uint8_t *)(output); - for (int32_t dim0 = 0; dim0 < shape_info->input_shape_[0]; ++dim0) { - for (int32_t dim1 = 0; dim1 < shape_info->input_shape_[1]; ++dim1) { - for (int32_t dim2 = 0; dim2 < shape_info->input_shape_[2]; ++dim2) { - if (shape_info->input_shape_[3] == shape_info->output_shape_[3]) { - memcpy(out_base + output_dim_offset[0] * dim0 + output_dim_offset[1] * dim1 + output_dim_offset[2] * dim2, - in_base + input_dim_offset[0] * dim0 + input_dim_offset[1] * dim1 + input_dim_offset[2] * dim2, - input_dim_offset[2]); - } else { - for (int32_t dim3 = 0; dim3 < shape_info->output_shape_[3]; ++dim3) { - memcpy(out_base + output_dim_offset[0] * dim0 + output_dim_offset[1] * dim1 + output_dim_offset[2] * dim2 + - dim3 * 4, - in_base + input_dim_offset[0] * dim0 + input_dim_offset[1] * dim1 + input_dim_offset[2] * dim2, 4); - } - } - } - if (shape_info->input_shape_[2] != shape_info->output_shape_[2]) { - for (int32_t dim2 = 0; dim2 < shape_info->output_shape_[2]; ++dim2) { - memcpy(out_base + output_dim_offset[0] * dim0 + output_dim_offset[1] * dim1 + dim2 * output_dim_offset[2], - out_base + output_dim_offset[0] * dim0 + output_dim_offset[1] * dim1, output_dim_offset[2]); - } - } - } - if (shape_info->input_shape_[1] != shape_info->output_shape_[1]) { - for (int32_t dim1 = 0; dim1 < shape_info->output_shape_[1]; ++dim1) { - memcpy(out_base + output_dim_offset[0] * dim0 + output_dim_offset[1] * dim1, - out_base + output_dim_offset[0] * dim0, output_dim_offset[1]); - } - } - } - if (shape_info->input_shape_[0] != shape_info->output_shape_[0]) { - 
for (int32_t dim0 = 0; dim0 < shape_info->output_shape_[0]; ++dim0) { - memcpy(out_base + output_dim_offset[0] * dim0, out_base, output_dim_offset[0]); - } - } - return NNACL_OK; -} diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/activation_grad.c b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/activation_grad.c index ff507f917b2..488d413727b 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/activation_grad.c +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/activation_grad.c @@ -20,7 +20,7 @@ #include "nnacl/fp32_grad/activation_grad.h" #include "nnacl/errorcode.h" -inline int ReluGrad(float *src0, float *src1, size_t length, float *dst) { +int ReluGrad(const float *src0, const float *src1, size_t length, float *dst) { int i = 0; #ifdef ENABLE_ARM float32x4_t zero_4 = vdupq_n_f32(0.0f); @@ -38,7 +38,7 @@ inline int ReluGrad(float *src0, float *src1, size_t length, float *dst) { return NNACL_OK; } -int Relu6Grad(float *src0, float *src1, size_t length, float *dst) { +int Relu6Grad(const float *src0, const float *src1, size_t length, float *dst) { int i = 0; #ifdef ENABLE_ARM float32x4_t zero_4 = vdupq_n_f32(0.0f); @@ -59,28 +59,28 @@ int Relu6Grad(float *src0, float *src1, size_t length, float *dst) { return NNACL_OK; } -int LReluGrad(float *src0, float *src1, size_t length, float *dst, float alpha) { +int LReluGrad(const float *src0, const float *src1, size_t length, float *dst, float alpha) { for (size_t i = 0; i < length; ++i) { dst[i] = src1[i] > 0.0f ? src0[i] : alpha * src0[i]; } return NNACL_OK; } -int SigmoidGrad(float *src0, float *src1, size_t length, float *dst) { +int SigmoidGrad(const float *src0, const float *src1, size_t length, float *dst) { for (size_t i = 0; i < length; ++i) { dst[i] = src0[i] * (src1[i] * (1.0f - src1[i])); } return NNACL_OK; } -int TanhGrad(float *src0, float *src1, size_t length, float *dst) { +int TanhGrad(const float *src0, const float *src1, size_t length, float *dst) { for (size_t i = 0; i < length; ++i) { dst[i] = (1.0f - (src1[i] * src1[i])) * src0[i]; } return NNACL_OK; } -int HSwishGrad(float *src0, float *src1, size_t length, float *dst) { +int HSwishGrad(const float *src0, const float *src1, size_t length, float *dst) { for (size_t i = 0; i < length; ++i) { float tmp = (src1[i] > 3.0f ? 1.0f : (src1[i] < -3.0f ? 0.0f : (2.0f * src1[i] + 3.0f) / 6.0f)); dst[i] = tmp * src0[i]; @@ -88,7 +88,7 @@ int HSwishGrad(float *src0, float *src1, size_t length, float *dst) { return NNACL_OK; } -int HSigmoidGrad(float *src0, float *src1, size_t length, float *dst) { +int HSigmoidGrad(const float *src0, const float *src1, size_t length, float *dst) { for (size_t i = 0; i < length; ++i) { float tmp = (src1[i] > 3.0f ? 0.0f : (src1[i] < -3.0f ? 0.0f : 1.0f / 6.0f)); dst[i] = tmp * src0[i]; @@ -96,14 +96,14 @@ int HSigmoidGrad(float *src0, float *src1, size_t length, float *dst) { return NNACL_OK; } -int EluGrad(float *src0, float *src1, size_t length, float *dst, float alpha) { +int EluGrad(const float *src0, const float *src1, size_t length, float *dst, float alpha) { for (size_t i = 0; i < length; ++i) { dst[i] = (src1[i] > 0.0f ? 
src0[i] : alpha * expm1(src1[i]) * src0[i]); } return NNACL_OK; } -int GeluGrad(float *src0, float *src1, size_t length, float *dst) { +int GeluGrad(const float *src0, const float *src1, size_t length, float *dst) { for (size_t i = 0; i < length; ++i) { dst[i] = src0[i] * ((0.5 * (1.0 + erf(src1[i] / 1.4142135623730951))) + (src1[i] * exp(-0.5 * src1[i] * src1[i]) / 2.5066282746)); diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/activation_grad.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/activation_grad.h index 8317571386a..e88b27addb5 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/activation_grad.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/fp32_grad/activation_grad.h @@ -30,15 +30,15 @@ typedef struct ActivationGradParameter { extern "C" { #endif -int ReluGrad(float *src0, float *src1, size_t length, float *dst); -int Relu6Grad(float *src0, float *src1, size_t length, float *dst); -int LReluGrad(float *src0, float *src1, size_t length, float *dst, float alpha); -int SigmoidGrad(float *src0, float *src1, size_t length, float *dst); -int TanhGrad(float *src0, float *src1, size_t length, float *dst); -int HSwishGrad(float *src0, float *src1, size_t length, float *dst); -int HSigmoidGrad(float *src0, float *src1, size_t length, float *dst); -int EluGrad(float *src0, float *src1, size_t length, float *dst, float alpha); -int GeluGrad(float *src0, float *src1, size_t length, float *dst); +int ReluGrad(const float *src0, const float *src1, size_t length, float *dst); +int Relu6Grad(const float *src0, const float *src1, size_t length, float *dst); +int LReluGrad(const float *src0, const float *src1, size_t length, float *dst, float alpha); +int SigmoidGrad(const float *src0, const float *src1, size_t length, float *dst); +int TanhGrad(const float *src0, const float *src1, size_t length, float *dst); +int HSwishGrad(const float *src0, const float *src1, size_t length, float *dst); +int HSigmoidGrad(const float *src0, const float *src1, size_t length, float *dst); +int EluGrad(const float *src0, const float *src1, size_t length, float *dst, float alpha); +int GeluGrad(const float *src0, const float *src1, size_t length, float *dst); #ifdef __cplusplus } diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/broadcast_to_infer.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/broadcast_to_infer.h index 90d818efea7..5688984a969 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/broadcast_to_infer.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/nnacl/infer/broadcast_to_infer.h @@ -17,7 +17,7 @@ #define MINDSPORE_NNACL_BROADCAST_TO_INFER_H #include "nnacl/infer/common_infer.h" -#include "nnacl/fp32/broadcast_to_fp32.h" +#include "nnacl/base/broadcast_to.h" #ifdef __cplusplus extern "C" { diff --git a/mindspore/ccsrc/cxx_api/CMakeLists.txt b/mindspore/ccsrc/cxx_api/CMakeLists.txt index bfb4ad08c97..b8913c76a03 100644 --- a/mindspore/ccsrc/cxx_api/CMakeLists.txt +++ b/mindspore/ccsrc/cxx_api/CMakeLists.txt @@ -57,7 +57,7 @@ else() endif() if(ENABLE_CPU) - target_link_libraries(mindspore_shared_lib PRIVATE mindspore::dnnl mindspore::mkldnn) + target_link_libraries(mindspore_shared_lib PRIVATE mindspore::dnnl mindspore::mkldnn nnacl) endif() if(USE_GLOG) diff --git a/mindspore/lite/src/ops/populate/broadcast_to_populate.cc b/mindspore/lite/src/ops/populate/broadcast_to_populate.cc index 33d8817d5ba..427c74d1847 100644 --- 
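The const-qualified grad kernels keep their original element-wise behaviour; only the pointer qualifiers change. For reference, a scalar sketch of the two kernels now called from EltWiseGradCPUKernel (the nnacl versions add a NEON fast path under ENABLE_ARM; the *Ref names are illustrative, not part of the patch):

#include <cstddef>

// dy: incoming gradient, x: forward-pass input, dx: resulting gradient.
void ReluGradRef(const float *dy, const float *x, size_t length, float *dx) {
  for (size_t i = 0; i < length; ++i) {
    dx[i] = x[i] > 0.0f ? dy[i] : 0.0f;
  }
}

void Relu6GradRef(const float *dy, const float *x, size_t length, float *dx) {
  for (size_t i = 0; i < length; ++i) {
    dx[i] = (x[i] > 0.0f && x[i] <= 6.0f) ? dy[i] : 0.0f;
  }
}

This matches the expectation in the new test_relu6_grad_op.py case below: entries where x lies in (0, 6] (including the 6s) pass the incoming gradient through, while 0, negative values and 8 produce 0.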
a/mindspore/lite/src/ops/populate/broadcast_to_populate.cc +++ b/mindspore/lite/src/ops/populate/broadcast_to_populate.cc @@ -14,7 +14,7 @@ * limitations under the License. */ #include "src/ops/populate/populate_register.h" -#include "nnacl/fp32/broadcast_to_fp32.h" +#include "nnacl/base/broadcast_to.h" using mindspore::schema::PrimitiveType_BroadcastTo; namespace mindspore { diff --git a/mindspore/lite/src/ops/populate/v0/broadcast_to_populate_v0.cc b/mindspore/lite/src/ops/populate/v0/broadcast_to_populate_v0.cc index 2d9a37bad62..fa78b3af3d1 100644 --- a/mindspore/lite/src/ops/populate/v0/broadcast_to_populate_v0.cc +++ b/mindspore/lite/src/ops/populate/v0/broadcast_to_populate_v0.cc @@ -16,7 +16,7 @@ #include "schema/model_v0_generated.h" #include "src/ops/populate/populate_register.h" -#include "nnacl/fp32/broadcast_to_fp32.h" +#include "nnacl/base/broadcast_to.h" namespace mindspore { namespace lite { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/broadcast_to_fp32.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/broadcast_to_fp32.cc index dbd664bbe2b..f1ce54d2959 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/broadcast_to_fp32.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/broadcast_to_fp32.cc @@ -49,10 +49,10 @@ int BroadcastToCPUKernel::Init() { } int BroadcastToCPUKernel::Run() { - auto input_data = reinterpret_cast(in_tensors_.at(0)->MutableData()); + const auto input_data = reinterpret_cast(in_tensors_.at(0)->MutableData()); auto output_data = reinterpret_cast(out_tensors_.at(0)->MutableData()); - return BroadcastTo(input_data, &shape_info_, output_data); + return BroadcastTo(float, input_data, &shape_info_, output_data); } REG_KERNEL(kCPU, kNumberTypeFloat32, PrimitiveType_BroadcastTo, LiteKernelCreator) diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/broadcast_to_fp32.h b/mindspore/lite/src/runtime/kernel/arm/fp32/broadcast_to_fp32.h index 9415079d532..c54dc4407de 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/broadcast_to_fp32.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/broadcast_to_fp32.h @@ -19,7 +19,7 @@ #include #include "src/lite_kernel.h" -#include "nnacl/fp32/broadcast_to_fp32.h" +#include "nnacl/base/broadcast_to.h" namespace mindspore::kernel { class BroadcastToCPUKernel : public LiteKernel { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32_grad/activation_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp32_grad/activation_grad.cc index 9619ea4f77d..62caf197e98 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32_grad/activation_grad.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32_grad/activation_grad.cc @@ -45,8 +45,8 @@ int ActivationGradCPUKernel::Init() { int ActivationGradCPUKernel::ReSize() { return RET_OK; } int ActivationGradCPUKernel::DoActivation(int task_id) { - auto yt_addr = reinterpret_cast(in_tensors_.at(0)->MutableData()); - auto input_addr = reinterpret_cast(in_tensors_.at(1)->MutableData()); + const auto yt_addr = reinterpret_cast(in_tensors_.at(0)->MutableData()); + const auto input_addr = reinterpret_cast(in_tensors_.at(1)->MutableData()); auto output_addr = reinterpret_cast(out_tensors_.at(0)->MutableData()); int length = in_tensors_.at(0)->ElementsNum(); diff --git a/tests/st/ops/cpu/test_broadcast_to_op.py b/tests/st/ops/cpu/test_broadcast_to_op.py index 05b0969c0d6..bb749b6e4b7 100644 --- a/tests/st/ops/cpu/test_broadcast_to_op.py +++ b/tests/st/ops/cpu/test_broadcast_to_op.py @@ -33,6 +33,24 @@ def test_broadcast(): expect = np.broadcast_to(x_np, shape) assert 
np.allclose(output.asnumpy(), expect) + shape = (3, 5, 7, 4, 5, 6) + x_np = np.arange(20).reshape((4, 5, 1)).astype(np.int32) + output = P.BroadcastTo(shape)(Tensor(x_np)) + expect = np.broadcast_to(x_np, shape) + assert np.allclose(output.asnumpy(), expect) + + shape = (8, 5, 7, 4, 5, 6) + x_np = np.arange(24).reshape((1, 4, 1, 6)).astype(np.bool) + 0.2 + output = P.BroadcastTo(shape)(Tensor(x_np)) + expect = np.broadcast_to(x_np, shape) + assert np.allclose(output.asnumpy(), expect) + + shape = (4, 5, 2, 3, 4, 5, 6) + x_np = np.random.rand(2, 3, 1, 5, 1).astype(np.float32) + output = P.BroadcastTo(shape)(Tensor(x_np)) + expect = np.broadcast_to(x_np, shape) + assert np.allclose(output.asnumpy(), expect) + shape = (3, 4, 5, 6) x_np = np.random.rand(3, 1, 5, 1).astype(np.float32) output = P.BroadcastTo(shape)(Tensor(x_np)) @@ -50,6 +68,12 @@ def test_broadcast(): expect = np.broadcast_to(x1_np, shape) assert np.allclose(output.asnumpy(), expect) + shape = (4, 5) + x1_np = np.ones((1,)).astype(np.bool_) + output = P.BroadcastTo(shape)(Tensor(x1_np)) + expect = np.broadcast_to(x1_np, shape) + assert np.allclose(output.asnumpy(), expect) + @pytest.mark.level0 @pytest.mark.platform_x86_gpu_training diff --git a/tests/st/ops/cpu/test_relu6_grad_op.py b/tests/st/ops/cpu/test_relu6_grad_op.py new file mode 100644 index 00000000000..b5e2725319a --- /dev/null +++ b/tests/st/ops/cpu/test_relu6_grad_op.py @@ -0,0 +1,53 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +import numpy as np +import pytest + +import mindspore.context as context +import mindspore.nn as nn +from mindspore import Tensor +from mindspore.common.initializer import initializer +from mindspore.common.parameter import Parameter +from mindspore.ops.operations import _grad_ops as G + +context.set_context(mode=context.GRAPH_MODE, device_target='CPU') + + +class NetReluGrad(nn.Cell): + def __init__(self): + super(NetReluGrad, self).__init__() + self.relu6_grad = G.ReLU6Grad() + self.x = Parameter(initializer(Tensor(np.array([[[[1, 0, 6], + [-2, 3, 6], + [-3, 1, 8]]]]).astype(np.float32)), [1, 1, 3, 3]), name='x') + self.dy = Parameter(initializer(Tensor(np.array([[[[1, 2, 3], + [4, 5, 6], + [7, 8, 9]]]]).astype(np.float32)), [1, 1, 3, 3]), name='dy') + + def construct(self): + return self.relu6_grad(self.dy, self.x) + + +@pytest.mark.level0 +@pytest.mark.platform_x86_cpu +@pytest.mark.env_onecard +def test_relu_grad(): + relu_grad = NetReluGrad() + output = relu_grad() + expect = np.array([[[[1, 0, 3], [0, 5, 6], [0, 8, 0]]]]).astype(np.float32) + error = np.ones(shape=[3, 3]) * 1.0e-6 + diff = np.abs(output.asnumpy() - expect) + assert np.all(diff < error) diff --git a/tests/st/ops/cpu/test_relu_grad_op.py b/tests/st/ops/cpu/test_relu_grad_op.py index e76eaae87df..82c821351c6 100644 --- a/tests/st/ops/cpu/test_relu_grad_op.py +++ b/tests/st/ops/cpu/test_relu_grad_op.py @@ -29,7 +29,7 @@ context.set_context(mode=context.GRAPH_MODE, device_target='CPU') class NetReluGrad(nn.Cell): def __init__(self): super(NetReluGrad, self).__init__() - self.rekuGrad = G.ReluGrad() + self.relu_grad = G.ReluGrad() self.x = Parameter(initializer(Tensor(np.array([[[[-1, 1, 1], [1, -1, 1], [1, 1, -1]]]]).astype(np.float32)), [1, 1, 3, 3]), name='x') @@ -38,7 +38,7 @@ class NetReluGrad(nn.Cell): [1, 1, 1]]]]).astype(np.float32)), [1, 1, 3, 3]), name='dy') def construct(self): - return self.rekuGrad(self.dy, self.x) + return self.relu_grad(self.dy, self.x) @pytest.mark.level0 @@ -47,7 +47,7 @@ class NetReluGrad(nn.Cell): def test_relu_grad(): relu_grad = NetReluGrad() output = relu_grad() - expect = np.array([[[[0, 0, 1,], [0, 0, 0,], [1, 1, 0.]]]]).astype(np.float32) + expect = np.array([[[[0, 0, 1], [0, 0, 0], [1, 1, 0]]]]).astype(np.float32) error = np.ones(shape=[3, 3]) * 1.0e-6 diff = np.abs(output.asnumpy() - expect) assert np.all(diff < error)
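Taken together, the InitKernel changes earlier in the patch and the added BroadcastTo test cases pin down the broadcast rule the CPU kernel now enforces up front: shapes are right-aligned, each input dimension must equal the corresponding output dimension or be 1, and the output rank is capped at 8 (MAX_SHAPE_SIZE). A compact C++ sketch of that check (IsBroadcastable is illustrative, not part of the patch):

#include <cstddef>
#include <vector>

bool IsBroadcastable(const std::vector<size_t> &in_shape, const std::vector<size_t> &out_shape) {
  // Output rank must not shrink and must stay within MAX_SHAPE_SIZE (8).
  if (out_shape.size() < in_shape.size() || out_shape.size() > 8) {
    return false;
  }
  // Right-align the input shape against the output shape, then compare dimension by dimension.
  const size_t offset = out_shape.size() - in_shape.size();
  for (size_t i = 0; i < in_shape.size(); ++i) {
    if (in_shape[i] != out_shape[i + offset] && in_shape[i] != 1) {
      return false;
    }
  }
  return true;
}

Under this rule the added (4, 5, 1) -> (3, 5, 7, 4, 5, 6) and (2, 3, 1, 5, 1) -> (4, 5, 2, 3, 4, 5, 6) cases pass, whereas broadcasting to a smaller rank or to a mismatched non-1 dimension now fails in InitKernel rather than at copy time.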