!24061 support cpu profiling and cpu ops code check

Merge pull request !24061 from zhangbuxue/support_cpu_profiling_and_code_check_for_cpu_ops
2021-09-27 01:32:34 +00:00 · 2021-09-27 01:32:34 +00:00 · cacf8427dd
parent 2138dd1d70 5418a45752
commit cacf8427dd
19 changed files with 218 additions and 155 deletions
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.cc
@ -25,6 +25,13 @@

 namespace mindspore {
 namespace kernel {
+namespace {
+constexpr size_t kSizeFloat32 = sizeof(float);
+constexpr size_t kScalarIndex = 0;
+constexpr size_t kAdamWeightDecayInputsNum = 9;
+constexpr size_t kAdamWeightDecayOutputsNum = 3;
+}  // namespace
+
 template <typename T>
 void AdamWeightDecayCPUKernel::LaunchAdamWeightDecay(const std::vector<AddressPtr> &inputs,
                                                     const std::vector<AddressPtr> &) {
@ -83,26 +90,15 @@ void AdamWeightDecayCPUKernel::LaunchAdamWeightDecayNnacl(const std::vector<Addr

 void AdamWeightDecayCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
-  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
+  kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
-  if (input_num != kAdamWeightDecayInputNum) {
-    MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but AdamWeightDecay needs 9 inputs.";
-  }
-  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
-  if (output_num != kAdamWeightDecayOutputNum) {
-    MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but AdamWeightDecay needs 3 outputs.";
-  }
 }

 bool AdamWeightDecayCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> &,
                                      const std::vector<kernel::AddressPtr> &outputs) {
-  if (inputs.size() != kAdamWeightDecayInputNum) {
-    MS_LOG(EXCEPTION) << "Input number is " << inputs.size() << ", but AdamWeightDecay needs 9 inputs.";
-  }
-  if (outputs.size() != kAdamWeightDecayOutputNum) {
-    MS_LOG(EXCEPTION) << "Output number is " << outputs.size() << ", but AdamWeightDecay needs 3 outputs.";
-  }
+  CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAdamWeightDecayInputsNum, kernel_name_);
+  CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAdamWeightDecayOutputsNum, kernel_name_);
  if (inputs[VAR]->size != inputs[M]->size || inputs[VAR]->size != inputs[V]->size ||
      inputs[VAR]->size != inputs[GRAD]->size) {
    MS_LOG(EXCEPTION) << "Var, m, v, grad input data size must be same!";
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.h
@ -25,11 +25,6 @@

 namespace mindspore {
 namespace kernel {
-constexpr size_t kSizeFloat32 = sizeof(float);
-constexpr size_t kScalarIndex = 0;
-constexpr size_t kAdamWeightDecayInputNum = 9;
-constexpr size_t kAdamWeightDecayOutputNum = 3;
-
 class AdamWeightDecayCPUKernel : public CPUKernel {
 public:
  AdamWeightDecayCPUKernel() = default;
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.cc
@ -16,16 +16,17 @@

 #include "backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h"

-#include <vector>
-
-#include "runtime/device/cpu/cpu_device_address.h"
-
 namespace mindspore {
 namespace kernel {
+namespace {
+constexpr size_t kDepthToSpaceInputsNum = 1;
+constexpr size_t kDepthToSpaceOutputsNum = 1;
+}  // namespace
+
 template <typename T>
 void DepthToSpaceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
-  CheckParam(kernel_node);
+  kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
  input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
  block_size_ = LongToSize(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size"));
@ -35,6 +36,8 @@ template <typename T>
 bool DepthToSpaceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> & /* workspace */,
                                      const std::vector<kernel::AddressPtr> &outputs) {
+  CHECK_KERNEL_INPUTS_NUM(inputs.size(), kDepthToSpaceInputsNum, kernel_name_);
+  CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kDepthToSpaceOutputsNum, kernel_name_);
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
  size_t size = inputs[0]->size / sizeof(T);
@ -73,17 +76,5 @@ bool DepthToSpaceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inp
  CPUKernelUtils::ParallelFor(task, size);
  return true;
 }
-
-template <typename T>
-void DepthToSpaceCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
-  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
-  if (input_num != 1) {
-    MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input.";
-  }
-  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
-  if (output_num != 1) {
-    MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output.";
-  }
-}
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h
@ -17,8 +17,6 @@
 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_

-#include <memory>
-#include <string>
 #include <vector>

 #include "backend/kernel_compiler/cpu/cpu_kernel.h"
@ -37,7 +35,6 @@ class DepthToSpaceCPUKernel : public CPUKernel {
              const std::vector<AddressPtr> &outputs) override;

 private:
-  void CheckParam(const CNodePtr &kernel_node);
  std::vector<size_t> input_shape_;
  std::vector<size_t> output_shape_;
  size_t block_size_{0};
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/iou_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/iou_cpu_kernel.cc
@ -13,29 +13,35 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#include <algorithm>
-#include <string>
+
 #include "backend/kernel_compiler/cpu/iou_cpu_kernel.h"
-#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
-#include "runtime/device/cpu/cpu_device_address.h"
-#include "utils/ms_utils.h"
+
+#include <string>
+#include <algorithm>

 namespace mindspore {
 namespace kernel {
+namespace {
+constexpr size_t kIOUInputsNum = 2;
+constexpr size_t kIOUOutputsNum = 1;
+constexpr size_t kBoxCoordinateLen = 4;
+}  // namespace
+
 template <typename T>
 void IOUCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
+  kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
  auto anchor_boxes_shape = AnfAlgo::GetInputDeviceShape(kernel_node, ANCHOR_BOXES);
  constexpr size_t BOX_SHAPE_SIZE = 2;
  constexpr size_t BOX_SIZE_INDEX = 0;
  constexpr size_t BOX_COORDINATE_INDEX = 1;

-  if (anchor_boxes_shape.size() != BOX_SHAPE_SIZE || anchor_boxes_shape[BOX_COORDINATE_INDEX] != BOX_COORDINATE_LEN) {
+  if (anchor_boxes_shape.size() != BOX_SHAPE_SIZE || anchor_boxes_shape[BOX_COORDINATE_INDEX] != kBoxCoordinateLen) {
    MS_LOG(EXCEPTION) << "The anchor_boxes shape should be [N, 4].";
  }
  anchor_boxes_size_ = anchor_boxes_shape[BOX_SIZE_INDEX];
  auto gt_boxes_shape = AnfAlgo::GetInputDeviceShape(kernel_node, GT_BOXES);
-  if (gt_boxes_shape.size() != BOX_SHAPE_SIZE || gt_boxes_shape[BOX_COORDINATE_INDEX] != BOX_COORDINATE_LEN) {
+  if (gt_boxes_shape.size() != BOX_SHAPE_SIZE || gt_boxes_shape[BOX_COORDINATE_INDEX] != kBoxCoordinateLen) {
    MS_LOG(EXCEPTION) << "The gt_boxes shape should be [N, 4].";
  }
  gt_boxes_size_ = gt_boxes_shape[BOX_SIZE_INDEX];
@ -52,12 +58,8 @@ void IOUCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
 template <typename T>
 bool IOUCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
                             const std::vector<kernel::AddressPtr> &outputs) {
-  if (inputs.size() != INPUT_NUMS) {
-    MS_LOG(EXCEPTION) << "Input number is " << inputs.size() << ", but IOU needs " << INPUT_NUMS << " inputs.";
-  }
-  if (outputs.size() != OUTPUT_NUMS) {
-    MS_LOG(EXCEPTION) << "Output number is " << outputs.size() << ", but IOU needs " << OUTPUT_NUMS << " outputs.";
-  }
+  CHECK_KERNEL_INPUTS_NUM(inputs.size(), kIOUInputsNum, kernel_name_);
+  CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kIOUOutputsNum, kernel_name_);
  auto anchor_boxes = reinterpret_cast<T *>(inputs[ANCHOR_BOXES]->addr);
  auto gt_boxes = reinterpret_cast<T *>(inputs[GT_BOXES]->addr);
  auto iou_score = reinterpret_cast<T *>(outputs[IOU_VALUE]->addr);
@ -71,8 +73,8 @@ bool IOUCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, cons
    constexpr size_t X1_SHIFT = 2;
    constexpr size_t Y1_SHIFT = 3;
    for (size_t i = start; i < end; i++) {
-      size_t idx1 = i % anchor_boxes_size_ * BOX_COORDINATE_LEN;
-      size_t idx2 = i / anchor_boxes_size_ * BOX_COORDINATE_LEN;
+      size_t idx1 = i % anchor_boxes_size_ * kBoxCoordinateLen;
+      size_t idx2 = i / anchor_boxes_size_ * kBoxCoordinateLen;
      T I_x0 = std::max(anchor_boxes[idx1], gt_boxes[idx2]);
      T I_y0 = std::max(anchor_boxes[idx1 + Y0_SHIFT], gt_boxes[idx2 + Y0_SHIFT]);
      T I_x1 = std::min(anchor_boxes[idx1 + X1_SHIFT], gt_boxes[idx2 + X1_SHIFT]);
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/iou_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/iou_cpu_kernel.h
@ -17,16 +17,12 @@
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_IOU_CPU_KERNEL_H_

 #include <vector>
-#include <memory>
+
 #include "backend/kernel_compiler/cpu/cpu_kernel.h"
 #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

 namespace mindspore {
 namespace kernel {
-constexpr size_t INPUT_NUMS = 2;
-constexpr size_t OUTPUT_NUMS = 1;
-constexpr size_t BOX_COORDINATE_LEN = 4;
-
 template <typename T>
 class IOUCPUKernel : public CPUKernel {
 public:
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/l2_normalize_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/l2_normalize_cpu_kernel.cc
@ -15,18 +15,33 @@
 */

 #include "backend/kernel_compiler/cpu/l2_normalize_cpu_kernel.h"
-#include "runtime/device/cpu/cpu_device_address.h"
+
+#include <utility>
+#include <limits>

 namespace mindspore {
 namespace kernel {
+namespace {
+constexpr size_t kL2NormalizeInputsNum = 1;
+constexpr size_t kL2NormalizeOutputsNum = 1;
+}  // namespace
+
 template <typename T>
 void L2NormalizeCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
-  epsilon_ = static_cast<T>(AnfAlgo::GetNodeAttr<float>(kernel_node, "epsilon"));
-  axis_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "axis"));
+  kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
+  epsilon_ = static_cast<T>(AnfAlgo::GetNodeAttr<float>(kernel_node, EPSILON));
+  axis_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS));
  input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);
-  CheckParam(kernel_node);
+
+  int dims = SizeToInt(input_shape_.size());
+  if (axis_ < -dims || axis_ >= dims) {
+    MS_LOG(EXCEPTION) << "Attr axis_ " << axis_ << " must be in " << -dims << "~" << dims;
+  }
+  if (epsilon_ == (T)0.0) {
+    MS_LOG(EXCEPTION) << "Attr epsilon can not be zero.";
+  }
  if (axis_ < 0) {
    axis_ += SizeToInt(input_shape_.size());
  }
@ -112,6 +127,8 @@ template <typename T>
 bool L2NormalizeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                     const std::vector<kernel::AddressPtr> & /* workspace */,
                                     const std::vector<kernel::AddressPtr> &outputs) {
+  CHECK_KERNEL_INPUTS_NUM(inputs.size(), kL2NormalizeInputsNum, kernel_name_);
+  CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kL2NormalizeOutputsNum, kernel_name_);
  auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
  auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);

@ -131,24 +148,5 @@ bool L2NormalizeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inpu

  return true;
 }
-
-template <typename T>
-void L2NormalizeCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
-  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
-  int dims = SizeToInt(input_shape_.size());
-  if (input_num != 1) {
-    MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but L2NormalizeCPUKernel needs 1 input.";
-  }
-  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
-  if (output_num != 1) {
-    MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but L2NormalizeCPUKernel needs 1 output.";
-  }
-  if (axis_ < -dims || axis_ >= dims) {
-    MS_LOG(EXCEPTION) << "Attr axis_ " << axis_ << " must be in " << -dims << "~" << dims;
-  }
-  if (epsilon_ == (T)0.0) {
-    MS_LOG(EXCEPTION) << "Attr epsilon can not be zero.";
-  }
-}
 }  // namespace kernel
 }  // namespace mindspore
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/l2_normalize_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/l2_normalize_cpu_kernel.h
@ -16,10 +16,10 @@

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2_NORMALIZE_CPU_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2_NORMALIZE_CPU_KERNEL_H_
-#include <memory>
+
 #include <vector>
-#include <limits>
-#include <utility>
+#include <memory>
+
 #include "backend/kernel_compiler/cpu/cpu_kernel.h"
 #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/l2normalize_grad_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/l2normalize_grad_cpu_kernel.cc
@ -15,15 +15,19 @@
 */

 #include "backend/kernel_compiler/cpu/l2normalize_grad_cpu_kernel.h"
-#include "runtime/device/cpu/cpu_device_address.h"

 namespace mindspore {
 namespace kernel {
+namespace {
+constexpr size_t kL2NormalizeGradInputsNum = 3;
+constexpr size_t kL2NormalizeGradOutputsNum = 1;
+}  // namespace
+
 template <typename T>
 void L2NormalizeGradCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
-  CheckIONumber(kernel_node);
-  for (size_t i = 0; i < INPUT_SIZE; i++) {
+  kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
+  for (size_t i = 0; i < kL2NormalizeGradInputsNum; i++) {
    (void)input_shape_list_.emplace_back(AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i));
  }
  auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
@ -45,6 +49,8 @@ template <typename T>
 bool L2NormalizeGradCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs,
                                         const std::vector<AddressPtr> &workspace,
                                         const std::vector<AddressPtr> &outputs) {
+  CHECK_KERNEL_INPUTS_NUM(inputs.size(), kL2NormalizeGradInputsNum, kernel_name_);
+  CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kL2NormalizeGradOutputsNum, kernel_name_);
  auto input_x = reinterpret_cast<T *>(inputs[0]->addr);
  auto y = reinterpret_cast<T *>(inputs[1]->addr);
  auto dout = reinterpret_cast<T *>(inputs[2]->addr);
@ -78,18 +84,6 @@ void L2NormalizeGradCPUKernel<T>::CheckInputShape(const std::vector<size_t> &out
  }
 }

-template <typename T>
-void L2NormalizeGradCPUKernel<T>::CheckIONumber(const CNodePtr &kernel_node) {
-  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
-  if (input_num != INPUT_SIZE) {
-    MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but L2NormalizeGradCPUKernel needs 3 input.";
-  }
-  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
-  if (output_num != OUTPUT_SIZE) {
-    MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but L2NormalizeGradCPUKernel needs 1 output.";
-  }
-}
-
 template <typename T>
 std::vector<size_t> L2NormalizeGradCPUKernel<T>::OneDimIndexToHighDimIndex(size_t one_dim_index) {
  std::vector<size_t> high_dim_index;
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/l2normalize_grad_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/l2normalize_grad_cpu_kernel.h
@ -13,17 +13,17 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
+
 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2NORMALIZE_GRAD_CPU_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2NORMALIZE_GRAD_CPU_KERNEL_H_

 #include <vector>
+
 #include "backend/kernel_compiler/cpu/cpu_kernel.h"
 #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

 namespace mindspore {
 namespace kernel {
-constexpr size_t INPUT_SIZE = 3;
-constexpr size_t OUTPUT_SIZE = 1;
 template <typename T>
 class L2NormalizeGradCPUKernel : public CPUKernel {
 public:
@ -37,7 +37,6 @@ class L2NormalizeGradCPUKernel : public CPUKernel {

 private:
  void CheckInputShape(const std::vector<size_t> &output_shape);
-  void CheckIONumber(const CNodePtr &kernel_node);
  std::vector<size_t> OneDimIndexToHighDimIndex(size_t one_dim_index);
  void HighDimIndexToOneDimIndex(size_t *one_dim_index, const std::vector<size_t> &high_dim_index);
  std::vector<T> GetVector(const std::vector<size_t> &high_dim_index, const T *x);
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_cpu_kernel.cc
@ -15,22 +15,18 @@
 */

 #include "backend/kernel_compiler/cpu/masked_select_cpu_kernel.h"
-#include "runtime/device/cpu/cpu_device_address.h"

 namespace mindspore {
 namespace kernel {
+namespace {
+constexpr size_t kMaskedSelectInputsNum = 2;
+constexpr size_t kMaskedSelectOutputsNum = 1;
+}  // namespace
+
 template <typename T>
 void MaskedSelectCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
-  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
-  if (input_num != kInputNum) {
-    MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but MaskedSelectCPUKernel needs " << kInputNum
-                      << " input.";
-  }
-  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
-  if (output_num != kOutputNum) {
-    MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but MaskedSelectCPUKernel needs " << kOutputNum
-                      << " output.";
-  }
+  MS_EXCEPTION_IF_NULL(kernel_node);
+  kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
  input_shape_a_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
  input_shape_b_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
  output_shape_ = CPUKernelUtils::GetBroadcastShape(input_shape_a_, input_shape_b_);
@ -44,6 +40,8 @@ template <typename T>
 bool MaskedSelectCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                      const std::vector<kernel::AddressPtr> &,
                                      const std::vector<kernel::AddressPtr> &outputs) {
+  CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMaskedSelectInputsNum, kernel_name_);
+  CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMaskedSelectOutputsNum, kernel_name_);
  auto x = reinterpret_cast<T *>(inputs[0]->addr);
  auto mask = reinterpret_cast<bool *>(inputs[1]->addr);
  auto y = reinterpret_cast<T *>(outputs[0]->addr);
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_cpu_kernel.h
@ -16,16 +16,14 @@

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_CPU_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_CPU_KERNEL_H_
-#include <memory>
-#include <unordered_map>
+
 #include <vector>
+
 #include "backend/kernel_compiler/cpu/cpu_kernel.h"
 #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

 namespace mindspore {
 namespace kernel {
-constexpr size_t kInputNum = 2;
-constexpr size_t kOutputNum = 1;
 template <typename T>
 class MaskedSelectCPUKernel : public CPUKernel {
 public:
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.cc
@ -15,22 +15,18 @@
 */

 #include "backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.h"
-#include "runtime/device/cpu/cpu_device_address.h"

 namespace mindspore {
 namespace kernel {
+namespace {
+constexpr size_t kMaskedSelectGradInputsNum = 3;
+constexpr size_t kMaskedSelectGradOutputsNum = 1;
+}  // namespace
+
 template <typename T>
 void MaskedSelectGradCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
-  size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
-  if (input_num != kInputNum) {
-    MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but MaskedSelectGradCPUKernel needs " << kInputNum
-                      << " input.";
-  }
-  size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
-  if (output_num != kOutputNum) {
-    MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but MaskedSelectGradCPUKernel needs " << kOutputNum
-                      << " output.";
-  }
+  MS_EXCEPTION_IF_NULL(kernel_node);
+  kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
  input_shape_a_ = AnfAlgo::GetInputDeviceShape(kernel_node, INPUT);
  input_shape_b_ = AnfAlgo::GetInputDeviceShape(kernel_node, MASK);
  grad_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, GRAD);
@ -44,6 +40,8 @@ template <typename T>
 bool MaskedSelectGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                          const std::vector<kernel::AddressPtr> &,
                                          const std::vector<kernel::AddressPtr> &outputs) {
+  CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMaskedSelectGradInputsNum, kernel_name_);
+  CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMaskedSelectGradOutputsNum, kernel_name_);
  auto mask = reinterpret_cast<bool *>(inputs[MASK]->addr);
  auto grad = reinterpret_cast<T *>(inputs[GRAD]->addr);
  auto dx = reinterpret_cast<T *>(outputs[INPUT]->addr);
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.h
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.h
@ -16,16 +16,14 @@

 #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_GRAD_CPU_KERNEL_H_
 #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_GRAD_CPU_KERNEL_H_
-#include <memory>
-#include <unordered_map>
+
 #include <vector>
+
 #include "backend/kernel_compiler/cpu/cpu_kernel.h"
 #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

 namespace mindspore {
 namespace kernel {
-constexpr size_t kInputNum = 3;
-constexpr size_t kOutputNum = 1;
 template <typename T>
 class MaskedSelectGradCPUKernel : public CPUKernel {
 public:
--- a/mindspore/ccsrc/backend/kernel_compiler/cpu/pad_and_shift_cpu_kernel.cc
+++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/pad_and_shift_cpu_kernel.cc
@ -15,13 +15,17 @@
 */

 #include "backend/kernel_compiler/cpu/pad_and_shift_cpu_kernel.h"
-#include <string>
-#include "runtime/device/cpu/cpu_device_address.h"

 namespace mindspore {
 namespace kernel {
+namespace {
+constexpr size_t kPadAndShiftInputsNum = 3;
+constexpr size_t kPadAndShiftOutputsNum = 1;
+}  // namespace
+
 void PadAndShiftCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
+  kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
  node_wpt_ = kernel_node;
  input_x_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
  type_size_ = GetTypeByte(TypeIdToType(input_x_dtype_));
@ -41,13 +45,14 @@ void PadAndShiftCPUKernel::InitKernel(const CNodePtr &kernel_node) {
 bool PadAndShiftCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                  const std::vector<kernel::AddressPtr> &,
                                  const std::vector<kernel::AddressPtr> &outputs) {
+  CHECK_KERNEL_INPUTS_NUM(inputs.size(), kPadAndShiftInputsNum, kernel_name_);
+  CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kPadAndShiftOutputsNum, kernel_name_);
  if (input_x_dtype_ == kNumberTypeInt32) {
    LaunchKernel<int>(inputs, outputs);
  } else if (input_x_dtype_ == kNumberTypeInt64) {
    LaunchKernel<int64_t>(inputs, outputs);
  } else {
-    MS_LOG(ERROR) << "Dtype of input_x only support int32, int64";
-    return false;
+    MS_LOG(EXCEPTION) << "Dtype of input_x only support int32, int64";
  }
  return true;
 }
--- a/mindspore/profiler/profiling.py
+++ b/mindspore/profiler/profiling.py
@ -122,7 +122,10 @@ class Profiler:

    def __init__(self, **kwargs):
        if c_expression.security.enable_security():
-            raise Runtime("Profiler is not supported if compiled with \'-s on\'")
+            raise RuntimeError("Profiler is not supported if compiled with \'-s on\'")
+
+        if context.get_context("mode") == context.PYNATIVE_MODE:
+            raise RuntimeError("Profiler is not supported in PyNative mode")

        # get device_id and device_target
        self._get_devid_rankid_and_devtarget()
@ -643,7 +646,7 @@ class Profiler:
            dev_id = "0"
            logger.warning("Fail to get DEVICE_ID, use 0 instead.")

-        if device_target and device_target not in ["Ascend", "GPU"]:
+        if device_target and device_target not in ["Ascend", "GPU", "CPU"]:
            msg = "Profiling: unsupported backend: %s" % device_target
            raise RuntimeError(msg)

--- a/tests/st/profiler/test_ascend_profiler.py
+++ b/tests/st/profiler/test_ascend_profiler.py
@ -23,6 +23,7 @@ from mindspore import Tensor
 from mindspore.ops import operations as P
 from mindspore.profiler import Profiler

+
 class Net(nn.Cell):
    def __init__(self):
        super(Net, self).__init__()
@ -35,6 +36,7 @@ class Net(nn.Cell):
 x = np.random.randn(1, 3, 3, 4).astype(np.float32)
 y = np.random.randn(1, 3, 3, 4).astype(np.float32)

+
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
--- a/tests/st/profiler/test_cpu_profiler.py
+++ b/tests/st/profiler/test_cpu_profiler.py
@ -0,0 +1,70 @@
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""test cpu profiler"""
+import os
+import shutil
+import sys
+
+import numpy as np
+import pytest
+
+import mindspore.context as context
+import mindspore.nn as nn
+from mindspore import Tensor
+from mindspore.ops import operations as P
+from mindspore.profiler import Profiler
+
+
+class Net(nn.Cell):
+    def __init__(self):
+        super(Net, self).__init__()
+        self.add = P.Add()
+
+    def construct(self, x_, y_):
+        return self.add(x_, y_)
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_cpu
+@pytest.mark.env_onecard
+@pytest.mark.security_off
+def test_cpu_profiling():
+    if sys.platform != 'linux':
+        return
+    data_path = os.path.join(os.getcwd(), 'data_cpu_profiler')
+    if os.path.isdir(data_path):
+        shutil.rmtree(data_path)
+    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
+    device_id = context.get_context("device_id")
+    profiler = Profiler(output_path="data_cpu_profiler")
+    x = np.random.randn(1, 3, 3, 4).astype(np.float32)
+    y = np.random.randn(1, 3, 3, 4).astype(np.float32)
+    add = Net()
+    add(Tensor(x), Tensor(y))
+    profiler.analyse()
+
+    assert os.path.isdir(data_path)
+    assert len(os.listdir(data_path)) == 1
+
+    profiler_dir = os.path.join(data_path, f"{os.listdir(data_path)[0]}/")
+    op_detail_file = f"{profiler_dir}cpu_op_detail_info_{device_id}.csv"
+    op_type_file = f"{profiler_dir}cpu_op_type_info_{device_id}.csv"
+    timeline_file = f"{profiler_dir}cpu_op_execute_timestamp_{device_id}.txt"
+    cpu_profiler_files = (op_detail_file, op_type_file, timeline_file)
+    for file in cpu_profiler_files:
+        assert os.path.isfile(file)
+
+    if os.path.isdir(data_path):
+        shutil.rmtree(data_path)
--- a/tests/st/profiler/test_profiler.py
+++ b/tests/st/profiler/test_profiler.py
@ -15,6 +15,8 @@
 import os
 import shutil

+import sys
+
 from tests.security_utils import security_off_wrap
 import pytest

@ -53,6 +55,7 @@ def weight_variable():

 class LeNet5(nn.Cell):
    """Define LeNet5 network."""
+
    def __init__(self, num_class=10, channel=1):
        super(LeNet5, self).__init__()
        self.num_class = num_class
@ -86,7 +89,7 @@ class LeNet5(nn.Cell):
 def create_dataset(data_path, batch_size=32, repeat_size=1, num_parallel_workers=1):
    """create dataset for train"""
    # define dataset
-    mnist_ds = ds.MnistDataset(data_path, num_samples=batch_size*100)
+    mnist_ds = ds.MnistDataset(data_path, num_samples=batch_size * 100)

    resize_height, resize_width = 32, 32
    rescale = 1.0 / 255.0
@ -131,10 +134,26 @@ class TestProfiler:
    rank_id = int(os.getenv('RANK_ID')) if os.getenv('RANK_ID') else 0
    mnist_path = '/home/workspace/mindspore_dataset/mnist'

-    def teardown(self):
-        """ Run after each use case."""
+    @classmethod
+    def setup_class(cls):
+        """Run once before any test case in this class starts."""
        cleanup()

+    @staticmethod
+    def teardown():
+        """Run after each test case ends."""
+        cleanup()
+
+    @pytest.mark.level2
+    @pytest.mark.platform_x86_cpu
+    @pytest.mark.env_onecard
+    @security_off_wrap
+    def test_cpu_profiler(self):
+        if sys.platform != 'linux':
+            return
+        self._train_with_profiler(device_target="CPU")
+        self._check_cpu_profiling_file()
+
    @pytest.mark.level1
    @pytest.mark.platform_x86_gpu_training
    @pytest.mark.env_onecard
@ -177,12 +196,10 @@ class TestProfiler:
        getnext_file = self.profiler_path + f'minddata_getnext_profiling_{self.device_id}.txt'
        pipeline_file = self.profiler_path + f'minddata_pipeline_raw_{self.device_id}.csv'

-        assert os.path.exists(op_detail_file)
-        assert os.path.exists(op_type_file)
-        assert os.path.exists(activity_file)
-        assert os.path.exists(timeline_file)
-        assert os.path.exists(getnext_file)
-        assert os.path.exists(pipeline_file)
+        gpu_profiler_files = (op_detail_file, op_type_file, activity_file,
+                              timeline_file, getnext_file, pipeline_file)
+        for file in gpu_profiler_files:
+            assert os.path.isfile(file)

    def _check_d_profiling_file(self):
        aicore_file = self.profiler_path + f'aicore_intermediate_{self.rank_id}_detail.csv'
@ -193,10 +210,16 @@ class TestProfiler:
        queue_profiling_file = self.profiler_path + f'device_queue_profiling_{self.rank_id}.txt'
        memory_file = self.profiler_path + f'memory_usage_{self.rank_id}.pb'

-        assert os.path.exists(aicore_file)
-        assert os.path.exists(step_trace_file)
-        assert os.path.exists(timeline_file)
-        assert os.path.exists(queue_profiling_file)
-        assert os.path.exists(minddata_pipeline_file)
-        assert os.path.exists(aicpu_file)
-        assert os.path.exists(memory_file)
+        d_profiler_files = (aicore_file, step_trace_file, timeline_file, aicpu_file,
+                            minddata_pipeline_file, queue_profiling_file, memory_file)
+        for file in d_profiler_files:
+            assert os.path.isfile(file)
+
+    def _check_cpu_profiling_file(self):
+        op_detail_file = self.profiler_path + f'cpu_op_detail_info_{self.device_id}.csv'
+        op_type_file = self.profiler_path + f'cpu_op_type_info_{self.device_id}.csv'
+        timeline_file = self.profiler_path + f'cpu_op_execute_timestamp_{self.device_id}.txt'
+
+        cpu_profiler_files = (op_detail_file, op_type_file, timeline_file)
+        for file in cpu_profiler_files:
+            assert os.path.isfile(file)