From 5418a45752d744f17a27a1cd47896e6711da80c7 Mon Sep 17 00:00:00 2001 From: buxue Date: Wed, 22 Sep 2021 16:35:21 +0800 Subject: [PATCH] support cpu profiling and code check for cpu ops --- .../cpu/adam_weight_decay_cpu_kernel.cc | 24 +++---- .../cpu/adam_weight_decay_cpu_kernel.h | 5 -- .../cpu/depthtospace_cpu_kernel.cc | 25 +++---- .../cpu/depthtospace_cpu_kernel.h | 3 - .../kernel_compiler/cpu/iou_cpu_kernel.cc | 32 +++++---- .../kernel_compiler/cpu/iou_cpu_kernel.h | 6 +- .../cpu/l2_normalize_cpu_kernel.cc | 44 ++++++------ .../cpu/l2_normalize_cpu_kernel.h | 6 +- .../cpu/l2normalize_grad_cpu_kernel.cc | 24 +++---- .../cpu/l2normalize_grad_cpu_kernel.h | 5 +- .../cpu/masked_select_cpu_kernel.cc | 20 +++--- .../cpu/masked_select_cpu_kernel.h | 6 +- .../cpu/masked_select_grad_cpu_kernel.cc | 20 +++--- .../cpu/masked_select_grad_cpu_kernel.h | 6 +- .../cpu/pad_and_shift_cpu_kernel.cc | 13 ++-- mindspore/profiler/profiling.py | 7 +- tests/st/profiler/test_ascend_profiler.py | 2 + tests/st/profiler/test_cpu_profiler.py | 70 +++++++++++++++++++ tests/st/profiler/test_profiler.py | 55 ++++++++++----- 19 files changed, 218 insertions(+), 155 deletions(-) create mode 100644 tests/st/profiler/test_cpu_profiler.py diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.cc index fa349509652..e4c12521ab7 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.cc @@ -25,6 +25,13 @@ namespace mindspore { namespace kernel { +namespace { +constexpr size_t kSizeFloat32 = sizeof(float); +constexpr size_t kScalarIndex = 0; +constexpr size_t kAdamWeightDecayInputsNum = 9; +constexpr size_t kAdamWeightDecayOutputsNum = 3; +} // namespace + template void AdamWeightDecayCPUKernel::LaunchAdamWeightDecay(const std::vector &inputs, const std::vector &) { @@ -83,26 +90,15 @@ void AdamWeightDecayCPUKernel::LaunchAdamWeightDecayNnacl(const std::vector &inputs, const std::vector &, const std::vector &outputs) { - if (inputs.size() != kAdamWeightDecayInputNum) { - MS_LOG(EXCEPTION) << "Input number is " << inputs.size() << ", but AdamWeightDecay needs 9 inputs."; - } - if (outputs.size() != kAdamWeightDecayOutputNum) { - MS_LOG(EXCEPTION) << "Output number is " << outputs.size() << ", but AdamWeightDecay needs 3 outputs."; - } + CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAdamWeightDecayInputsNum, kernel_name_); + CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAdamWeightDecayOutputsNum, kernel_name_); if (inputs[VAR]->size != inputs[M]->size || inputs[VAR]->size != inputs[V]->size || inputs[VAR]->size != inputs[GRAD]->size) { MS_LOG(EXCEPTION) << "Var, m, v, grad input data size must be same!"; diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.h index 119b92d988c..7fa7b7da8f2 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/adam_weight_decay_cpu_kernel.h @@ -25,11 +25,6 @@ namespace mindspore { namespace kernel { -constexpr size_t kSizeFloat32 = sizeof(float); -constexpr size_t kScalarIndex = 0; -constexpr size_t kAdamWeightDecayInputNum = 9; -constexpr size_t kAdamWeightDecayOutputNum = 3; - class AdamWeightDecayCPUKernel : public CPUKernel { public: AdamWeightDecayCPUKernel() = default; diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.cc index 7bc65cc0b2e..23da6333d1d 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.cc @@ -16,16 +16,17 @@ #include "backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h" -#include - -#include "runtime/device/cpu/cpu_device_address.h" - namespace mindspore { namespace kernel { +namespace { +constexpr size_t kDepthToSpaceInputsNum = 1; +constexpr size_t kDepthToSpaceOutputsNum = 1; +} // namespace + template void DepthToSpaceCPUKernel::InitKernel(const CNodePtr &kernel_node) { MS_EXCEPTION_IF_NULL(kernel_node); - CheckParam(kernel_node); + kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0); block_size_ = LongToSize(AnfAlgo::GetNodeAttr(kernel_node, "block_size")); @@ -35,6 +36,8 @@ template bool DepthToSpaceCPUKernel::Launch(const std::vector &inputs, const std::vector & /* workspace */, const std::vector &outputs) { + CHECK_KERNEL_INPUTS_NUM(inputs.size(), kDepthToSpaceInputsNum, kernel_name_); + CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kDepthToSpaceOutputsNum, kernel_name_); auto input_addr = reinterpret_cast(inputs[0]->addr); auto output_addr = reinterpret_cast(outputs[0]->addr); size_t size = inputs[0]->size / sizeof(T); @@ -73,17 +76,5 @@ bool DepthToSpaceCPUKernel::Launch(const std::vector &inp CPUKernelUtils::ParallelFor(task, size); return true; } - -template -void DepthToSpaceCPUKernel::CheckParam(const CNodePtr &kernel_node) { - size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); - if (input_num != 1) { - MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input."; - } - size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); - if (output_num != 1) { - MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output."; - } -} } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h index 6e1f715daad..dbe1024d156 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h @@ -17,8 +17,6 @@ #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_ -#include -#include #include #include "backend/kernel_compiler/cpu/cpu_kernel.h" @@ -37,7 +35,6 @@ class DepthToSpaceCPUKernel : public CPUKernel { const std::vector &outputs) override; private: - void CheckParam(const CNodePtr &kernel_node); std::vector input_shape_; std::vector output_shape_; size_t block_size_{0}; diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/iou_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/iou_cpu_kernel.cc index a6b85e1435e..cf5874b12e7 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/iou_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/iou_cpu_kernel.cc @@ -13,29 +13,35 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include + #include "backend/kernel_compiler/cpu/iou_cpu_kernel.h" -#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h" -#include "runtime/device/cpu/cpu_device_address.h" -#include "utils/ms_utils.h" + +#include +#include namespace mindspore { namespace kernel { +namespace { +constexpr size_t kIOUInputsNum = 2; +constexpr size_t kIOUOutputsNum = 1; +constexpr size_t kBoxCoordinateLen = 4; +} // namespace + template void IOUCPUKernel::InitKernel(const CNodePtr &kernel_node) { MS_EXCEPTION_IF_NULL(kernel_node); + kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); auto anchor_boxes_shape = AnfAlgo::GetInputDeviceShape(kernel_node, ANCHOR_BOXES); constexpr size_t BOX_SHAPE_SIZE = 2; constexpr size_t BOX_SIZE_INDEX = 0; constexpr size_t BOX_COORDINATE_INDEX = 1; - if (anchor_boxes_shape.size() != BOX_SHAPE_SIZE || anchor_boxes_shape[BOX_COORDINATE_INDEX] != BOX_COORDINATE_LEN) { + if (anchor_boxes_shape.size() != BOX_SHAPE_SIZE || anchor_boxes_shape[BOX_COORDINATE_INDEX] != kBoxCoordinateLen) { MS_LOG(EXCEPTION) << "The anchor_boxes shape should be [N, 4]."; } anchor_boxes_size_ = anchor_boxes_shape[BOX_SIZE_INDEX]; auto gt_boxes_shape = AnfAlgo::GetInputDeviceShape(kernel_node, GT_BOXES); - if (gt_boxes_shape.size() != BOX_SHAPE_SIZE || gt_boxes_shape[BOX_COORDINATE_INDEX] != BOX_COORDINATE_LEN) { + if (gt_boxes_shape.size() != BOX_SHAPE_SIZE || gt_boxes_shape[BOX_COORDINATE_INDEX] != kBoxCoordinateLen) { MS_LOG(EXCEPTION) << "The gt_boxes shape should be [N, 4]."; } gt_boxes_size_ = gt_boxes_shape[BOX_SIZE_INDEX]; @@ -52,12 +58,8 @@ void IOUCPUKernel::InitKernel(const CNodePtr &kernel_node) { template bool IOUCPUKernel::Launch(const std::vector &inputs, const std::vector &, const std::vector &outputs) { - if (inputs.size() != INPUT_NUMS) { - MS_LOG(EXCEPTION) << "Input number is " << inputs.size() << ", but IOU needs " << INPUT_NUMS << " inputs."; - } - if (outputs.size() != OUTPUT_NUMS) { - MS_LOG(EXCEPTION) << "Output number is " << outputs.size() << ", but IOU needs " << OUTPUT_NUMS << " outputs."; - } + CHECK_KERNEL_INPUTS_NUM(inputs.size(), kIOUInputsNum, kernel_name_); + CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kIOUOutputsNum, kernel_name_); auto anchor_boxes = reinterpret_cast(inputs[ANCHOR_BOXES]->addr); auto gt_boxes = reinterpret_cast(inputs[GT_BOXES]->addr); auto iou_score = reinterpret_cast(outputs[IOU_VALUE]->addr); @@ -71,8 +73,8 @@ bool IOUCPUKernel::Launch(const std::vector &inputs, cons constexpr size_t X1_SHIFT = 2; constexpr size_t Y1_SHIFT = 3; for (size_t i = start; i < end; i++) { - size_t idx1 = i % anchor_boxes_size_ * BOX_COORDINATE_LEN; - size_t idx2 = i / anchor_boxes_size_ * BOX_COORDINATE_LEN; + size_t idx1 = i % anchor_boxes_size_ * kBoxCoordinateLen; + size_t idx2 = i / anchor_boxes_size_ * kBoxCoordinateLen; T I_x0 = std::max(anchor_boxes[idx1], gt_boxes[idx2]); T I_y0 = std::max(anchor_boxes[idx1 + Y0_SHIFT], gt_boxes[idx2 + Y0_SHIFT]); T I_x1 = std::min(anchor_boxes[idx1 + X1_SHIFT], gt_boxes[idx2 + X1_SHIFT]); diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/iou_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/iou_cpu_kernel.h index 9e6c0818eaf..2964a636ed0 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/iou_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/iou_cpu_kernel.h @@ -17,16 +17,12 @@ #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_IOU_CPU_KERNEL_H_ #include -#include + #include "backend/kernel_compiler/cpu/cpu_kernel.h" #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" namespace mindspore { namespace kernel { -constexpr size_t INPUT_NUMS = 2; -constexpr size_t OUTPUT_NUMS = 1; -constexpr size_t BOX_COORDINATE_LEN = 4; - template class IOUCPUKernel : public CPUKernel { public: diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/l2_normalize_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/l2_normalize_cpu_kernel.cc index d5da0d7d5b3..c203093b52e 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/l2_normalize_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/l2_normalize_cpu_kernel.cc @@ -15,18 +15,33 @@ */ #include "backend/kernel_compiler/cpu/l2_normalize_cpu_kernel.h" -#include "runtime/device/cpu/cpu_device_address.h" + +#include +#include namespace mindspore { namespace kernel { +namespace { +constexpr size_t kL2NormalizeInputsNum = 1; +constexpr size_t kL2NormalizeOutputsNum = 1; +} // namespace + template void L2NormalizeCPUKernel::InitKernel(const CNodePtr &kernel_node) { MS_EXCEPTION_IF_NULL(kernel_node); - epsilon_ = static_cast(AnfAlgo::GetNodeAttr(kernel_node, "epsilon")); - axis_ = LongToInt(AnfAlgo::GetNodeAttr(kernel_node, "axis")); + kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); + epsilon_ = static_cast(AnfAlgo::GetNodeAttr(kernel_node, EPSILON)); + axis_ = LongToInt(AnfAlgo::GetNodeAttr(kernel_node, AXIS)); input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0); - CheckParam(kernel_node); + + int dims = SizeToInt(input_shape_.size()); + if (axis_ < -dims || axis_ >= dims) { + MS_LOG(EXCEPTION) << "Attr axis_ " << axis_ << " must be in " << -dims << "~" << dims; + } + if (epsilon_ == (T)0.0) { + MS_LOG(EXCEPTION) << "Attr epsilon can not be zero."; + } if (axis_ < 0) { axis_ += SizeToInt(input_shape_.size()); } @@ -112,6 +127,8 @@ template bool L2NormalizeCPUKernel::Launch(const std::vector &inputs, const std::vector & /* workspace */, const std::vector &outputs) { + CHECK_KERNEL_INPUTS_NUM(inputs.size(), kL2NormalizeInputsNum, kernel_name_); + CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kL2NormalizeOutputsNum, kernel_name_); auto input_addr = reinterpret_cast(inputs[0]->addr); auto output_addr = reinterpret_cast(outputs[0]->addr); @@ -131,24 +148,5 @@ bool L2NormalizeCPUKernel::Launch(const std::vector &inpu return true; } - -template -void L2NormalizeCPUKernel::CheckParam(const CNodePtr &kernel_node) { - size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); - int dims = SizeToInt(input_shape_.size()); - if (input_num != 1) { - MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but L2NormalizeCPUKernel needs 1 input."; - } - size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); - if (output_num != 1) { - MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but L2NormalizeCPUKernel needs 1 output."; - } - if (axis_ < -dims || axis_ >= dims) { - MS_LOG(EXCEPTION) << "Attr axis_ " << axis_ << " must be in " << -dims << "~" << dims; - } - if (epsilon_ == (T)0.0) { - MS_LOG(EXCEPTION) << "Attr epsilon can not be zero."; - } -} } // namespace kernel } // namespace mindspore diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/l2_normalize_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/l2_normalize_cpu_kernel.h index 226a6155156..d7f43dc7f37 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/l2_normalize_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/l2_normalize_cpu_kernel.h @@ -16,10 +16,10 @@ #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2_NORMALIZE_CPU_KERNEL_H_ #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2_NORMALIZE_CPU_KERNEL_H_ -#include + #include -#include -#include +#include + #include "backend/kernel_compiler/cpu/cpu_kernel.h" #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/l2normalize_grad_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/l2normalize_grad_cpu_kernel.cc index 4447287dd7a..b35ad9eb527 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/l2normalize_grad_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/l2normalize_grad_cpu_kernel.cc @@ -15,15 +15,19 @@ */ #include "backend/kernel_compiler/cpu/l2normalize_grad_cpu_kernel.h" -#include "runtime/device/cpu/cpu_device_address.h" namespace mindspore { namespace kernel { +namespace { +constexpr size_t kL2NormalizeGradInputsNum = 3; +constexpr size_t kL2NormalizeGradOutputsNum = 1; +} // namespace + template void L2NormalizeGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { MS_EXCEPTION_IF_NULL(kernel_node); - CheckIONumber(kernel_node); - for (size_t i = 0; i < INPUT_SIZE; i++) { + kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); + for (size_t i = 0; i < kL2NormalizeGradInputsNum; i++) { (void)input_shape_list_.emplace_back(AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i)); } auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0); @@ -45,6 +49,8 @@ template bool L2NormalizeGradCPUKernel::Launch(const std::vector &inputs, const std::vector &workspace, const std::vector &outputs) { + CHECK_KERNEL_INPUTS_NUM(inputs.size(), kL2NormalizeGradInputsNum, kernel_name_); + CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kL2NormalizeGradOutputsNum, kernel_name_); auto input_x = reinterpret_cast(inputs[0]->addr); auto y = reinterpret_cast(inputs[1]->addr); auto dout = reinterpret_cast(inputs[2]->addr); @@ -78,18 +84,6 @@ void L2NormalizeGradCPUKernel::CheckInputShape(const std::vector &out } } -template -void L2NormalizeGradCPUKernel::CheckIONumber(const CNodePtr &kernel_node) { - size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); - if (input_num != INPUT_SIZE) { - MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but L2NormalizeGradCPUKernel needs 3 input."; - } - size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); - if (output_num != OUTPUT_SIZE) { - MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but L2NormalizeGradCPUKernel needs 1 output."; - } -} - template std::vector L2NormalizeGradCPUKernel::OneDimIndexToHighDimIndex(size_t one_dim_index) { std::vector high_dim_index; diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/l2normalize_grad_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/l2normalize_grad_cpu_kernel.h index e1cfe3cbb46..57628d1c0a8 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/l2normalize_grad_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/l2normalize_grad_cpu_kernel.h @@ -13,17 +13,17 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2NORMALIZE_GRAD_CPU_KERNEL_H_ #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2NORMALIZE_GRAD_CPU_KERNEL_H_ #include + #include "backend/kernel_compiler/cpu/cpu_kernel.h" #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" namespace mindspore { namespace kernel { -constexpr size_t INPUT_SIZE = 3; -constexpr size_t OUTPUT_SIZE = 1; template class L2NormalizeGradCPUKernel : public CPUKernel { public: @@ -37,7 +37,6 @@ class L2NormalizeGradCPUKernel : public CPUKernel { private: void CheckInputShape(const std::vector &output_shape); - void CheckIONumber(const CNodePtr &kernel_node); std::vector OneDimIndexToHighDimIndex(size_t one_dim_index); void HighDimIndexToOneDimIndex(size_t *one_dim_index, const std::vector &high_dim_index); std::vector GetVector(const std::vector &high_dim_index, const T *x); diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_cpu_kernel.cc index 684cce87618..6f92f967dd0 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_cpu_kernel.cc @@ -15,22 +15,18 @@ */ #include "backend/kernel_compiler/cpu/masked_select_cpu_kernel.h" -#include "runtime/device/cpu/cpu_device_address.h" namespace mindspore { namespace kernel { +namespace { +constexpr size_t kMaskedSelectInputsNum = 2; +constexpr size_t kMaskedSelectOutputsNum = 1; +} // namespace + template void MaskedSelectCPUKernel::InitKernel(const CNodePtr &kernel_node) { - size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); - if (input_num != kInputNum) { - MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but MaskedSelectCPUKernel needs " << kInputNum - << " input."; - } - size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); - if (output_num != kOutputNum) { - MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but MaskedSelectCPUKernel needs " << kOutputNum - << " output."; - } + MS_EXCEPTION_IF_NULL(kernel_node); + kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); input_shape_a_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0); input_shape_b_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1); output_shape_ = CPUKernelUtils::GetBroadcastShape(input_shape_a_, input_shape_b_); @@ -44,6 +40,8 @@ template bool MaskedSelectCPUKernel::Launch(const std::vector &inputs, const std::vector &, const std::vector &outputs) { + CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMaskedSelectInputsNum, kernel_name_); + CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMaskedSelectOutputsNum, kernel_name_); auto x = reinterpret_cast(inputs[0]->addr); auto mask = reinterpret_cast(inputs[1]->addr); auto y = reinterpret_cast(outputs[0]->addr); diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_cpu_kernel.h index a4265aed2db..9f23647413c 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_cpu_kernel.h @@ -16,16 +16,14 @@ #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_CPU_KERNEL_H_ #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_CPU_KERNEL_H_ -#include -#include + #include + #include "backend/kernel_compiler/cpu/cpu_kernel.h" #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" namespace mindspore { namespace kernel { -constexpr size_t kInputNum = 2; -constexpr size_t kOutputNum = 1; template class MaskedSelectCPUKernel : public CPUKernel { public: diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.cc index f70a8951cab..0fc87d90cd0 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.cc @@ -15,22 +15,18 @@ */ #include "backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.h" -#include "runtime/device/cpu/cpu_device_address.h" namespace mindspore { namespace kernel { +namespace { +constexpr size_t kMaskedSelectGradInputsNum = 3; +constexpr size_t kMaskedSelectGradOutputsNum = 1; +} // namespace + template void MaskedSelectGradCPUKernel::InitKernel(const CNodePtr &kernel_node) { - size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node); - if (input_num != kInputNum) { - MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but MaskedSelectGradCPUKernel needs " << kInputNum - << " input."; - } - size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node); - if (output_num != kOutputNum) { - MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but MaskedSelectGradCPUKernel needs " << kOutputNum - << " output."; - } + MS_EXCEPTION_IF_NULL(kernel_node); + kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); input_shape_a_ = AnfAlgo::GetInputDeviceShape(kernel_node, INPUT); input_shape_b_ = AnfAlgo::GetInputDeviceShape(kernel_node, MASK); grad_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, GRAD); @@ -44,6 +40,8 @@ template bool MaskedSelectGradCPUKernel::Launch(const std::vector &inputs, const std::vector &, const std::vector &outputs) { + CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMaskedSelectGradInputsNum, kernel_name_); + CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMaskedSelectGradOutputsNum, kernel_name_); auto mask = reinterpret_cast(inputs[MASK]->addr); auto grad = reinterpret_cast(inputs[GRAD]->addr); auto dx = reinterpret_cast(outputs[INPUT]->addr); diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.h b/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.h index 66f0be99d71..b6e34ca8ca0 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.h +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.h @@ -16,16 +16,14 @@ #ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_GRAD_CPU_KERNEL_H_ #define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_GRAD_CPU_KERNEL_H_ -#include -#include + #include + #include "backend/kernel_compiler/cpu/cpu_kernel.h" #include "backend/kernel_compiler/cpu/cpu_kernel_factory.h" namespace mindspore { namespace kernel { -constexpr size_t kInputNum = 3; -constexpr size_t kOutputNum = 1; template class MaskedSelectGradCPUKernel : public CPUKernel { public: diff --git a/mindspore/ccsrc/backend/kernel_compiler/cpu/pad_and_shift_cpu_kernel.cc b/mindspore/ccsrc/backend/kernel_compiler/cpu/pad_and_shift_cpu_kernel.cc index 6af544a0d11..5e87ef84c53 100644 --- a/mindspore/ccsrc/backend/kernel_compiler/cpu/pad_and_shift_cpu_kernel.cc +++ b/mindspore/ccsrc/backend/kernel_compiler/cpu/pad_and_shift_cpu_kernel.cc @@ -15,13 +15,17 @@ */ #include "backend/kernel_compiler/cpu/pad_and_shift_cpu_kernel.h" -#include -#include "runtime/device/cpu/cpu_device_address.h" namespace mindspore { namespace kernel { +namespace { +constexpr size_t kPadAndShiftInputsNum = 3; +constexpr size_t kPadAndShiftOutputsNum = 1; +} // namespace + void PadAndShiftCPUKernel::InitKernel(const CNodePtr &kernel_node) { MS_EXCEPTION_IF_NULL(kernel_node); + kernel_name_ = AnfAlgo::GetCNodeName(kernel_node); node_wpt_ = kernel_node; input_x_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0); type_size_ = GetTypeByte(TypeIdToType(input_x_dtype_)); @@ -41,13 +45,14 @@ void PadAndShiftCPUKernel::InitKernel(const CNodePtr &kernel_node) { bool PadAndShiftCPUKernel::Launch(const std::vector &inputs, const std::vector &, const std::vector &outputs) { + CHECK_KERNEL_INPUTS_NUM(inputs.size(), kPadAndShiftInputsNum, kernel_name_); + CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kPadAndShiftOutputsNum, kernel_name_); if (input_x_dtype_ == kNumberTypeInt32) { LaunchKernel(inputs, outputs); } else if (input_x_dtype_ == kNumberTypeInt64) { LaunchKernel(inputs, outputs); } else { - MS_LOG(ERROR) << "Dtype of input_x only support int32, int64"; - return false; + MS_LOG(EXCEPTION) << "Dtype of input_x only support int32, int64"; } return true; } diff --git a/mindspore/profiler/profiling.py b/mindspore/profiler/profiling.py index 44e19e3c7af..20cededa33c 100644 --- a/mindspore/profiler/profiling.py +++ b/mindspore/profiler/profiling.py @@ -122,7 +122,10 @@ class Profiler: def __init__(self, **kwargs): if c_expression.security.enable_security(): - raise Runtime("Profiler is not supported if compiled with \'-s on\'") + raise RuntimeError("Profiler is not supported if compiled with \'-s on\'") + + if context.get_context("mode") == context.PYNATIVE_MODE: + raise RuntimeError("Profiler is not supported in PyNative mode") # get device_id and device_target self._get_devid_rankid_and_devtarget() @@ -643,7 +646,7 @@ class Profiler: dev_id = "0" logger.warning("Fail to get DEVICE_ID, use 0 instead.") - if device_target and device_target not in ["Ascend", "GPU"]: + if device_target and device_target not in ["Ascend", "GPU", "CPU"]: msg = "Profiling: unsupported backend: %s" % device_target raise RuntimeError(msg) diff --git a/tests/st/profiler/test_ascend_profiler.py b/tests/st/profiler/test_ascend_profiler.py index e711f0f9fa8..a2d255e243a 100644 --- a/tests/st/profiler/test_ascend_profiler.py +++ b/tests/st/profiler/test_ascend_profiler.py @@ -23,6 +23,7 @@ from mindspore import Tensor from mindspore.ops import operations as P from mindspore.profiler import Profiler + class Net(nn.Cell): def __init__(self): super(Net, self).__init__() @@ -35,6 +36,7 @@ class Net(nn.Cell): x = np.random.randn(1, 3, 3, 4).astype(np.float32) y = np.random.randn(1, 3, 3, 4).astype(np.float32) + @pytest.mark.level0 @pytest.mark.platform_arm_ascend_training @pytest.mark.platform_x86_ascend_training diff --git a/tests/st/profiler/test_cpu_profiler.py b/tests/st/profiler/test_cpu_profiler.py new file mode 100644 index 00000000000..44837bc9fc7 --- /dev/null +++ b/tests/st/profiler/test_cpu_profiler.py @@ -0,0 +1,70 @@ +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""test cpu profiler""" +import os +import shutil +import sys + +import numpy as np +import pytest + +import mindspore.context as context +import mindspore.nn as nn +from mindspore import Tensor +from mindspore.ops import operations as P +from mindspore.profiler import Profiler + + +class Net(nn.Cell): + def __init__(self): + super(Net, self).__init__() + self.add = P.Add() + + def construct(self, x_, y_): + return self.add(x_, y_) + + +@pytest.mark.level0 +@pytest.mark.platform_x86_cpu +@pytest.mark.env_onecard +@pytest.mark.security_off +def test_cpu_profiling(): + if sys.platform != 'linux': + return + data_path = os.path.join(os.getcwd(), 'data_cpu_profiler') + if os.path.isdir(data_path): + shutil.rmtree(data_path) + context.set_context(mode=context.GRAPH_MODE, device_target="CPU") + device_id = context.get_context("device_id") + profiler = Profiler(output_path="data_cpu_profiler") + x = np.random.randn(1, 3, 3, 4).astype(np.float32) + y = np.random.randn(1, 3, 3, 4).astype(np.float32) + add = Net() + add(Tensor(x), Tensor(y)) + profiler.analyse() + + assert os.path.isdir(data_path) + assert len(os.listdir(data_path)) == 1 + + profiler_dir = os.path.join(data_path, f"{os.listdir(data_path)[0]}/") + op_detail_file = f"{profiler_dir}cpu_op_detail_info_{device_id}.csv" + op_type_file = f"{profiler_dir}cpu_op_type_info_{device_id}.csv" + timeline_file = f"{profiler_dir}cpu_op_execute_timestamp_{device_id}.txt" + cpu_profiler_files = (op_detail_file, op_type_file, timeline_file) + for file in cpu_profiler_files: + assert os.path.isfile(file) + + if os.path.isdir(data_path): + shutil.rmtree(data_path) diff --git a/tests/st/profiler/test_profiler.py b/tests/st/profiler/test_profiler.py index 3e0cbac0933..46ba803b694 100644 --- a/tests/st/profiler/test_profiler.py +++ b/tests/st/profiler/test_profiler.py @@ -15,6 +15,8 @@ import os import shutil +import sys + from tests.security_utils import security_off_wrap import pytest @@ -53,6 +55,7 @@ def weight_variable(): class LeNet5(nn.Cell): """Define LeNet5 network.""" + def __init__(self, num_class=10, channel=1): super(LeNet5, self).__init__() self.num_class = num_class @@ -86,7 +89,7 @@ class LeNet5(nn.Cell): def create_dataset(data_path, batch_size=32, repeat_size=1, num_parallel_workers=1): """create dataset for train""" # define dataset - mnist_ds = ds.MnistDataset(data_path, num_samples=batch_size*100) + mnist_ds = ds.MnistDataset(data_path, num_samples=batch_size * 100) resize_height, resize_width = 32, 32 rescale = 1.0 / 255.0 @@ -131,10 +134,26 @@ class TestProfiler: rank_id = int(os.getenv('RANK_ID')) if os.getenv('RANK_ID') else 0 mnist_path = '/home/workspace/mindspore_dataset/mnist' - def teardown(self): - """ Run after each use case.""" + @classmethod + def setup_class(cls): + """Run begin all test case start.""" cleanup() + @staticmethod + def teardown(): + """Run after each test case end.""" + cleanup() + + @pytest.mark.level2 + @pytest.mark.platform_x86_cpu + @pytest.mark.env_onecard + @security_off_wrap + def test_cpu_profiler(self): + if sys.platform != 'linux': + return + self._train_with_profiler(device_target="CPU") + self._check_cpu_profiling_file() + @pytest.mark.level1 @pytest.mark.platform_x86_gpu_training @pytest.mark.env_onecard @@ -177,12 +196,10 @@ class TestProfiler: getnext_file = self.profiler_path + f'minddata_getnext_profiling_{self.device_id}.txt' pipeline_file = self.profiler_path + f'minddata_pipeline_raw_{self.device_id}.csv' - assert os.path.exists(op_detail_file) - assert os.path.exists(op_type_file) - assert os.path.exists(activity_file) - assert os.path.exists(timeline_file) - assert os.path.exists(getnext_file) - assert os.path.exists(pipeline_file) + gpu_profiler_files = (op_detail_file, op_type_file, activity_file, + timeline_file, getnext_file, pipeline_file) + for file in gpu_profiler_files: + assert os.path.isfile(file) def _check_d_profiling_file(self): aicore_file = self.profiler_path + f'aicore_intermediate_{self.rank_id}_detail.csv' @@ -193,10 +210,16 @@ class TestProfiler: queue_profiling_file = self.profiler_path + f'device_queue_profiling_{self.rank_id}.txt' memory_file = self.profiler_path + f'memory_usage_{self.rank_id}.pb' - assert os.path.exists(aicore_file) - assert os.path.exists(step_trace_file) - assert os.path.exists(timeline_file) - assert os.path.exists(queue_profiling_file) - assert os.path.exists(minddata_pipeline_file) - assert os.path.exists(aicpu_file) - assert os.path.exists(memory_file) + d_profiler_files = (aicore_file, step_trace_file, timeline_file, aicpu_file, + minddata_pipeline_file, queue_profiling_file, memory_file) + for file in d_profiler_files: + assert os.path.isfile(file) + + def _check_cpu_profiling_file(self): + op_detail_file = self.profiler_path + f'cpu_op_detail_info_{self.device_id}.csv' + op_type_file = self.profiler_path + f'cpu_op_type_info_{self.device_id}.csv' + timeline_file = self.profiler_path + f'cpu_op_execute_timestamp_{self.device_id}.txt' + + cpu_profiler_files = (op_detail_file, op_type_file, timeline_file) + for file in cpu_profiler_files: + assert os.path.isfile(file)