support cpu profiling and code check for cpu ops

This commit is contained in:
buxue 2021-09-22 16:35:21 +08:00
parent 8c748cd7d2
commit 5418a45752
19 changed files with 218 additions and 155 deletions

View File

@ -25,6 +25,13 @@
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kSizeFloat32 = sizeof(float);
constexpr size_t kScalarIndex = 0;
constexpr size_t kAdamWeightDecayInputsNum = 9;
constexpr size_t kAdamWeightDecayOutputsNum = 3;
} // namespace
template <typename T>
void AdamWeightDecayCPUKernel::LaunchAdamWeightDecay(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &) {
@ -83,26 +90,15 @@ void AdamWeightDecayCPUKernel::LaunchAdamWeightDecayNnacl(const std::vector<Addr
// Records the node name and the device dtype of input 0, and verifies that the
// node carries the expected number of input and output tensors. A count
// mismatch is reported through MS_LOG(EXCEPTION).
void AdamWeightDecayCPUKernel::InitKernel(const CNodePtr &kernel_node) {
  MS_EXCEPTION_IF_NULL(kernel_node);
  const size_t actual_input_count = AnfAlgo::GetInputTensorNum(kernel_node);
  kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
  dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);

  const bool input_count_ok = (actual_input_count == kAdamWeightDecayInputNum);
  if (!input_count_ok) {
    MS_LOG(EXCEPTION) << "Input number is " << actual_input_count << ", but AdamWeightDecay needs 9 inputs.";
  }

  const size_t actual_output_count = AnfAlgo::GetOutputTensorNum(kernel_node);
  const bool output_count_ok = (actual_output_count == kAdamWeightDecayOutputNum);
  if (!output_count_ok) {
    MS_LOG(EXCEPTION) << "Output number is " << actual_output_count << ", but AdamWeightDecay needs 3 outputs.";
  }
}
bool AdamWeightDecayCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.size() != kAdamWeightDecayInputNum) {
MS_LOG(EXCEPTION) << "Input number is " << inputs.size() << ", but AdamWeightDecay needs 9 inputs.";
}
if (outputs.size() != kAdamWeightDecayOutputNum) {
MS_LOG(EXCEPTION) << "Output number is " << outputs.size() << ", but AdamWeightDecay needs 3 outputs.";
}
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAdamWeightDecayInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAdamWeightDecayOutputsNum, kernel_name_);
if (inputs[VAR]->size != inputs[M]->size || inputs[VAR]->size != inputs[V]->size ||
inputs[VAR]->size != inputs[GRAD]->size) {
MS_LOG(EXCEPTION) << "Var, m, v, grad input data size must be same!";

View File

@ -25,11 +25,6 @@
namespace mindspore {
namespace kernel {
constexpr size_t kSizeFloat32 = sizeof(float);
constexpr size_t kScalarIndex = 0;
constexpr size_t kAdamWeightDecayInputNum = 9;
constexpr size_t kAdamWeightDecayOutputNum = 3;
class AdamWeightDecayCPUKernel : public CPUKernel {
public:
AdamWeightDecayCPUKernel() = default;

View File

@ -16,16 +16,17 @@
#include "backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h"
#include <vector>
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kDepthToSpaceInputsNum = 1;
constexpr size_t kDepthToSpaceOutputsNum = 1;
} // namespace
template <typename T>
void DepthToSpaceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
CheckParam(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
block_size_ = LongToSize(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size"));
@ -35,6 +36,8 @@ template <typename T>
bool DepthToSpaceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> & /* workspace */,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kDepthToSpaceInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kDepthToSpaceOutputsNum, kernel_name_);
auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
size_t size = inputs[0]->size / sizeof(T);
@ -73,17 +76,5 @@ bool DepthToSpaceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inp
CPUKernelUtils::ParallelFor(task, size);
return true;
}
// Verifies that the node has exactly one input tensor and one output tensor.
// Any other count is reported through MS_LOG(EXCEPTION), with the offending
// count and the kernel name in the message.
template <typename T>
void DepthToSpaceCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
  const size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
  if (input_num != 1) {
    MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKernel needs 1 input.";
  }
  const size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (output_num != 1) {
    MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output.";
  }
}
} // namespace kernel
} // namespace mindspore

View File

@ -17,8 +17,6 @@
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_
#include <memory>
#include <string>
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
@ -37,7 +35,6 @@ class DepthToSpaceCPUKernel : public CPUKernel {
const std::vector<AddressPtr> &outputs) override;
private:
void CheckParam(const CNodePtr &kernel_node);
std::vector<size_t> input_shape_;
std::vector<size_t> output_shape_;
size_t block_size_{0};

View File

@ -13,29 +13,35 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <algorithm>
#include <string>
#include "backend/kernel_compiler/cpu/iou_cpu_kernel.h"
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include "utils/ms_utils.h"
#include <string>
#include <algorithm>
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kIOUInputsNum = 2;
constexpr size_t kIOUOutputsNum = 1;
constexpr size_t kBoxCoordinateLen = 4;
} // namespace
template <typename T>
void IOUCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
auto anchor_boxes_shape = AnfAlgo::GetInputDeviceShape(kernel_node, ANCHOR_BOXES);
constexpr size_t BOX_SHAPE_SIZE = 2;
constexpr size_t BOX_SIZE_INDEX = 0;
constexpr size_t BOX_COORDINATE_INDEX = 1;
if (anchor_boxes_shape.size() != BOX_SHAPE_SIZE || anchor_boxes_shape[BOX_COORDINATE_INDEX] != BOX_COORDINATE_LEN) {
if (anchor_boxes_shape.size() != BOX_SHAPE_SIZE || anchor_boxes_shape[BOX_COORDINATE_INDEX] != kBoxCoordinateLen) {
MS_LOG(EXCEPTION) << "The anchor_boxes shape should be [N, 4].";
}
anchor_boxes_size_ = anchor_boxes_shape[BOX_SIZE_INDEX];
auto gt_boxes_shape = AnfAlgo::GetInputDeviceShape(kernel_node, GT_BOXES);
if (gt_boxes_shape.size() != BOX_SHAPE_SIZE || gt_boxes_shape[BOX_COORDINATE_INDEX] != BOX_COORDINATE_LEN) {
if (gt_boxes_shape.size() != BOX_SHAPE_SIZE || gt_boxes_shape[BOX_COORDINATE_INDEX] != kBoxCoordinateLen) {
MS_LOG(EXCEPTION) << "The gt_boxes shape should be [N, 4].";
}
gt_boxes_size_ = gt_boxes_shape[BOX_SIZE_INDEX];
@ -52,12 +58,8 @@ void IOUCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
template <typename T>
bool IOUCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
if (inputs.size() != INPUT_NUMS) {
MS_LOG(EXCEPTION) << "Input number is " << inputs.size() << ", but IOU needs " << INPUT_NUMS << " inputs.";
}
if (outputs.size() != OUTPUT_NUMS) {
MS_LOG(EXCEPTION) << "Output number is " << outputs.size() << ", but IOU needs " << OUTPUT_NUMS << " outputs.";
}
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kIOUInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kIOUOutputsNum, kernel_name_);
auto anchor_boxes = reinterpret_cast<T *>(inputs[ANCHOR_BOXES]->addr);
auto gt_boxes = reinterpret_cast<T *>(inputs[GT_BOXES]->addr);
auto iou_score = reinterpret_cast<T *>(outputs[IOU_VALUE]->addr);
@ -71,8 +73,8 @@ bool IOUCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, cons
constexpr size_t X1_SHIFT = 2;
constexpr size_t Y1_SHIFT = 3;
for (size_t i = start; i < end; i++) {
size_t idx1 = i % anchor_boxes_size_ * BOX_COORDINATE_LEN;
size_t idx2 = i / anchor_boxes_size_ * BOX_COORDINATE_LEN;
size_t idx1 = i % anchor_boxes_size_ * kBoxCoordinateLen;
size_t idx2 = i / anchor_boxes_size_ * kBoxCoordinateLen;
T I_x0 = std::max(anchor_boxes[idx1], gt_boxes[idx2]);
T I_y0 = std::max(anchor_boxes[idx1 + Y0_SHIFT], gt_boxes[idx2 + Y0_SHIFT]);
T I_x1 = std::min(anchor_boxes[idx1 + X1_SHIFT], gt_boxes[idx2 + X1_SHIFT]);

View File

@ -17,16 +17,12 @@
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_IOU_CPU_KERNEL_H_
#include <vector>
#include <memory>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
constexpr size_t INPUT_NUMS = 2;
constexpr size_t OUTPUT_NUMS = 1;
constexpr size_t BOX_COORDINATE_LEN = 4;
template <typename T>
class IOUCPUKernel : public CPUKernel {
public:

View File

@ -15,18 +15,33 @@
*/
#include "backend/kernel_compiler/cpu/l2_normalize_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
#include <utility>
#include <limits>
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kL2NormalizeInputsNum = 1;
constexpr size_t kL2NormalizeOutputsNum = 1;
} // namespace
template <typename T>
void L2NormalizeCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
epsilon_ = static_cast<T>(AnfAlgo::GetNodeAttr<float>(kernel_node, "epsilon"));
axis_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "axis"));
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
epsilon_ = static_cast<T>(AnfAlgo::GetNodeAttr<float>(kernel_node, EPSILON));
axis_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS));
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);
CheckParam(kernel_node);
int dims = SizeToInt(input_shape_.size());
if (axis_ < -dims || axis_ >= dims) {
MS_LOG(EXCEPTION) << "Attr axis_ " << axis_ << " must be in " << -dims << "~" << dims;
}
if (epsilon_ == (T)0.0) {
MS_LOG(EXCEPTION) << "Attr epsilon can not be zero.";
}
if (axis_ < 0) {
axis_ += SizeToInt(input_shape_.size());
}
@ -112,6 +127,8 @@ template <typename T>
bool L2NormalizeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> & /* workspace */,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kL2NormalizeInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kL2NormalizeOutputsNum, kernel_name_);
auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
@ -131,24 +148,5 @@ bool L2NormalizeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inpu
return true;
}
// Validates the node's tensor counts and the already-parsed attributes:
//  - exactly one input and one output tensor,
//  - axis_ within [-dims, dims), where dims is the rank of input_shape_,
//  - epsilon_ different from zero.
// Each violation is reported through MS_LOG(EXCEPTION).
template <typename T>
void L2NormalizeCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
  const size_t actual_inputs = AnfAlgo::GetInputTensorNum(kernel_node);
  const int dims = SizeToInt(input_shape_.size());
  if (actual_inputs != 1) {
    MS_LOG(EXCEPTION) << "Input number is " << actual_inputs << ", but L2NormalizeCPUKernel needs 1 input.";
  }

  const size_t actual_outputs = AnfAlgo::GetOutputTensorNum(kernel_node);
  if (actual_outputs != 1) {
    MS_LOG(EXCEPTION) << "Output number is " << actual_outputs << ", but L2NormalizeCPUKernel needs 1 output.";
  }

  const bool axis_in_range = (axis_ >= -dims) && (axis_ < dims);
  if (!axis_in_range) {
    MS_LOG(EXCEPTION) << "Attr axis_ " << axis_ << " must be in " << -dims << "~" << dims;
  }

  if (epsilon_ == static_cast<T>(0.0)) {
    MS_LOG(EXCEPTION) << "Attr epsilon can not be zero.";
  }
}
} // namespace kernel
} // namespace mindspore

View File

@ -16,10 +16,10 @@
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2_NORMALIZE_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2_NORMALIZE_CPU_KERNEL_H_
#include <memory>
#include <vector>
#include <limits>
#include <utility>
#include <memory>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"

View File

@ -15,15 +15,19 @@
*/
#include "backend/kernel_compiler/cpu/l2normalize_grad_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kL2NormalizeGradInputsNum = 3;
constexpr size_t kL2NormalizeGradOutputsNum = 1;
} // namespace
template <typename T>
void L2NormalizeGradCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
CheckIONumber(kernel_node);
for (size_t i = 0; i < INPUT_SIZE; i++) {
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
for (size_t i = 0; i < kL2NormalizeGradInputsNum; i++) {
(void)input_shape_list_.emplace_back(AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i));
}
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
@ -45,6 +49,8 @@ template <typename T>
bool L2NormalizeGradCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs,
const std::vector<AddressPtr> &workspace,
const std::vector<AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kL2NormalizeGradInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kL2NormalizeGradOutputsNum, kernel_name_);
auto input_x = reinterpret_cast<T *>(inputs[0]->addr);
auto y = reinterpret_cast<T *>(inputs[1]->addr);
auto dout = reinterpret_cast<T *>(inputs[2]->addr);
@ -78,18 +84,6 @@ void L2NormalizeGradCPUKernel<T>::CheckInputShape(const std::vector<size_t> &out
}
}
// Checks that the node's input tensor count equals INPUT_SIZE and its output
// tensor count equals OUTPUT_SIZE; a mismatch is reported through
// MS_LOG(EXCEPTION).
template <typename T>
void L2NormalizeGradCPUKernel<T>::CheckIONumber(const CNodePtr &kernel_node) {
  const size_t actual_inputs = AnfAlgo::GetInputTensorNum(kernel_node);
  const bool inputs_match = (actual_inputs == INPUT_SIZE);
  if (!inputs_match) {
    MS_LOG(EXCEPTION) << "Input number is " << actual_inputs << ", but L2NormalizeGradCPUKernel needs 3 input.";
  }

  const size_t actual_outputs = AnfAlgo::GetOutputTensorNum(kernel_node);
  const bool outputs_match = (actual_outputs == OUTPUT_SIZE);
  if (!outputs_match) {
    MS_LOG(EXCEPTION) << "Output number is " << actual_outputs << ", but L2NormalizeGradCPUKernel needs 1 output.";
  }
}
template <typename T>
std::vector<size_t> L2NormalizeGradCPUKernel<T>::OneDimIndexToHighDimIndex(size_t one_dim_index) {
std::vector<size_t> high_dim_index;

View File

@ -13,17 +13,17 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2NORMALIZE_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2NORMALIZE_GRAD_CPU_KERNEL_H_
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
constexpr size_t INPUT_SIZE = 3;
constexpr size_t OUTPUT_SIZE = 1;
template <typename T>
class L2NormalizeGradCPUKernel : public CPUKernel {
public:
@ -37,7 +37,6 @@ class L2NormalizeGradCPUKernel : public CPUKernel {
private:
void CheckInputShape(const std::vector<size_t> &output_shape);
void CheckIONumber(const CNodePtr &kernel_node);
std::vector<size_t> OneDimIndexToHighDimIndex(size_t one_dim_index);
void HighDimIndexToOneDimIndex(size_t *one_dim_index, const std::vector<size_t> &high_dim_index);
std::vector<T> GetVector(const std::vector<size_t> &high_dim_index, const T *x);

View File

@ -15,22 +15,18 @@
*/
#include "backend/kernel_compiler/cpu/masked_select_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kMaskedSelectInputsNum = 2;
constexpr size_t kMaskedSelectOutputsNum = 1;
} // namespace
template <typename T>
void MaskedSelectCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != kInputNum) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but MaskedSelectCPUKernel needs " << kInputNum
<< " input.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != kOutputNum) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but MaskedSelectCPUKernel needs " << kOutputNum
<< " output.";
}
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_shape_a_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
input_shape_b_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
output_shape_ = CPUKernelUtils::GetBroadcastShape(input_shape_a_, input_shape_b_);
@ -44,6 +40,8 @@ template <typename T>
bool MaskedSelectCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMaskedSelectInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMaskedSelectOutputsNum, kernel_name_);
auto x = reinterpret_cast<T *>(inputs[0]->addr);
auto mask = reinterpret_cast<bool *>(inputs[1]->addr);
auto y = reinterpret_cast<T *>(outputs[0]->addr);

View File

@ -16,16 +16,14 @@
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_CPU_KERNEL_H_
#include <memory>
#include <unordered_map>
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
constexpr size_t kInputNum = 2;
constexpr size_t kOutputNum = 1;
template <typename T>
class MaskedSelectCPUKernel : public CPUKernel {
public:

View File

@ -15,22 +15,18 @@
*/
#include "backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.h"
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kMaskedSelectGradInputsNum = 3;
constexpr size_t kMaskedSelectGradOutputsNum = 1;
} // namespace
template <typename T>
void MaskedSelectGradCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
if (input_num != kInputNum) {
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but MaskedSelectGradCPUKernel needs " << kInputNum
<< " input.";
}
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
if (output_num != kOutputNum) {
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but MaskedSelectGradCPUKernel needs " << kOutputNum
<< " output.";
}
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
input_shape_a_ = AnfAlgo::GetInputDeviceShape(kernel_node, INPUT);
input_shape_b_ = AnfAlgo::GetInputDeviceShape(kernel_node, MASK);
grad_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, GRAD);
@ -44,6 +40,8 @@ template <typename T>
bool MaskedSelectGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMaskedSelectGradInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMaskedSelectGradOutputsNum, kernel_name_);
auto mask = reinterpret_cast<bool *>(inputs[MASK]->addr);
auto grad = reinterpret_cast<T *>(inputs[GRAD]->addr);
auto dx = reinterpret_cast<T *>(outputs[INPUT]->addr);

View File

@ -16,16 +16,14 @@
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_GRAD_CPU_KERNEL_H_
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_GRAD_CPU_KERNEL_H_
#include <memory>
#include <unordered_map>
#include <vector>
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
namespace mindspore {
namespace kernel {
constexpr size_t kInputNum = 3;
constexpr size_t kOutputNum = 1;
template <typename T>
class MaskedSelectGradCPUKernel : public CPUKernel {
public:

View File

@ -15,13 +15,17 @@
*/
#include "backend/kernel_compiler/cpu/pad_and_shift_cpu_kernel.h"
#include <string>
#include "runtime/device/cpu/cpu_device_address.h"
namespace mindspore {
namespace kernel {
namespace {
constexpr size_t kPadAndShiftInputsNum = 3;
constexpr size_t kPadAndShiftOutputsNum = 1;
} // namespace
void PadAndShiftCPUKernel::InitKernel(const CNodePtr &kernel_node) {
MS_EXCEPTION_IF_NULL(kernel_node);
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
node_wpt_ = kernel_node;
input_x_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
type_size_ = GetTypeByte(TypeIdToType(input_x_dtype_));
@ -41,13 +45,14 @@ void PadAndShiftCPUKernel::InitKernel(const CNodePtr &kernel_node) {
bool PadAndShiftCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
const std::vector<kernel::AddressPtr> &,
const std::vector<kernel::AddressPtr> &outputs) {
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kPadAndShiftInputsNum, kernel_name_);
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kPadAndShiftOutputsNum, kernel_name_);
if (input_x_dtype_ == kNumberTypeInt32) {
LaunchKernel<int>(inputs, outputs);
} else if (input_x_dtype_ == kNumberTypeInt64) {
LaunchKernel<int64_t>(inputs, outputs);
} else {
MS_LOG(ERROR) << "Dtype of input_x only support int32, int64";
return false;
MS_LOG(EXCEPTION) << "Dtype of input_x only support int32, int64";
}
return true;
}

View File

@ -122,7 +122,10 @@ class Profiler:
def __init__(self, **kwargs):
if c_expression.security.enable_security():
raise Runtime("Profiler is not supported if compiled with \'-s on\'")
raise RuntimeError("Profiler is not supported if compiled with \'-s on\'")
if context.get_context("mode") == context.PYNATIVE_MODE:
raise RuntimeError("Profiler is not supported in PyNative mode")
# get device_id and device_target
self._get_devid_rankid_and_devtarget()
@ -643,7 +646,7 @@ class Profiler:
dev_id = "0"
logger.warning("Fail to get DEVICE_ID, use 0 instead.")
if device_target and device_target not in ["Ascend", "GPU"]:
if device_target and device_target not in ["Ascend", "GPU", "CPU"]:
msg = "Profiling: unsupported backend: %s" % device_target
raise RuntimeError(msg)

View File

@ -23,6 +23,7 @@ from mindspore import Tensor
from mindspore.ops import operations as P
from mindspore.profiler import Profiler
class Net(nn.Cell):
def __init__(self):
super(Net, self).__init__()
@ -35,6 +36,7 @@ class Net(nn.Cell):
x = np.random.randn(1, 3, 3, 4).astype(np.float32)
y = np.random.randn(1, 3, 3, 4).astype(np.float32)
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training

View File

@ -0,0 +1,70 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""test cpu profiler"""
import os
import shutil
import sys
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
from mindspore import Tensor
from mindspore.ops import operations as P
from mindspore.profiler import Profiler
class Net(nn.Cell):
    """Small cell that applies the ``P.Add`` operation to its two inputs."""

    def __init__(self):
        super().__init__()
        self.add = P.Add()

    def construct(self, x_, y_):
        summed = self.add(x_, y_)
        return summed
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
@pytest.mark.security_off
def test_cpu_profiling():
    """Run ``Net`` once under the profiler on CPU and check the expected output files exist.

    The test returns immediately on any platform other than Linux.
    """
    if sys.platform != 'linux':
        return

    output_dir = os.path.join(os.getcwd(), 'data_cpu_profiler')
    # Remove output left by an earlier run so it cannot satisfy the checks below.
    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)

    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    device_id = context.get_context("device_id")
    profiler = Profiler(output_path="data_cpu_profiler")

    lhs = np.random.randn(1, 3, 3, 4).astype(np.float32)
    rhs = np.random.randn(1, 3, 3, 4).astype(np.float32)
    net = Net()
    net(Tensor(lhs), Tensor(rhs))
    profiler.analyse()

    # The output directory is expected to hold exactly one entry.
    assert os.path.isdir(output_dir)
    entries = os.listdir(output_dir)
    assert len(entries) == 1

    profiler_dir = os.path.join(output_dir, f"{entries[0]}/")
    expected_files = (
        f"{profiler_dir}cpu_op_detail_info_{device_id}.csv",
        f"{profiler_dir}cpu_op_type_info_{device_id}.csv",
        f"{profiler_dir}cpu_op_execute_timestamp_{device_id}.txt",
    )
    for path in expected_files:
        assert os.path.isfile(path)

    if os.path.isdir(output_dir):
        shutil.rmtree(output_dir)

View File

@ -15,6 +15,8 @@
import os
import shutil
import sys
from tests.security_utils import security_off_wrap
import pytest
@ -53,6 +55,7 @@ def weight_variable():
class LeNet5(nn.Cell):
"""Define LeNet5 network."""
def __init__(self, num_class=10, channel=1):
super(LeNet5, self).__init__()
self.num_class = num_class
@ -86,7 +89,7 @@ class LeNet5(nn.Cell):
def create_dataset(data_path, batch_size=32, repeat_size=1, num_parallel_workers=1):
"""create dataset for train"""
# define dataset
mnist_ds = ds.MnistDataset(data_path, num_samples=batch_size*100)
mnist_ds = ds.MnistDataset(data_path, num_samples=batch_size * 100)
resize_height, resize_width = 32, 32
rescale = 1.0 / 255.0
@ -131,10 +134,26 @@ class TestProfiler:
rank_id = int(os.getenv('RANK_ID')) if os.getenv('RANK_ID') else 0
mnist_path = '/home/workspace/mindspore_dataset/mnist'
def teardown(self):
""" Run after each use case."""
@classmethod
def setup_class(cls):
"""Run before all test cases start."""
cleanup()
@staticmethod
def teardown():
"""Run after each test case ends."""
cleanup()
@pytest.mark.level2
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
@security_off_wrap
def test_cpu_profiler(self):
    """Train with the profiler on the CPU target, then check the CPU profiling files (Linux only)."""
    if sys.platform == 'linux':
        self._train_with_profiler(device_target="CPU")
        self._check_cpu_profiling_file()
@pytest.mark.level1
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
@ -177,12 +196,10 @@ class TestProfiler:
getnext_file = self.profiler_path + f'minddata_getnext_profiling_{self.device_id}.txt'
pipeline_file = self.profiler_path + f'minddata_pipeline_raw_{self.device_id}.csv'
assert os.path.exists(op_detail_file)
assert os.path.exists(op_type_file)
assert os.path.exists(activity_file)
assert os.path.exists(timeline_file)
assert os.path.exists(getnext_file)
assert os.path.exists(pipeline_file)
gpu_profiler_files = (op_detail_file, op_type_file, activity_file,
timeline_file, getnext_file, pipeline_file)
for file in gpu_profiler_files:
assert os.path.isfile(file)
def _check_d_profiling_file(self):
aicore_file = self.profiler_path + f'aicore_intermediate_{self.rank_id}_detail.csv'
@ -193,10 +210,16 @@ class TestProfiler:
queue_profiling_file = self.profiler_path + f'device_queue_profiling_{self.rank_id}.txt'
memory_file = self.profiler_path + f'memory_usage_{self.rank_id}.pb'
assert os.path.exists(aicore_file)
assert os.path.exists(step_trace_file)
assert os.path.exists(timeline_file)
assert os.path.exists(queue_profiling_file)
assert os.path.exists(minddata_pipeline_file)
assert os.path.exists(aicpu_file)
assert os.path.exists(memory_file)
d_profiler_files = (aicore_file, step_trace_file, timeline_file, aicpu_file,
minddata_pipeline_file, queue_profiling_file, memory_file)
for file in d_profiler_files:
assert os.path.isfile(file)
def _check_cpu_profiling_file(self):
op_detail_file = self.profiler_path + f'cpu_op_detail_info_{self.device_id}.csv'
op_type_file = self.profiler_path + f'cpu_op_type_info_{self.device_id}.csv'
timeline_file = self.profiler_path + f'cpu_op_execute_timestamp_{self.device_id}.txt'
cpu_profiler_files = (op_detail_file, op_type_file, timeline_file)
for file in cpu_profiler_files:
assert os.path.isfile(file)