forked from mindspore-Ecosystem/mindspore
!24061 support cpu profiling and cpu ops code check
Merge pull request !24061 from zhangbuxue/support_cpu_profiling_and_code_check_for_cpu_ops
This commit is contained in:
commit
cacf8427dd
|
@ -25,6 +25,13 @@
|
|||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace {
|
||||
constexpr size_t kSizeFloat32 = sizeof(float);
|
||||
constexpr size_t kScalarIndex = 0;
|
||||
constexpr size_t kAdamWeightDecayInputsNum = 9;
|
||||
constexpr size_t kAdamWeightDecayOutputsNum = 3;
|
||||
} // namespace
|
||||
|
||||
template <typename T>
|
||||
void AdamWeightDecayCPUKernel::LaunchAdamWeightDecay(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &) {
|
||||
|
@ -83,26 +90,15 @@ void AdamWeightDecayCPUKernel::LaunchAdamWeightDecayNnacl(const std::vector<Addr
|
|||
|
||||
void AdamWeightDecayCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
|
||||
if (input_num != kAdamWeightDecayInputNum) {
|
||||
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but AdamWeightDecay needs 9 inputs.";
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != kAdamWeightDecayOutputNum) {
|
||||
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but AdamWeightDecay needs 3 outputs.";
|
||||
}
|
||||
}
|
||||
|
||||
bool AdamWeightDecayCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> &,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
if (inputs.size() != kAdamWeightDecayInputNum) {
|
||||
MS_LOG(EXCEPTION) << "Input number is " << inputs.size() << ", but AdamWeightDecay needs 9 inputs.";
|
||||
}
|
||||
if (outputs.size() != kAdamWeightDecayOutputNum) {
|
||||
MS_LOG(EXCEPTION) << "Output number is " << outputs.size() << ", but AdamWeightDecay needs 3 outputs.";
|
||||
}
|
||||
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kAdamWeightDecayInputsNum, kernel_name_);
|
||||
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kAdamWeightDecayOutputsNum, kernel_name_);
|
||||
if (inputs[VAR]->size != inputs[M]->size || inputs[VAR]->size != inputs[V]->size ||
|
||||
inputs[VAR]->size != inputs[GRAD]->size) {
|
||||
MS_LOG(EXCEPTION) << "Var, m, v, grad input data size must be same!";
|
||||
|
|
|
@ -25,11 +25,6 @@
|
|||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
constexpr size_t kSizeFloat32 = sizeof(float);
|
||||
constexpr size_t kScalarIndex = 0;
|
||||
constexpr size_t kAdamWeightDecayInputNum = 9;
|
||||
constexpr size_t kAdamWeightDecayOutputNum = 3;
|
||||
|
||||
class AdamWeightDecayCPUKernel : public CPUKernel {
|
||||
public:
|
||||
AdamWeightDecayCPUKernel() = default;
|
||||
|
|
|
@ -16,16 +16,17 @@
|
|||
|
||||
#include "backend/kernel_compiler/cpu/depthtospace_cpu_kernel.h"
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace {
|
||||
constexpr size_t kDepthToSpaceInputsNum = 1;
|
||||
constexpr size_t kDepthToSpaceOutputsNum = 1;
|
||||
} // namespace
|
||||
|
||||
template <typename T>
|
||||
void DepthToSpaceCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
CheckParam(kernel_node);
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
output_shape_ = AnfAlgo::GetOutputDeviceShape(kernel_node, 0);
|
||||
block_size_ = LongToSize(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "block_size"));
|
||||
|
@ -35,6 +36,8 @@ template <typename T>
|
|||
bool DepthToSpaceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> & /* workspace */,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kDepthToSpaceInputsNum, kernel_name_);
|
||||
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kDepthToSpaceOutputsNum, kernel_name_);
|
||||
auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
size_t size = inputs[0]->size / sizeof(T);
|
||||
|
@ -73,17 +76,5 @@ bool DepthToSpaceCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inp
|
|||
CPUKernelUtils::ParallelFor(task, size);
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void DepthToSpaceCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but DepthToSpaceCPUKerrnel needs 1 input.";
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but DepthToSpaceCPUKernel needs 1 output.";
|
||||
}
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -17,8 +17,6 @@
|
|||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_DEPTHTOSPACE_CPU_KERNEL_H_
|
||||
|
||||
#include <memory>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
|
@ -37,7 +35,6 @@ class DepthToSpaceCPUKernel : public CPUKernel {
|
|||
const std::vector<AddressPtr> &outputs) override;
|
||||
|
||||
private:
|
||||
void CheckParam(const CNodePtr &kernel_node);
|
||||
std::vector<size_t> input_shape_;
|
||||
std::vector<size_t> output_shape_;
|
||||
size_t block_size_{0};
|
||||
|
|
|
@ -13,29 +13,35 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
#include <algorithm>
|
||||
#include <string>
|
||||
|
||||
#include "backend/kernel_compiler/cpu/iou_cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/mkldnn/mkl_kernel_engine.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
#include "utils/ms_utils.h"
|
||||
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace {
|
||||
constexpr size_t kIOUInputsNum = 2;
|
||||
constexpr size_t kIOUOutputsNum = 1;
|
||||
constexpr size_t kBoxCoordinateLen = 4;
|
||||
} // namespace
|
||||
|
||||
template <typename T>
|
||||
void IOUCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
auto anchor_boxes_shape = AnfAlgo::GetInputDeviceShape(kernel_node, ANCHOR_BOXES);
|
||||
constexpr size_t BOX_SHAPE_SIZE = 2;
|
||||
constexpr size_t BOX_SIZE_INDEX = 0;
|
||||
constexpr size_t BOX_COORDINATE_INDEX = 1;
|
||||
|
||||
if (anchor_boxes_shape.size() != BOX_SHAPE_SIZE || anchor_boxes_shape[BOX_COORDINATE_INDEX] != BOX_COORDINATE_LEN) {
|
||||
if (anchor_boxes_shape.size() != BOX_SHAPE_SIZE || anchor_boxes_shape[BOX_COORDINATE_INDEX] != kBoxCoordinateLen) {
|
||||
MS_LOG(EXCEPTION) << "The anchor_boxes shape should be [N, 4].";
|
||||
}
|
||||
anchor_boxes_size_ = anchor_boxes_shape[BOX_SIZE_INDEX];
|
||||
auto gt_boxes_shape = AnfAlgo::GetInputDeviceShape(kernel_node, GT_BOXES);
|
||||
if (gt_boxes_shape.size() != BOX_SHAPE_SIZE || gt_boxes_shape[BOX_COORDINATE_INDEX] != BOX_COORDINATE_LEN) {
|
||||
if (gt_boxes_shape.size() != BOX_SHAPE_SIZE || gt_boxes_shape[BOX_COORDINATE_INDEX] != kBoxCoordinateLen) {
|
||||
MS_LOG(EXCEPTION) << "The gt_boxes shape should be [N, 4].";
|
||||
}
|
||||
gt_boxes_size_ = gt_boxes_shape[BOX_SIZE_INDEX];
|
||||
|
@ -52,12 +58,8 @@ void IOUCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
|||
template <typename T>
|
||||
bool IOUCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, const std::vector<kernel::AddressPtr> &,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
if (inputs.size() != INPUT_NUMS) {
|
||||
MS_LOG(EXCEPTION) << "Input number is " << inputs.size() << ", but IOU needs " << INPUT_NUMS << " inputs.";
|
||||
}
|
||||
if (outputs.size() != OUTPUT_NUMS) {
|
||||
MS_LOG(EXCEPTION) << "Output number is " << outputs.size() << ", but IOU needs " << OUTPUT_NUMS << " outputs.";
|
||||
}
|
||||
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kIOUInputsNum, kernel_name_);
|
||||
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kIOUOutputsNum, kernel_name_);
|
||||
auto anchor_boxes = reinterpret_cast<T *>(inputs[ANCHOR_BOXES]->addr);
|
||||
auto gt_boxes = reinterpret_cast<T *>(inputs[GT_BOXES]->addr);
|
||||
auto iou_score = reinterpret_cast<T *>(outputs[IOU_VALUE]->addr);
|
||||
|
@ -71,8 +73,8 @@ bool IOUCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs, cons
|
|||
constexpr size_t X1_SHIFT = 2;
|
||||
constexpr size_t Y1_SHIFT = 3;
|
||||
for (size_t i = start; i < end; i++) {
|
||||
size_t idx1 = i % anchor_boxes_size_ * BOX_COORDINATE_LEN;
|
||||
size_t idx2 = i / anchor_boxes_size_ * BOX_COORDINATE_LEN;
|
||||
size_t idx1 = i % anchor_boxes_size_ * kBoxCoordinateLen;
|
||||
size_t idx2 = i / anchor_boxes_size_ * kBoxCoordinateLen;
|
||||
T I_x0 = std::max(anchor_boxes[idx1], gt_boxes[idx2]);
|
||||
T I_y0 = std::max(anchor_boxes[idx1 + Y0_SHIFT], gt_boxes[idx2 + Y0_SHIFT]);
|
||||
T I_x1 = std::min(anchor_boxes[idx1 + X1_SHIFT], gt_boxes[idx2 + X1_SHIFT]);
|
||||
|
|
|
@ -17,16 +17,12 @@
|
|||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_IOU_CPU_KERNEL_H_
|
||||
|
||||
#include <vector>
|
||||
#include <memory>
|
||||
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
constexpr size_t INPUT_NUMS = 2;
|
||||
constexpr size_t OUTPUT_NUMS = 1;
|
||||
constexpr size_t BOX_COORDINATE_LEN = 4;
|
||||
|
||||
template <typename T>
|
||||
class IOUCPUKernel : public CPUKernel {
|
||||
public:
|
||||
|
|
|
@ -15,18 +15,33 @@
|
|||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/l2_normalize_cpu_kernel.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
#include <utility>
|
||||
#include <limits>
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace {
|
||||
constexpr size_t kL2NormalizeInputsNum = 1;
|
||||
constexpr size_t kL2NormalizeOutputsNum = 1;
|
||||
} // namespace
|
||||
|
||||
template <typename T>
|
||||
void L2NormalizeCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
epsilon_ = static_cast<T>(AnfAlgo::GetNodeAttr<float>(kernel_node, "epsilon"));
|
||||
axis_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, "axis"));
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
epsilon_ = static_cast<T>(AnfAlgo::GetNodeAttr<float>(kernel_node, EPSILON));
|
||||
axis_ = LongToInt(AnfAlgo::GetNodeAttr<int64_t>(kernel_node, AXIS));
|
||||
input_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
output_shape_ = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||
CheckParam(kernel_node);
|
||||
|
||||
int dims = SizeToInt(input_shape_.size());
|
||||
if (axis_ < -dims || axis_ >= dims) {
|
||||
MS_LOG(EXCEPTION) << "Attr axis_ " << axis_ << " must be in " << -dims << "~" << dims;
|
||||
}
|
||||
if (epsilon_ == (T)0.0) {
|
||||
MS_LOG(EXCEPTION) << "Attr epsilon can not be zero.";
|
||||
}
|
||||
if (axis_ < 0) {
|
||||
axis_ += SizeToInt(input_shape_.size());
|
||||
}
|
||||
|
@ -112,6 +127,8 @@ template <typename T>
|
|||
bool L2NormalizeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> & /* workspace */,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kL2NormalizeInputsNum, kernel_name_);
|
||||
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kL2NormalizeOutputsNum, kernel_name_);
|
||||
auto input_addr = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
|
||||
|
@ -131,24 +148,5 @@ bool L2NormalizeCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inpu
|
|||
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void L2NormalizeCPUKernel<T>::CheckParam(const CNodePtr &kernel_node) {
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
int dims = SizeToInt(input_shape_.size());
|
||||
if (input_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but L2NormalizeCPUKernel needs 1 input.";
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != 1) {
|
||||
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but L2NormalizeCPUKernel needs 1 output.";
|
||||
}
|
||||
if (axis_ < -dims || axis_ >= dims) {
|
||||
MS_LOG(EXCEPTION) << "Attr axis_ " << axis_ << " must be in " << -dims << "~" << dims;
|
||||
}
|
||||
if (epsilon_ == (T)0.0) {
|
||||
MS_LOG(EXCEPTION) << "Attr epsilon can not be zero.";
|
||||
}
|
||||
}
|
||||
} // namespace kernel
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -16,10 +16,10 @@
|
|||
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2_NORMALIZE_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2_NORMALIZE_CPU_KERNEL_H_
|
||||
#include <memory>
|
||||
|
||||
#include <vector>
|
||||
#include <limits>
|
||||
#include <utility>
|
||||
#include <memory>
|
||||
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
|
||||
|
|
|
@ -15,15 +15,19 @@
|
|||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/l2normalize_grad_cpu_kernel.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace {
|
||||
constexpr size_t kL2NormalizeGradInputsNum = 3;
|
||||
constexpr size_t kL2NormalizeGradOutputsNum = 1;
|
||||
} // namespace
|
||||
|
||||
template <typename T>
|
||||
void L2NormalizeGradCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
CheckIONumber(kernel_node);
|
||||
for (size_t i = 0; i < INPUT_SIZE; i++) {
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
for (size_t i = 0; i < kL2NormalizeGradInputsNum; i++) {
|
||||
(void)input_shape_list_.emplace_back(AnfAlgo::GetPrevNodeOutputInferShape(kernel_node, i));
|
||||
}
|
||||
auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
|
||||
|
@ -45,6 +49,8 @@ template <typename T>
|
|||
bool L2NormalizeGradCPUKernel<T>::Launch(const std::vector<AddressPtr> &inputs,
|
||||
const std::vector<AddressPtr> &workspace,
|
||||
const std::vector<AddressPtr> &outputs) {
|
||||
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kL2NormalizeGradInputsNum, kernel_name_);
|
||||
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kL2NormalizeGradOutputsNum, kernel_name_);
|
||||
auto input_x = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto y = reinterpret_cast<T *>(inputs[1]->addr);
|
||||
auto dout = reinterpret_cast<T *>(inputs[2]->addr);
|
||||
|
@ -78,18 +84,6 @@ void L2NormalizeGradCPUKernel<T>::CheckInputShape(const std::vector<size_t> &out
|
|||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void L2NormalizeGradCPUKernel<T>::CheckIONumber(const CNodePtr &kernel_node) {
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != INPUT_SIZE) {
|
||||
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but L2NormalizeGradCPUKernel needs 3 input.";
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != OUTPUT_SIZE) {
|
||||
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but L2NormalizeGradCPUKernel needs 1 output.";
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
std::vector<size_t> L2NormalizeGradCPUKernel<T>::OneDimIndexToHighDimIndex(size_t one_dim_index) {
|
||||
std::vector<size_t> high_dim_index;
|
||||
|
|
|
@ -13,17 +13,17 @@
|
|||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2NORMALIZE_GRAD_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_L2NORMALIZE_GRAD_CPU_KERNEL_H_
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
constexpr size_t INPUT_SIZE = 3;
|
||||
constexpr size_t OUTPUT_SIZE = 1;
|
||||
template <typename T>
|
||||
class L2NormalizeGradCPUKernel : public CPUKernel {
|
||||
public:
|
||||
|
@ -37,7 +37,6 @@ class L2NormalizeGradCPUKernel : public CPUKernel {
|
|||
|
||||
private:
|
||||
void CheckInputShape(const std::vector<size_t> &output_shape);
|
||||
void CheckIONumber(const CNodePtr &kernel_node);
|
||||
std::vector<size_t> OneDimIndexToHighDimIndex(size_t one_dim_index);
|
||||
void HighDimIndexToOneDimIndex(size_t *one_dim_index, const std::vector<size_t> &high_dim_index);
|
||||
std::vector<T> GetVector(const std::vector<size_t> &high_dim_index, const T *x);
|
||||
|
|
|
@ -15,22 +15,18 @@
|
|||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/masked_select_cpu_kernel.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace {
|
||||
constexpr size_t kMaskedSelectInputsNum = 2;
|
||||
constexpr size_t kMaskedSelectOutputsNum = 1;
|
||||
} // namespace
|
||||
|
||||
template <typename T>
|
||||
void MaskedSelectCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != kInputNum) {
|
||||
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but MaskedSelectCPUKernel needs " << kInputNum
|
||||
<< " input.";
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != kOutputNum) {
|
||||
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but MaskedSelectCPUKernel needs " << kOutputNum
|
||||
<< " output.";
|
||||
}
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
input_shape_a_ = AnfAlgo::GetInputDeviceShape(kernel_node, 0);
|
||||
input_shape_b_ = AnfAlgo::GetInputDeviceShape(kernel_node, 1);
|
||||
output_shape_ = CPUKernelUtils::GetBroadcastShape(input_shape_a_, input_shape_b_);
|
||||
|
@ -44,6 +40,8 @@ template <typename T>
|
|||
bool MaskedSelectCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> &,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMaskedSelectInputsNum, kernel_name_);
|
||||
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMaskedSelectOutputsNum, kernel_name_);
|
||||
auto x = reinterpret_cast<T *>(inputs[0]->addr);
|
||||
auto mask = reinterpret_cast<bool *>(inputs[1]->addr);
|
||||
auto y = reinterpret_cast<T *>(outputs[0]->addr);
|
||||
|
|
|
@ -16,16 +16,14 @@
|
|||
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_CPU_KERNEL_H_
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
constexpr size_t kInputNum = 2;
|
||||
constexpr size_t kOutputNum = 1;
|
||||
template <typename T>
|
||||
class MaskedSelectCPUKernel : public CPUKernel {
|
||||
public:
|
||||
|
|
|
@ -15,22 +15,18 @@
|
|||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/masked_select_grad_cpu_kernel.h"
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace {
|
||||
constexpr size_t kMaskedSelectGradInputsNum = 3;
|
||||
constexpr size_t kMaskedSelectGradOutputsNum = 1;
|
||||
} // namespace
|
||||
|
||||
template <typename T>
|
||||
void MaskedSelectGradCPUKernel<T>::InitKernel(const CNodePtr &kernel_node) {
|
||||
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);
|
||||
if (input_num != kInputNum) {
|
||||
MS_LOG(EXCEPTION) << "Input number is " << input_num << ", but MaskedSelectGradCPUKernel needs " << kInputNum
|
||||
<< " input.";
|
||||
}
|
||||
size_t output_num = AnfAlgo::GetOutputTensorNum(kernel_node);
|
||||
if (output_num != kOutputNum) {
|
||||
MS_LOG(EXCEPTION) << "Output number is " << output_num << ", but MaskedSelectGradCPUKernel needs " << kOutputNum
|
||||
<< " output.";
|
||||
}
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
input_shape_a_ = AnfAlgo::GetInputDeviceShape(kernel_node, INPUT);
|
||||
input_shape_b_ = AnfAlgo::GetInputDeviceShape(kernel_node, MASK);
|
||||
grad_shape_ = AnfAlgo::GetInputDeviceShape(kernel_node, GRAD);
|
||||
|
@ -44,6 +40,8 @@ template <typename T>
|
|||
bool MaskedSelectGradCPUKernel<T>::Launch(const std::vector<kernel::AddressPtr> &inputs,
|
||||
const std::vector<kernel::AddressPtr> &,
|
||||
const std::vector<kernel::AddressPtr> &outputs) {
|
||||
CHECK_KERNEL_INPUTS_NUM(inputs.size(), kMaskedSelectGradInputsNum, kernel_name_);
|
||||
CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kMaskedSelectGradOutputsNum, kernel_name_);
|
||||
auto mask = reinterpret_cast<bool *>(inputs[MASK]->addr);
|
||||
auto grad = reinterpret_cast<T *>(inputs[GRAD]->addr);
|
||||
auto dx = reinterpret_cast<T *>(outputs[INPUT]->addr);
|
||||
|
|
|
@ -16,16 +16,14 @@
|
|||
|
||||
#ifndef MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_GRAD_CPU_KERNEL_H_
|
||||
#define MINDSPORE_CCSRC_BACKEND_KERNEL_COMPILER_CPU_MASKED_SELECTED_GRAD_CPU_KERNEL_H_
|
||||
#include <memory>
|
||||
#include <unordered_map>
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel.h"
|
||||
#include "backend/kernel_compiler/cpu/cpu_kernel_factory.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
constexpr size_t kInputNum = 3;
|
||||
constexpr size_t kOutputNum = 1;
|
||||
template <typename T>
|
||||
class MaskedSelectGradCPUKernel : public CPUKernel {
|
||||
public:
|
||||
|
|
|
@ -15,13 +15,17 @@
|
|||
*/
|
||||
|
||||
#include "backend/kernel_compiler/cpu/pad_and_shift_cpu_kernel.h"
|
||||
#include <string>
|
||||
#include "runtime/device/cpu/cpu_device_address.h"
|
||||
|
||||
namespace mindspore {
|
||||
namespace kernel {
|
||||
namespace {
|
||||
constexpr size_t kPadAndShiftInputsNum = 3;
|
||||
constexpr size_t kPadAndShiftOutputsNum = 1;
|
||||
} // namespace
|
||||
|
||||
void PadAndShiftCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
||||
MS_EXCEPTION_IF_NULL(kernel_node);
|
||||
kernel_name_ = AnfAlgo::GetCNodeName(kernel_node);
|
||||
node_wpt_ = kernel_node;
|
||||
input_x_dtype_ = AnfAlgo::GetInputDeviceDataType(kernel_node, 0);
|
||||
type_size_ = GetTypeByte(TypeIdToType(input_x_dtype_));
|
||||
|
@ -41,13 +45,14 @@ void PadAndShiftCPUKernel::InitKernel(const CNodePtr &kernel_node) {
|
|||
// Entry point for one execution of the kernel.
// Checks the operand counts, then dispatches to LaunchKernel<T> according to the data type recorded in
// input_x_dtype_ (int32 -> int, int64 -> int64_t). The workspace argument is unused.
// Returns true after a successful dispatch. An unsupported dtype is reported through MS_LOG(EXCEPTION),
// in line with how the other CPU kernels in this change report invalid configuration; the previous
// "log an error and return false" statements made that report unreachable and have been dropped.
bool PadAndShiftCPUKernel::Launch(const std::vector<kernel::AddressPtr> &inputs,
                                  const std::vector<kernel::AddressPtr> &,
                                  const std::vector<kernel::AddressPtr> &outputs) {
  CHECK_KERNEL_INPUTS_NUM(inputs.size(), kPadAndShiftInputsNum, kernel_name_);
  CHECK_KERNEL_OUTPUTS_NUM(outputs.size(), kPadAndShiftOutputsNum, kernel_name_);
  if (input_x_dtype_ == kNumberTypeInt32) {
    LaunchKernel<int>(inputs, outputs);
  } else if (input_x_dtype_ == kNumberTypeInt64) {
    LaunchKernel<int64_t>(inputs, outputs);
  } else {
    MS_LOG(EXCEPTION) << "Dtype of input_x only support int32, int64";
  }
  return true;
}
|
||||
|
|
|
@ -122,7 +122,10 @@ class Profiler:
|
|||
|
||||
def __init__(self, **kwargs):
|
||||
if c_expression.security.enable_security():
|
||||
raise Runtime("Profiler is not supported if compiled with \'-s on\'")
|
||||
raise RuntimeError("Profiler is not supported if compiled with \'-s on\'")
|
||||
|
||||
if context.get_context("mode") == context.PYNATIVE_MODE:
|
||||
raise RuntimeError("Profiler is not supported in PyNative mode")
|
||||
|
||||
# get device_id and device_target
|
||||
self._get_devid_rankid_and_devtarget()
|
||||
|
@ -643,7 +646,7 @@ class Profiler:
|
|||
dev_id = "0"
|
||||
logger.warning("Fail to get DEVICE_ID, use 0 instead.")
|
||||
|
||||
if device_target and device_target not in ["Ascend", "GPU"]:
|
||||
if device_target and device_target not in ["Ascend", "GPU", "CPU"]:
|
||||
msg = "Profiling: unsupported backend: %s" % device_target
|
||||
raise RuntimeError(msg)
|
||||
|
||||
|
|
|
@ -23,6 +23,7 @@ from mindspore import Tensor
|
|||
from mindspore.ops import operations as P
|
||||
from mindspore.profiler import Profiler
|
||||
|
||||
|
||||
class Net(nn.Cell):
|
||||
def __init__(self):
|
||||
super(Net, self).__init__()
|
||||
|
@ -35,6 +36,7 @@ class Net(nn.Cell):
|
|||
x = np.random.randn(1, 3, 3, 4).astype(np.float32)
|
||||
y = np.random.randn(1, 3, 3, 4).astype(np.float32)
|
||||
|
||||
|
||||
@pytest.mark.level0
|
||||
@pytest.mark.platform_arm_ascend_training
|
||||
@pytest.mark.platform_x86_ascend_training
|
||||
|
|
|
@ -0,0 +1,70 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""test cpu profiler"""
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
import mindspore.context as context
|
||||
import mindspore.nn as nn
|
||||
from mindspore import Tensor
|
||||
from mindspore.ops import operations as P
|
||||
from mindspore.profiler import Profiler
|
||||
|
||||
|
||||
class Net(nn.Cell):
    """Minimal network used by the profiler test: applies a single Add operator to its two inputs."""

    def __init__(self):
        super().__init__()
        # The only operator in the graph.
        self.add = P.Add()

    def construct(self, x_, y_):
        """Return the result of the Add operator applied to ``x_`` and ``y_``."""
        total = self.add(x_, y_)
        return total
|
||||
|
||||
|
||||
@pytest.mark.level0
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
@pytest.mark.security_off
def test_cpu_profiling():
    """Run one Add network under the Profiler on CPU and check the files it writes.

    The test returns immediately on any platform other than Linux. It expects the profiler
    output directory to contain exactly one sub-directory holding the op-detail CSV, the
    op-type CSV and the execute-timestamp text file for the current device id.

    The output directory is removed before the run and again afterwards; the final removal
    sits in a ``finally`` clause so a failed assertion does not leave stale data behind
    for the next run.
    """
    if sys.platform != 'linux':
        return
    data_path = os.path.join(os.getcwd(), 'data_cpu_profiler')
    if os.path.isdir(data_path):
        shutil.rmtree(data_path)
    try:
        context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
        device_id = context.get_context("device_id")
        profiler = Profiler(output_path="data_cpu_profiler")
        x = np.random.randn(1, 3, 3, 4).astype(np.float32)
        y = np.random.randn(1, 3, 3, 4).astype(np.float32)
        add = Net()
        add(Tensor(x), Tensor(y))
        profiler.analyse()

        assert os.path.isdir(data_path)
        assert len(os.listdir(data_path)) == 1

        # The single sub-directory name is chosen by the profiler, so it is discovered, not hard-coded.
        profiler_dir = os.path.join(data_path, f"{os.listdir(data_path)[0]}/")
        op_detail_file = f"{profiler_dir}cpu_op_detail_info_{device_id}.csv"
        op_type_file = f"{profiler_dir}cpu_op_type_info_{device_id}.csv"
        timeline_file = f"{profiler_dir}cpu_op_execute_timestamp_{device_id}.txt"
        cpu_profiler_files = (op_detail_file, op_type_file, timeline_file)
        for file in cpu_profiler_files:
            assert os.path.isfile(file)
    finally:
        if os.path.isdir(data_path):
            shutil.rmtree(data_path)
|
|
@ -15,6 +15,8 @@
|
|||
import os
|
||||
import shutil
|
||||
|
||||
import sys
|
||||
|
||||
from tests.security_utils import security_off_wrap
|
||||
import pytest
|
||||
|
||||
|
@ -53,6 +55,7 @@ def weight_variable():
|
|||
|
||||
class LeNet5(nn.Cell):
|
||||
"""Define LeNet5 network."""
|
||||
|
||||
def __init__(self, num_class=10, channel=1):
|
||||
super(LeNet5, self).__init__()
|
||||
self.num_class = num_class
|
||||
|
@ -86,7 +89,7 @@ class LeNet5(nn.Cell):
|
|||
def create_dataset(data_path, batch_size=32, repeat_size=1, num_parallel_workers=1):
|
||||
"""create dataset for train"""
|
||||
# define dataset
|
||||
mnist_ds = ds.MnistDataset(data_path, num_samples=batch_size*100)
|
||||
mnist_ds = ds.MnistDataset(data_path, num_samples=batch_size * 100)
|
||||
|
||||
resize_height, resize_width = 32, 32
|
||||
rescale = 1.0 / 255.0
|
||||
|
@ -131,10 +134,26 @@ class TestProfiler:
|
|||
rank_id = int(os.getenv('RANK_ID')) if os.getenv('RANK_ID') else 0
|
||||
mnist_path = '/home/workspace/mindspore_dataset/mnist'
|
||||
|
||||
@classmethod
def setup_class(cls):
    """Run once before the test cases of this class; calls cleanup()."""
    cleanup()

# The earlier docstring-only `teardown(self)` was shadowed by this definition and has been removed.
@staticmethod
def teardown():
    """Run after each test case; calls cleanup()."""
    cleanup()
|
||||
|
||||
@pytest.mark.level2
@pytest.mark.platform_x86_cpu
@pytest.mark.env_onecard
@security_off_wrap
def test_cpu_profiler(self):
    """Train with the profiler on the CPU target, then check the CPU profiling files.

    Nothing is executed on platforms other than Linux.
    """
    on_linux = sys.platform == 'linux'
    if on_linux:
        self._train_with_profiler(device_target="CPU")
        self._check_cpu_profiling_file()
|
||||
|
||||
@pytest.mark.level1
|
||||
@pytest.mark.platform_x86_gpu_training
|
||||
@pytest.mark.env_onecard
|
||||
|
@ -177,12 +196,10 @@ class TestProfiler:
|
|||
getnext_file = self.profiler_path + f'minddata_getnext_profiling_{self.device_id}.txt'
|
||||
pipeline_file = self.profiler_path + f'minddata_pipeline_raw_{self.device_id}.csv'
|
||||
|
||||
assert os.path.exists(op_detail_file)
|
||||
assert os.path.exists(op_type_file)
|
||||
assert os.path.exists(activity_file)
|
||||
assert os.path.exists(timeline_file)
|
||||
assert os.path.exists(getnext_file)
|
||||
assert os.path.exists(pipeline_file)
|
||||
gpu_profiler_files = (op_detail_file, op_type_file, activity_file,
|
||||
timeline_file, getnext_file, pipeline_file)
|
||||
for file in gpu_profiler_files:
|
||||
assert os.path.isfile(file)
|
||||
|
||||
def _check_d_profiling_file(self):
|
||||
aicore_file = self.profiler_path + f'aicore_intermediate_{self.rank_id}_detail.csv'
|
||||
|
@ -193,10 +210,16 @@ class TestProfiler:
|
|||
queue_profiling_file = self.profiler_path + f'device_queue_profiling_{self.rank_id}.txt'
|
||||
memory_file = self.profiler_path + f'memory_usage_{self.rank_id}.pb'
|
||||
|
||||
assert os.path.exists(aicore_file)
|
||||
assert os.path.exists(step_trace_file)
|
||||
assert os.path.exists(timeline_file)
|
||||
assert os.path.exists(queue_profiling_file)
|
||||
assert os.path.exists(minddata_pipeline_file)
|
||||
assert os.path.exists(aicpu_file)
|
||||
assert os.path.exists(memory_file)
|
||||
d_profiler_files = (aicore_file, step_trace_file, timeline_file, aicpu_file,
|
||||
minddata_pipeline_file, queue_profiling_file, memory_file)
|
||||
for file in d_profiler_files:
|
||||
assert os.path.isfile(file)
|
||||
|
||||
def _check_cpu_profiling_file(self):
|
||||
op_detail_file = self.profiler_path + f'cpu_op_detail_info_{self.device_id}.csv'
|
||||
op_type_file = self.profiler_path + f'cpu_op_type_info_{self.device_id}.csv'
|
||||
timeline_file = self.profiler_path + f'cpu_op_execute_timestamp_{self.device_id}.txt'
|
||||
|
||||
cpu_profiler_files = (op_detail_file, op_type_file, timeline_file)
|
||||
for file in cpu_profiler_files:
|
||||
assert os.path.isfile(file)
|
||||
|
|
Loading…
Reference in New Issue