forked from mindspore-Ecosystem/mindspore
!14152 reduce runtime RAM while fp16 is enabled
From: @hangangqiang  Reviewed-by: @zhang_xue_tong, @zhanghaibo5  Signed-off-by: @zhang_xue_tong
Commit: 3719fdf5ea
@@ -474,6 +474,25 @@ void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane,
  }
}

void PackNCHWFp16ToNC8HW8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel) {
  int c8 = UP_DIV(channel, C8NUM);
  for (int b = 0; b < batch; b++) {
    int src_offset = b * plane * channel;
    int dst_offset = b * plane * c8 * C8NUM;
    for (int c = 0; c < channel; c++) {
      int c8_block_num = c / C8NUM;
      int c8_block_rem = c % C8NUM;
      int src_c_offset = src_offset + c * plane;
      int dst_c_offset = dst_offset + c8_block_num * plane * C8NUM;
      for (int k = 0; k < plane; k++) {
        int src_kernel_offset = src_c_offset + k;
        int dst_kernel_offset = dst_c_offset + C8NUM * k + c8_block_rem;
        (dst + dst_kernel_offset)[0] = (float16_t)(src + src_kernel_offset)[0];
      }
    }
  }
}

void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel) {
  int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
  for (int b = 0; b < batch; b++) {
@@ -504,6 +523,21 @@ void PackNHWCFp32ToC8HWN8Fp16(float *src, float16_t *dst, int batch, int plane,
  return;
}

void PackNHWCFp16ToC8HWN8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel) {
  for (int n = 0; n < batch; n++) {
    for (int hw = 0; hw < plane; hw++) {
      for (int c = 0; c < channel; c++) {
        int c8div = c / C8NUM;
        int c8mod = c % C8NUM;
        int src_index = n * plane * channel + hw * channel + c;
        int dst_index = c8div * batch * plane * C8NUM + hw * batch * C8NUM + n * C8NUM + c8mod;
        dst[dst_index] = src[src_index];
      }
    }
  }
  return;
}

void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel) {
  int c8_channel = UP_DIV(channel, C8NUM) * C8NUM;
  for (int b = 0; b < batch; b++) {
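For reference, the two new Fp16 pack routines above reuse the destination layouts of the existing Fp32 variants; only the source element type changes, so no intermediate fp32 copy of the weights is needed. In the NC8HW8 case, channels are grouped into blocks of C8NUM (8) and element (b, c, p) of an NCHW source lands at the offset computed below. A minimal index sketch, assuming C8NUM == 8 (UP_DIV and the NNACL fp16 types are not pulled in here):

#include <cstddef>

// Destination offset of NCHW element (b, c, p) in the NC8HW8 layout written by
// PackNCHWFp16ToNC8HW8Fp16 above; plane = H * W, channels in blocks of 8.
static size_t Nc8hw8Offset(size_t b, size_t c, size_t p, size_t plane, size_t channel) {
  const size_t c8 = (channel + 7) / 8;        // UP_DIV(channel, C8NUM)
  const size_t batch_stride = plane * c8 * 8;
  const size_t block_stride = plane * 8;      // one block of 8 channels
  return b * batch_stride + (c / 8) * block_stride + p * 8 + (c % 8);
}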
@@ -61,10 +61,14 @@ void PackNC8HW8ToNHWCFp16(const void *src, void *dst, int batch, int plane, int

void PackNCHWFp32ToNC8HW8Fp16(float *src, float16_t *dst, int batch, int plane, int channel);

void PackNCHWFp16ToNC8HW8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel);

void PackNHWCFp32ToNHWC8Fp16(float *src, float16_t *dst, int batch, int plane, int channel);

void PackNHWCFp32ToC8HWN8Fp16(float *src, float16_t *dst, int batch, int plane, int channel);

void PackNHWCFp16ToC8HWN8Fp16(float16_t *src, float16_t *dst, int batch, int plane, int channel);

void PackNHWC8Fp16ToNHWCFp32(float16_t *src, float *dst, int batch, int plane, int channel);

void PackNHWC8ToNHWCFp16(float16_t *src, float16_t *dst, int batch, int plane, int channel);
@@ -30,9 +30,6 @@ int QuantDtypeCastInferShape(const TensorC *const *inputs, size_t inputs_size, T
  TensorC *output = outputs[0];

  QuantDtypeCastParameter *param = (QuantDtypeCastParameter *)parameter;
  if (input->data_type_ != param->srcT_) {
    return NNACL_ERR;
  }
  output->data_type_ = param->dstT_;
  output->format_ = input->format_;
  if (!parameter->infer_flag_) {
@@ -24,7 +24,7 @@ extern "C" {

typedef struct QuantDtypeCastParameter {
  OpParameter op_parameter_;
  int srcT_;
  int srcT_;  // deprecated
  int dstT_;
} QuantDtypeCastParameter;
@@ -17,6 +17,7 @@
#include "src/inner_context.h"
#include "include/errorcode.h"
#include "src/common/log_adapter.h"
#include "src/common/utils.h"
#ifdef SUPPORT_NPU
#include "src/runtime/agent/npu/npu_manager.h"
#endif
@@ -85,18 +86,18 @@ int InnerContext::IsValid() const {
    MS_LOG(ERROR) << "Device list is empty.";
    return RET_NOT_SUPPORT;
  }
  if (!IsCpuEnabled()) {
    MS_LOG(ERROR) << "CPU is not supported.";
  if (!IsUserSetCpu()) {
    MS_LOG(ERROR) << "CPU context should be set.";
    return RET_NOT_SUPPORT;
  }
#ifndef SUPPORT_GPU
  if (IsGpuEnabled()) {
  if (IsUserSetGpu()) {
    MS_LOG(ERROR) << "GPU is not supported.";
    return RET_NOT_SUPPORT;
  }
#endif
#ifndef SUPPORT_NPU
  if (IsNpuEnabled()) {
  if (IsUserSetNpu()) {
    MS_LOG(ERROR) << "NPU is not supported.";
    return RET_NOT_SUPPORT;
  }
@@ -108,6 +109,9 @@ bool InnerContext::IsCpuFloat16Enabled() const {
  if (!IsCpuEnabled()) {
    return false;
  }
  if (!IsSupportFloat16()) {
    return false;
  }
  return GetCpuInfo().enable_float16_;
}
@@ -115,33 +119,49 @@ bool InnerContext::IsGpuFloat16Enabled() const {
  if (!IsGpuEnabled()) {
    return false;
  }
  if (!IsSupportFloat16()) {
    return false;
  }
  return GetGpuInfo().enable_float16_;
}

bool InnerContext::IsCpuEnabled() const {
  return this->device_list_.end() !=
         std::find_if(this->device_list_.begin(), this->device_list_.end(),
                      [](const DeviceContext &device) { return device.device_type_ == DT_CPU; });
}
bool InnerContext::IsCpuEnabled() const { return IsUserSetCpu(); }

bool InnerContext::IsGpuEnabled() const {
  return this->device_list_.end() !=
         std::find_if(this->device_list_.begin(), this->device_list_.end(),
                      [](const DeviceContext &device) { return device.device_type_ == DT_GPU; });
#ifdef SUPPORT_GPU
  return IsUserSetGpu();
#else
  return false;
#endif
}

bool InnerContext::IsNpuEnabled() const {
#ifdef SUPPORT_NPU
  MS_ASSERT(npu_manager_ != nullptr);
  return this->device_list_.end() !=
         std::find_if(this->device_list_.begin(), this->device_list_.end(),
                      [](const DeviceContext &device) { return device.device_type_ == DT_NPU; }) &&
         npu_manager_->IsSupportNPU();
  return IsUserSetNpu() && npu_manager_->IsSupportNPU();
#else
  return false;
#endif
}

bool InnerContext::IsUserSetCpu() const {
  return this->device_list_.end() !=
         std::find_if(this->device_list_.begin(), this->device_list_.end(),
                      [](const DeviceContext &device) { return device.device_type_ == DT_CPU; });
}

bool InnerContext::IsUserSetGpu() const {
  return this->device_list_.end() !=
         std::find_if(this->device_list_.begin(), this->device_list_.end(),
                      [](const DeviceContext &device) { return device.device_type_ == DT_GPU; });
}

bool InnerContext::IsUserSetNpu() const {
  return this->device_list_.end() !=
         std::find_if(this->device_list_.begin(), this->device_list_.end(),
                      [](const DeviceContext &device) { return device.device_type_ == DT_NPU; });
}

CpuDeviceInfo InnerContext::GetCpuInfo() const {
  auto iter = std::find_if(this->device_list_.begin(), this->device_list_.end(),
                           [](const DeviceContext &device) { return device.device_type_ == DT_CPU; });
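The refactor above separates two questions that the old IsXpuEnabled() methods conflated: whether the user put a device of that type into device_list_ (the new IsUserSetXpu() helpers) and whether the current build can actually use it. A condensed sketch of the resulting decision table (free-standing illustration; the real checks live in InnerContext exactly as shown in the diff):

// "user_set_*" mirrors IsUserSetCpu/Gpu/Npu (device present in device_list_),
// "build_has_*" stands for the SUPPORT_GPU / SUPPORT_NPU compile-time flags.
struct DeviceEnabled {
  bool cpu;
  bool gpu;
  bool npu;
};

DeviceEnabled ResolveEnabled(bool user_set_cpu, bool user_set_gpu, bool user_set_npu,
                             bool build_has_gpu, bool build_has_npu, bool npu_hw_supported) {
  DeviceEnabled e{};
  e.cpu = user_set_cpu;                                        // IsCpuEnabled()
  e.gpu = build_has_gpu && user_set_gpu;                       // IsGpuEnabled()
  e.npu = build_has_npu && user_set_npu && npu_hw_supported;   // IsNpuEnabled()
  return e;
}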
@@ -58,6 +58,13 @@ struct InnerContext : public Context {

  virtual ~InnerContext();

 private:
  bool IsUserSetCpu() const;

  bool IsUserSetGpu() const;

  bool IsUserSetNpu() const;

#if SUPPORT_NPU

 private:
@@ -44,48 +44,12 @@ int QuantDTypeCastCPUKernel::Init() {
  MS_ASSERT(out_tensor);
  auto param = reinterpret_cast<QuantDTypeCastParameter *>(op_parameter_);
  MS_ASSERT(param);
  if (param->srcT == kNumberTypeFloat32 && param->dstT == kNumberTypeInt8) {
    if (in_tensor->data_type() != kNumberTypeFloat32 || out_tensor->data_type() != kNumberTypeInt8) {
      MS_LOG(ERROR) << "param data type and tensor data type do not match.";
      return RET_ERROR;
    }
  } else if (param->srcT == kNumberTypeInt8 && param->dstT == kNumberTypeFloat32) {
    if (in_tensor->data_type() != kNumberTypeInt8 || out_tensor->data_type() != kNumberTypeFloat32) {
      MS_LOG(ERROR) << "param data type and tensor data type do not match.";
      return RET_ERROR;
    }
  } else if (param->srcT == kNumberTypeUInt8 && param->dstT == kNumberTypeInt8) {
    if (in_tensor->data_type() != kNumberTypeUInt8 || out_tensor->data_type() != kNumberTypeInt8) {
      MS_LOG(ERROR) << "param data type and tensor data type do not match.";
      return RET_ERROR;
    }
  } else if (param->srcT == kNumberTypeInt8 && param->dstT == kNumberTypeInt8) {
    if (in_tensor->data_type() != kNumberTypeInt8 || out_tensor->data_type() != kNumberTypeInt8) {
      MS_LOG(ERROR) << "param data type and tensor data type do not match.";
      return RET_ERROR;
    }
  } else if (param->srcT == kNumberTypeInt8 && param->dstT == kNumberTypeUInt8) {
    if (in_tensor->data_type() != kNumberTypeInt8 || out_tensor->data_type() != kNumberTypeUInt8) {
      MS_LOG(ERROR) << "param data type and tensor data type do not match.";
      return RET_ERROR;
    }
  } else if (param->srcT == kNumberTypeUInt8 && param->dstT == kNumberTypeFloat32) {
    if (in_tensor->data_type() != kNumberTypeUInt8 || out_tensor->data_type() != kNumberTypeFloat32) {
      MS_LOG(ERROR) << "param data type and tensor data type do not match.";
      return RET_ERROR;
    }
  } else if (param->srcT == kNumberTypeFloat32 && param->dstT == kNumberTypeUInt8) {
    if (in_tensor->data_type() != kNumberTypeFloat32 || out_tensor->data_type() != kNumberTypeUInt8) {
      MS_LOG(ERROR) << "param data type and tensor data type do not match.";
      return RET_ERROR;
    }
  } else {
    MS_LOG(ERROR) << "param data type not supported:"
                  << " src: " << param->srcT << " dst: " << param->dstT;
    return RET_PARAM_INVALID;
  }
  src_dtype = param->srcT;
  src_dtype = in_tensor->data_type();
  dst_dtype = param->dstT;
  if (out_tensor->data_type() != dst_dtype) {
    MS_LOG(ERROR) << "param data type and tensor data type do not match.";
    return RET_ERROR;
  }

  if (!InferShapeDone()) {
    return RET_OK;
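The removed if/else ladder checked every legal (srcT, dstT) pair against the tensors. The new code trusts the input tensor for the source type and only validates the output tensor against param->dstT, which is also why srcT_ is marked deprecated in the parameter struct above. A rough sketch of the simplified control flow (types reduced to plain ints; the return values stand in for the lite error codes):

// Sketch of the simplified QuantDTypeCast Init() logic after this change.
int InitSketch(int in_tensor_dtype, int out_tensor_dtype, int param_dstT,
               int *src_dtype, int *dst_dtype) {
  *src_dtype = in_tensor_dtype;  // was taken from param->srcT before
  *dst_dtype = param_dstT;
  if (out_tensor_dtype != *dst_dtype) {
    return -1;  // RET_ERROR: "param data type and tensor data type do not match."
  }
  return 0;  // RET_OK; unsupported combinations are now rejected at run time instead
}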
@@ -149,6 +113,10 @@ int QuantDTypeCastCPUKernel::QuantDTypeCast(int task_id) {
      ret = DoQuantizeFp32ToInt8(float32_ptr_ + thread_offset, int8_out_ptr_ + thread_offset, output_quant_arg.scale,
                                 output_quant_arg.zeroPoint, num_unit_thread, from_uint8_src);
    }
  } else {
    MS_LOG(ERROR) << "param data type not supported:"
                  << " src: " << src_dtype << " dst: " << dst_dtype;
    return RET_PARAM_INVALID;
  }

  if (ret != RET_OK) {
@@ -47,7 +47,7 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
    MS_LOG(ERROR) << "get execute filter data failed.";
    return ret;
  }
  PackNCHWToNHWCFp16(fp16_weight_, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
  PackNCHWToNHWCFp16(execute_weight_, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
                     weight_tensor->Batch());
  if (fp16_weight_ != nullptr) {
    free(fp16_weight_);

@@ -64,7 +64,7 @@ int ConvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
  if (in_tensors_.size() == kInputSize2) {
    auto bias_tensor = in_tensors_.at(kBiasIndex);
    MS_ASSERT(origin_bias_);
    auto ori_bias = reinterpret_cast<float *>(origin_bias_);
    auto ori_bias = reinterpret_cast<float16_t *>(origin_bias_);
    for (int i = 0; i < bias_tensor->ElementsNum(); i++) {
      bias_fp16[i] = (float16_t)ori_bias[i];
    }
@@ -68,7 +68,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::InitWeightBias() {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  PackNCHWFp32ToNC8HW8Fp16(reinterpret_cast<float *>(origin_weight_), packed_weight_, 1,
  PackNCHWFp16ToNC8HW8Fp16(reinterpret_cast<float16_t *>(origin_weight_), packed_weight_, 1,
                           weight_tensor->Height() * weight_tensor->Width(), weight_tensor->Batch());

  bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t)));

@@ -81,7 +81,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::InitWeightBias() {
  if (in_tensors_.size() == kInputSize2) {
    auto bias_tensor = in_tensors_.at(kBiasIndex);
    MS_ASSERT(origin_bias_);
    auto ori_bias = reinterpret_cast<float *>(origin_bias_);
    auto ori_bias = reinterpret_cast<float16_t *>(origin_bias_);
    for (int i = 0; i < bias_tensor->ElementsNum(); i++) {
      bias_fp16[i] = (float16_t)ori_bias[i];
    }
@@ -73,7 +73,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
  // init weight: o, h, w, i; o == group, i == 1
  auto weight_tensor = in_tensors_.at(kWeightIndex);
  int OC8 = UP_DIV(weight_tensor->Batch(), C8NUM);
  auto origin_weight = reinterpret_cast<float *>(weight_tensor->MutableData());
  auto origin_weight = reinterpret_cast<float16_t *>(weight_tensor->MutableData());
  int pack_weight_size = C8NUM * OC8 * weight_tensor->Height() * weight_tensor->Width();

  packed_weight_ = reinterpret_cast<float16_t *>(malloc(pack_weight_size * sizeof(float16_t)));

@@ -81,7 +81,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
    MS_LOG(ERROR) << "Malloc buffer failed.";
    return RET_ERROR;
  }
  PackNCHWFp32ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
  PackNCHWFp16ToNC8HW8Fp16(origin_weight, packed_weight_, 1, weight_tensor->Height() * weight_tensor->Width(),
                           weight_tensor->Batch());

  bias_data_ = reinterpret_cast<float16_t *>(malloc(C8NUM * OC8 * sizeof(float16_t)));

@@ -92,9 +92,9 @@ int DeconvolutionDepthwiseFp16CPUKernel::InitWeightBias() {
  memset(bias_data_, 0, C8NUM * OC8 * sizeof(float16_t));
  if (in_tensors_.size() == kInputSize2) {
    auto bias_tensor = in_tensors_.at(kBiasIndex);
    auto ori_bias = reinterpret_cast<float *>(bias_tensor->MutableData());
    auto ori_bias = reinterpret_cast<float16_t *>(bias_tensor->MutableData());
    for (int i = 0; i < bias_tensor->ElementsNum(); i++) {
      reinterpret_cast<float *>(bias_data_)[i] = (float16_t)ori_bias[i];
      reinterpret_cast<float16_t *>(bias_data_)[i] = ori_bias[i];
    }
  }
@@ -57,7 +57,8 @@ int DeConvolutionFp16CPUKernel::InitWeightBias() {
  auto kernel_h = weight_tensor->Height();
  auto kernel_w = weight_tensor->Width();

  bias_data_ = malloc(UP_ROUND(output_channel, C4NUM) * sizeof(float16_t));
  auto bias_size = UP_ROUND(output_channel, C4NUM) * sizeof(float16_t);
  bias_data_ = malloc(bias_size);
  if (bias_data_ == nullptr) {
    MS_LOG(ERROR) << "deconv malloc bias_data_ error!";
    return RET_ERROR;

@@ -65,8 +66,15 @@ int DeConvolutionFp16CPUKernel::InitWeightBias() {
  memset(bias_data_, 0, UP_ROUND(output_channel, C4NUM) * sizeof(float16_t));
  if (in_tensors_.size() == 3 && in_tensors_.at(kBiasIndex)->shape().size() == 1 &&
      in_tensors_.at(kBiasIndex)->DimensionSize(0) == output_channel) {
    Float32ToFloat16(reinterpret_cast<float *>(in_tensors_.at(2)->MutableData()),
                     reinterpret_cast<float16_t *>(bias_data_), output_channel);
    if (in_tensors_.at(2)->data_type() != kNumberTypeFloat16) {
      MS_LOG(ERROR) << "deconv fp16 kernel require fp16 bias";
      return RET_ERROR;
    }
    if (bias_size != in_tensors_.at(2)->Size()) {
      MS_LOG(ERROR) << "input bias size not match : " << bias_size << " vs " << in_tensors_.at(2)->Size();
      return RET_ERROR;
    }
    memcpy(bias_data_, in_tensors_.at(2)->data_c(), bias_size);
  }

  size_t weight_pack_size = input_channel * kernel_w * kernel_h * UP_ROUND(output_channel, C8NUM) * sizeof(float16_t);

@@ -76,7 +84,11 @@ int DeConvolutionFp16CPUKernel::InitWeightBias() {
    return RET_ERROR;
  }
  memset(execute_weight_, 0, weight_pack_size);
  PackNHWCFp32ToC8HWN8Fp16(reinterpret_cast<float *>(in_tensors_.at(1)->MutableData()), execute_weight_, input_channel,
  if (in_tensors_.at(1)->data_type() != kNumberTypeFloat16) {
    MS_LOG(ERROR) << "deconv fp16 kernel require fp16 weight";
    return RET_ERROR;
  }
  PackNHWCFp16ToC8HWN8Fp16(reinterpret_cast<float16_t *>(in_tensors_.at(1)->data_c()), execute_weight_, input_channel,
                           kernel_w * kernel_h, output_channel);
  return RET_OK;
}
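With these guards the fp16 deconv kernel no longer converts fp32 weights and bias at init time: it requires the tensors to already be fp16 and packs or copies them directly, which avoids keeping an extra fp32 copy of the constants in memory (the runtime-RAM saving the commit title refers to). The expected bias buffer size used in the check above works out as follows (a small sketch, assuming C4NUM == 4 and 2-byte float16):

#include <cstddef>

// Expected byte size of the fp16 deconv bias buffer, mirroring
// UP_ROUND(output_channel, C4NUM) * sizeof(float16_t) from the diff.
inline size_t DeconvFp16BiasBytes(size_t output_channel) {
  const size_t kC4Num = 4;
  const size_t kFp16Bytes = 2;
  const size_t rounded = (output_channel + kC4Num - 1) / kC4Num * kC4Num;  // UP_ROUND
  return rounded * kFp16Bytes;
}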
@@ -341,7 +341,7 @@ int DeConvWinogradFp16CPUKernel::InitDataParam() {
  auto fp16_bias_data = reinterpret_cast<float16_t *>(bias_data_);
  if (in_tensors_.size() == 3 && in_tensors_.at(kBiasIndex)->shape().size() == 1 &&
      in_tensors_.at(kBiasIndex)->DimensionSize(0) == conv_param_->output_channel_) {
    auto src_bias = reinterpret_cast<float *>(in_tensors_.at(kBiasIndex)->MutableData());
    auto src_bias = reinterpret_cast<float16_t *>(in_tensors_.at(kBiasIndex)->MutableData());
    MS_ASSERT(src_bias);
    for (int i = 0; i < conv_param_->output_channel_; ++i) {
      fp16_bias_data[i] = (float16_t)src_bias[i];
@@ -250,6 +250,9 @@ int CopyConstTensor(Tensor *tensor, std::map<Tensor *, Tensor *> *restored_origi
    return RET_ERROR;
#endif
  } else {
    if (tensor->own_data()) {
      return RET_OK;
    }
    tensor->set_data(nullptr);
    auto ret = tensor->MallocData();
    if (RET_OK != ret) {

@@ -264,8 +267,18 @@ int CopyConstTensor(Tensor *tensor, std::map<Tensor *, Tensor *> *restored_origi
}
#endif

inline void RestoreTensorData(const std::map<Tensor *, Tensor *> &restored_origin_tensors) {
  for (auto &restored_origin_tensor : restored_origin_tensors) {
inline void FreeRestoreTensors(std::map<Tensor *, Tensor *> *restored_origin_tensors) {
  MS_ASSERT(restored_origin_tensors != nullptr);
  for (auto &restored_origin_tensor : *restored_origin_tensors) {
    restored_origin_tensor.second->set_data(nullptr);
    delete (restored_origin_tensor.second);
  }
  restored_origin_tensors->clear();
}

inline void RestoreTensorData(std::map<Tensor *, Tensor *> *restored_origin_tensors) {
  MS_ASSERT(restored_origin_tensors != nullptr);
  for (auto &restored_origin_tensor : *restored_origin_tensors) {
    auto *origin_tensor = restored_origin_tensor.first;
    auto *restored_tensor = restored_origin_tensor.second;
    MS_ASSERT(origin_tensor != nullptr);

@@ -275,15 +288,7 @@ inline void RestoreTensorData(const std::map<Tensor *, Tensor *> &restored_origi
    origin_tensor->set_data(restored_tensor->data_c());
    origin_tensor->set_own_data(restored_tensor->own_data());
  }
}

inline void FreeRestoreTensors(std::map<Tensor *, Tensor *> *restored_origin_tensors) {
  MS_ASSERT(restored_origin_tensors != nullptr);
  for (auto &restored_origin_tensor : *restored_origin_tensors) {
    restored_origin_tensor.second->set_data(nullptr);
    delete (restored_origin_tensor.second);
  }
  restored_origin_tensors->clear();
  FreeRestoreTensors(restored_origin_tensors);
}

inline bool IsChannelFirst(int index, OpParameter *op_parameter) {
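FreeRestoreTensors() is now defined first and RestoreTensorData() takes a pointer and ends by delegating to it, so the temporary tensors saved in the map are always released through the same path. A toy stand-in that mirrors the contract (the real Tensor type lives in mindspore::lite; this only illustrates restore-then-free versus free-only):

#include <map>

struct ToyTensor {
  void *data = nullptr;
  bool own_data = false;
};

// Free the saved temporaries without touching the origin tensors
// (mirrors set_data(nullptr) + delete in the diff; assumes heap-allocated temporaries).
void FreeRestoreTensorsSketch(std::map<ToyTensor *, ToyTensor *> *restored) {
  for (auto &kv : *restored) {
    kv.second->data = nullptr;
    delete kv.second;
  }
  restored->clear();
}

// Put the saved buffers back into the origin tensors, then free the temporaries.
void RestoreTensorDataSketch(std::map<ToyTensor *, ToyTensor *> *restored) {
  for (auto &kv : *restored) {
    kv.first->data = kv.second->data;
    kv.first->own_data = kv.second->own_data;
  }
  FreeRestoreTensorsSketch(restored);
}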
@@ -308,54 +313,54 @@ kernel::LiteKernel *Scheduler::FindCpuKernel(const std::vector<Tensor *> &in_ten
  if (!KernelRegistry::GetInstance()->SupportKernel(desc)) {
    return nullptr;
  }
  kernel::KernelKey cpu_desc = desc;
  if (kernel_data_type == kNumberTypeFloat16) {
    if (!context_->IsCpuFloat16Enabled() ||
        (cpu_desc.data_type != kNumberTypeFloat32 && cpu_desc.data_type != kNumberTypeFloat16)) {
      return nullptr;
    }
    cpu_desc.data_type = kNumberTypeFloat16;
  }
  std::map<Tensor *, Tensor *> restored_origin_tensors;
  int index = 0;
  for (auto &tensor : in_tensors) {
    auto channel_first = IsChannelFirst(index++, op_parameter);
    auto *restore_tensor = DequantUtil::DequantTensor(tensor, desc.data_type, channel_first, kernel_data_type);
    auto *restore_tensor = DequantUtil::DequantTensor(tensor, cpu_desc.data_type, channel_first, kernel_data_type);
    if (restore_tensor != nullptr) {
      restored_origin_tensors[tensor] = restore_tensor;
    } else {
#ifndef SUPPORT_TRAIN
      if (!IsPackedOp(op_type) && !tensor->own_data()) { // && op_type != schema::PrimitiveType_LSTM
        auto ret = CopyConstTensor(tensor, &restored_origin_tensors, kernel_data_type);
        if (ret != RET_OK) {
          MS_LOG(DEBUG) << "CopyConstTensor failed: " << ret;
          return nullptr;
        }
      auto ret = CopyConstTensor(tensor, &restored_origin_tensors, kernel_data_type);
      if (ret != RET_OK) {
        MS_LOG(DEBUG) << "CopyConstTensor failed: " << ret;
        return nullptr;
      }
#endif
    }
  }
  auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, desc, op_parameter);
  auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, cpu_desc, op_parameter);
  if (kernel != nullptr) {
    MS_LOG(DEBUG) << "Get TypeId(" << kernel_data_type << ") op success: " << PrimitiveTypeName(op_type);
    MS_LOG(DEBUG) << "Get TypeId(" << kernel_data_type << ") op success: " << PrimitiveCurVersionTypeName(op_type);
    FreeRestoreTensors(&restored_origin_tensors);
  } else {
    RestoreTensorData(restored_origin_tensors);
    RestoreTensorData(&restored_origin_tensors);
  }
  return kernel;
}
} // namespace mindspore::lite

kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in_tensors,
                                                 const std::vector<Tensor *> &out_tensors, const Model::Node *node,
                                                 TypeId prefer_data_type) {
  MS_ASSERT(node != nullptr);
  bool need_dequant = node->quant_type_ == schema::QuantType_WeightQuant;
  TypeId data_type = need_dequant ? kNumberTypeFloat32 : GetFirstFp32Fp16OrInt8Type(in_tensors);
  OpParameter *op_parameter = op_parameters_[node->output_indices_.at(0)];
  if (op_parameter == nullptr) {
    MS_LOG(ERROR) << "Can not find OpParameter!type: " << PrimitiveTypeName(GetPrimitiveType(node->primitive_));
    return nullptr;
  }
  bool infer_shape_interrupt = !op_parameter->infer_flag_;
  kernel::KernelKey desc{kCPU, data_type, static_cast<schema::PrimitiveType>(op_parameter->type_)};
#if SUPPORT_GPU
kernel::LiteKernel *Scheduler::FindGpuKernel(const std::vector<Tensor *> &in_tensors,
                                             const std::vector<Tensor *> &out_tensors, OpParameter *op_parameter,
                                             const kernel::KernelKey &desc) {
  MS_ASSERT(op_parameter != nullptr);
  if (context_->IsGpuEnabled()) {
    // support more data type like int32
    kernel::KernelKey gpu_desc{kGPU, kNumberTypeFloat32, desc.type};
    if (context_->IsGpuFloat16Enabled()) gpu_desc.data_type = kNumberTypeFloat16;
    if (in_tensors.front()->data_type() == kNumberTypeInt8) gpu_desc.data_type = kNumberTypeInt8;
    if (context_->IsGpuFloat16Enabled()) {
      gpu_desc.data_type = kNumberTypeFloat16;
    }
    if (in_tensors.front()->data_type() == kNumberTypeInt8) {
      gpu_desc.data_type = kNumberTypeInt8;
    }

    // weight quant
    std::map<Tensor *, Tensor *> restored_origin_tensors;
@@ -370,36 +375,32 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in

    auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, gpu_desc, op_parameter);
    if (kernel != nullptr) {
      MS_LOG(DEBUG) << "Get gpu op success: " << PrimitiveCurVersionTypeName(gpu_desc.type) << " " << node->name_;
      MS_LOG(DEBUG) << "Get gpu op success: " << PrimitiveCurVersionTypeName(gpu_desc.type);
      FreeRestoreTensors(&restored_origin_tensors);
      return kernel;
    } else {
      MS_LOG(DEBUG) << "Get gpu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(gpu_desc.type) << " "
                    << node->name_;
      auto ret = InferNodeShape(node, &infer_shape_interrupt);
      if (ret == RET_INFER_INVALID || ret == RET_OK) {
        op_parameter = op_parameters_[node->output_indices_.at(0)];
      } else {
        RestoreTensorData(restored_origin_tensors);
        MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_;
        return nullptr;
      }
      MS_LOG(DEBUG) << "Get gpu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(gpu_desc.type);
      RestoreTensorData(&restored_origin_tensors);
    }
    return kernel;
  } else {
    return nullptr;
  }
#endif
#if SUPPORT_NPU
}

kernel::LiteKernel *Scheduler::FindNpuKernel(const std::vector<Tensor *> &in_tensors,
                                             const std::vector<Tensor *> &out_tensors, OpParameter *op_parameter,
                                             const kernel::KernelKey &desc) {
  MS_ASSERT(op_parameter != nullptr);
  kernel::KernelKey npu_desc{kNPU, desc.data_type, desc.type};
  if (context_->IsNpuEnabled()) {
    if (desc.data_type == kNumberTypeFloat16) {
      desc.data_type = kNumberTypeFloat32;
    if (npu_desc.data_type == kNumberTypeFloat16) {
      npu_desc.data_type = kNumberTypeFloat32;
    }
    for (auto tensor : in_tensors) {
      if (tensor->data_type() == kNumberTypeFloat16) {
        tensor->set_data_type(kNumberTypeFloat32);
      }
    }
    kernel::KernelKey npu_desc{kNPU, desc.data_type, desc.type};

    // weight quant
    std::map<Tensor *, Tensor *> restored_origin_tensors;
    for (auto &tensor : in_tensors) {
      int index = 0;
@@ -411,33 +412,72 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in
    }
    auto *kernel = KernelRegistry::GetInstance()->GetKernel(in_tensors, out_tensors, context_, npu_desc, op_parameter);
    if (kernel != nullptr) {
      MS_LOG(DEBUG) << "Get npu op success: " << PrimitiveCurVersionTypeName(npu_desc.type) << " " << node->name_;
      FreeRestoreTensors(&restored_origin_tensors);
      return kernel;
      MS_LOG(DEBUG) << "Get npu op success: " << PrimitiveCurVersionTypeName(npu_desc.type);
    } else {
      MS_LOG(DEBUG) << "Get npu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(npu_desc.type) << " "
                    << node->name_;
      RestoreTensorData(restored_origin_tensors);
      auto ret = InferNodeShape(node, &infer_shape_interrupt);
      if (ret == RET_INFER_INVALID || ret == RET_OK) {
        op_parameter = op_parameters_[node->output_indices_.at(0)];
      } else {
        MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_;
        return nullptr;
      }
      RestoreTensorData(&restored_origin_tensors);
      MS_LOG(DEBUG) << "Get npu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(npu_desc.type);
    }
    return kernel;
  } else {
    return nullptr;
  }
}

kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in_tensors,
                                                 const std::vector<Tensor *> &out_tensors, const Model::Node *node,
                                                 TypeId prefer_data_type) {
  MS_ASSERT(node != nullptr);
  // why we need this
  TypeId data_type =
    (node->quant_type_ == schema::QuantType_WeightQuant) ? kNumberTypeFloat32 : GetFirstFp32Fp16OrInt8Type(in_tensors);
  OpParameter *op_parameter = op_parameters_[node->output_indices_.at(0)];
  if (op_parameter == nullptr) {
    MS_LOG(ERROR) << "Can not find OpParameter!type: " << PrimitiveTypeName(GetPrimitiveType(node->primitive_));
    return nullptr;
  }
  bool infer_shape_interrupt = !op_parameter->infer_flag_;
  kernel::KernelKey desc{kCPU, data_type, static_cast<schema::PrimitiveType>(op_parameter->type_)};
  kernel::LiteKernel *kernel = nullptr;
#ifdef SUPPORT_GPU
  kernel = FindGpuKernel(in_tensors, out_tensors, op_parameter, desc);
  if (kernel != nullptr) {
    return kernel;
  } else {
    MS_LOG(DEBUG) << "Get gpu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(desc.type) << " "
                  << node->name_;
    auto ret = InferNodeShape(node, &infer_shape_interrupt);
    if (ret == RET_INFER_INVALID || ret == RET_OK) {
      op_parameter = op_parameters_[node->output_indices_.at(0)];
    } else {
      MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_;
      return nullptr;
    }
  }
#endif
  if ((prefer_data_type == kNumberTypeFloat16 || prefer_data_type == kTypeUnknown) &&
      mindspore::lite::IsSupportFloat16() &&
      ((context_->IsCpuFloat16Enabled() && data_type == kNumberTypeFloat32) || data_type == kNumberTypeFloat16)) {
    kernel::KernelKey fp16_cpu_desc{desc.arch, kNumberTypeFloat16, desc.type};
    auto kernel = FindCpuKernel(in_tensors, out_tensors, op_parameter, fp16_cpu_desc, kNumberTypeFloat16);
#ifdef SUPPORT_NPU
  kernel = FindNpuKernel(in_tensors, out_tensors, op_parameter, desc);
  if (kernel != nullptr) {
    return kernel;
  } else {
    MS_LOG(DEBUG) << "Get npu op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(desc.type) << " "
                  << node->name_;
    auto ret = InferNodeShape(node, &infer_shape_interrupt);
    if (ret == RET_INFER_INVALID || ret == RET_OK) {
      op_parameter = op_parameters_[node->output_indices_.at(0)];
    } else {
      MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_;
      return nullptr;
    }
  }
#endif
  if (prefer_data_type == kNumberTypeFloat16 || prefer_data_type == kTypeUnknown) {
    kernel = FindCpuKernel(in_tensors, out_tensors, op_parameter, desc, kNumberTypeFloat16);
    if (kernel != nullptr) {
      return kernel;
    } else {
      MS_LOG(DEBUG) << "Get fp16 op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(fp16_cpu_desc.type)
                    << " " << node->name_;
      MS_LOG(DEBUG) << "Get fp16 op failed, scheduler to cpu: " << PrimitiveCurVersionTypeName(desc.type) << " "
                    << node->name_;
      auto ret = InferNodeShape(node, &infer_shape_interrupt);
      if (ret == RET_INFER_INVALID || ret == RET_OK) {
        op_parameter = op_parameters_[node->output_indices_.at(0)];
@@ -452,20 +492,18 @@ kernel::LiteKernel *Scheduler::FindBackendKernel(const std::vector<Tensor *> &in
    desc.data_type = kNumberTypeFloat32;
  }
  if (prefer_data_type == kNumberTypeFloat32 || prefer_data_type == kTypeUnknown) {
    auto kernel = FindCpuKernel(in_tensors, out_tensors, op_parameter, desc, kNumberTypeFloat32);
    kernel = FindCpuKernel(in_tensors, out_tensors, op_parameter, desc, kNumberTypeFloat32);
    if (kernel != nullptr) {
      return kernel;
    } else {
      auto ret = InferNodeShape(node, &infer_shape_interrupt);
      if (!(ret == RET_INFER_INVALID || ret == RET_OK)) {
        MS_LOG(ERROR)
          << "Try repeat infer fail: " << node->name_;
        MS_LOG(ERROR) << "Try repeat infer fail: " << node->name_;
      }
    }
  }
  return nullptr;
} // namespace mindspore::lite
}

kernel::LiteKernel *Scheduler::SchedulePartialToKernel(const lite::Model::Node *src_node) {
  MS_ASSERT(src_model_ != nullptr);
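FindBackendKernel() is now a thin dispatcher over the new FindGpuKernel/FindNpuKernel/FindCpuKernel helpers declared in the scheduler header below: GPU is tried first (SUPPORT_GPU builds only), then NPU (SUPPORT_NPU builds only), then a CPU fp16 kernel and finally CPU fp32, with each failed probe re-running shape inference before falling through. A toy sketch of that first-match-wins loop (the real code passes tensors, OpParameter and KernelKey descriptors rather than closures):

#include <functional>
#include <vector>

using Finder = std::function<void *()>;

// First backend finder that returns a kernel wins; on failure the real code
// re-runs InferNodeShape() and refreshes op_parameter before the next attempt.
void *FindBackendKernelSketch(const std::vector<Finder> &finders) {
  for (const auto &find : finders) {
    if (void *kernel = find()) {
      return kernel;
    }
  }
  return nullptr;  // no backend could serve this node
}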
@@ -61,6 +61,10 @@ class Scheduler {
                                       TypeId prefer_data_type = kTypeUnknown);
  kernel::LiteKernel *FindCpuKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                                    OpParameter *op_parameter, const kernel::KernelKey &desc, TypeId kernel_data_type);
  kernel::LiteKernel *FindGpuKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                                    OpParameter *op_parameter, const kernel::KernelKey &desc);
  kernel::LiteKernel *FindNpuKernel(const std::vector<Tensor *> &in_tensors, const std::vector<Tensor *> &out_tensors,
                                    OpParameter *op_parameter, const kernel::KernelKey &desc);
  // schedule a partial node to a subgraph_kernel
  kernel::LiteKernel *SchedulePartialToKernel(const lite::Model::Node *src_node);
  // schedule a node to a kernel
@@ -412,9 +412,7 @@ int Benchmark::MarkPerformance() {
  for (int i = 0; i < flags_->loop_count_; i++) {
    session_->BindThread(true);
    auto start = GetTimeUs();
    auto status = (flags_->time_profiling_ || flags_->perf_profiling_)
                    ? session_->RunGraph(before_call_back_, after_call_back_)
                    : session_->RunGraph();
    auto status = session_->RunGraph(before_call_back_, after_call_back_);
    if (status != 0) {
      MS_LOG(ERROR) << "Inference error " << status;
      std::cerr << "Inference error " << status;

@@ -479,7 +477,7 @@ int Benchmark::MarkAccuracy() {
    std::cerr << "PrintInputData error " << status << std::endl;
    return status;
  }
  status = session_->RunGraph();
  status = session_->RunGraph(before_call_back_, after_call_back_);
  if (status != RET_OK) {
    MS_LOG(ERROR) << "Inference error " << status;
    std::cerr << "Inference error " << status << std::endl;

@@ -615,7 +613,9 @@ int Benchmark::RunBenchmark() {
      return ret;
    }
  }
  if (model != nullptr) model->Free();
  if (model != nullptr) {
    model->Free();
  }

  ms_inputs_ = session_->GetInputs();
  auto end_prepare_time = GetTimeUs();
@@ -689,18 +689,18 @@ int Benchmark::InitTimeProfilingCallbackParameter() {
  // before callback
  before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
                          const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
                          const CallBackParam &callParam) {
                          const CallBackParam &call_param) {
    if (before_inputs.empty()) {
      MS_LOG(INFO) << "The num of beforeInputs is empty";
    }
    if (before_outputs.empty()) {
      MS_LOG(INFO) << "The num of beforeOutputs is empty";
    }
    if (op_times_by_type_.find(callParam.node_type) == op_times_by_type_.end()) {
      op_times_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, 0.0f)));
    if (op_times_by_type_.find(call_param.node_type) == op_times_by_type_.end()) {
      op_times_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, 0.0f)));
    }
    if (op_times_by_name_.find(callParam.node_name) == op_times_by_name_.end()) {
      op_times_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, 0.0f)));
    if (op_times_by_name_.find(call_param.node_name) == op_times_by_name_.end()) {
      op_times_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, 0.0f)));
    }

    op_call_times_total_++;

@@ -735,6 +735,7 @@
  };
  return RET_OK;
}

int Benchmark::InitPerfProfilingCallbackParameter() {
#ifndef ENABLE_ARM64
  MS_LOG(ERROR) << "Only support perf_profiling on arm64.";
@@ -781,18 +782,18 @@ int Benchmark::InitPerfProfilingCallbackParameter() {
  // before callback
  before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
                          const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
                          const CallBackParam &callParam) {
                          const CallBackParam &call_param) {
    if (before_inputs.empty()) {
      MS_LOG(INFO) << "The num of beforeInputs is empty";
    }
    if (before_outputs.empty()) {
      MS_LOG(INFO) << "The num of beforeOutputs is empty";
    }
    if (op_perf_by_type_.find(callParam.node_type) == op_perf_by_type_.end()) {
      op_perf_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, zero)));
    if (op_perf_by_type_.find(call_param.node_type) == op_perf_by_type_.end()) {
      op_perf_by_type_.insert(std::make_pair(call_param.node_type, std::make_pair(0, zero)));
    }
    if (op_perf_by_name_.find(callParam.node_name) == op_perf_by_name_.end()) {
      op_perf_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, zero)));
    if (op_perf_by_name_.find(call_param.node_name) == op_perf_by_name_.end()) {
      op_perf_by_name_.insert(std::make_pair(call_param.node_name, std::make_pair(0, zero)));
    }

    op_call_times_total_++;
@@ -831,12 +832,89 @@
  return RET_OK;
}

namespace {
template <typename T>
std::string DataToString(void *data, size_t data_number) {
  if (data == nullptr) {
    return "Data of tensor is nullptr";
  }
  std::ostringstream oss;
  auto casted_data = static_cast<T *>(data);
  for (size_t i = 0; i < 40 && i < data_number; i++) {
    oss << " " << casted_data[i];
  }
  return oss.str();
}

std::string DumpMSTensor(tensor::MSTensor *tensor) {
  if (tensor == nullptr) {
    return "Tensor is nullptr";
  }
  std::ostringstream oss;
  oss << " DataType: " << tensor->data_type();
  oss << " Shape:";
  for (auto &dim : tensor->shape()) {
    oss << " " << dim;
  }
  oss << std::endl << "Data:";
  switch (tensor->data_type()) {
    case kNumberTypeFloat32: {
      oss << DataToString<float>(tensor->data(), tensor->ElementsNum());
    } break;
    case kNumberTypeFloat16: {
      oss << DataToString<int16_t>(tensor->data(), tensor->ElementsNum());
    } break;
    case kNumberTypeInt32: {
      oss << DataToString<int32_t>(tensor->data(), tensor->ElementsNum());
    } break;
    case kNumberTypeInt16: {
      oss << DataToString<int16_t>(tensor->data(), tensor->ElementsNum());
    } break;
    case kNumberTypeInt8: {
      oss << DataToString<int8_t>(tensor->data(), tensor->ElementsNum());
    } break;
    default:
      oss << "Unsupported data type to print";
      break;
  }
  return oss.str();
}
} // namespace

int Benchmark::InitDumpProfilingCallbackParameter() {
  // before callback
  before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
                          const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
                          const CallBackParam &call_param) { return true; };

  // after callback
  after_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
                         const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
                         const CallBackParam &call_param) {
    std::cout << "================================================================" << std::endl;
    std::cout << call_param.node_name << " inputs : " << std::endl;
    for (auto ms_tensor : after_inputs) {
      std::cout << DumpMSTensor(ms_tensor) << std::endl;
    }
    std::cout << "----------------------------------------------------------------" << std::endl;
    std::cout << call_param.node_name << " outputs : " << std::endl;
    for (const auto ms_tensor : after_outputs) {
      std::cout << DumpMSTensor(ms_tensor) << std::endl;
    }
    std::cout << "================================================================" << std::endl;
    return true;
  };
  return RET_OK;
}

int Benchmark::InitCallbackParameter() {
  int ret = RET_OK;
  if (flags_->time_profiling_) {
    ret = InitTimeProfilingCallbackParameter();
  } else if (flags_->perf_profiling_) {
    ret = InitPerfProfilingCallbackParameter();
  } else if (flags_->dump_profiling_) {
    ret = InitDumpProfilingCallbackParameter();
  }
  return ret;
}
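The new dump callback prints every node's input and output tensors after execution via DumpMSTensor, truncating each tensor to its first 40 elements. Note that kNumberTypeFloat16 data goes through DataToString<int16_t>, so fp16 tensors are dumped as raw 16-bit patterns rather than decoded numbers; if decoded values were wanted, a converter along these lines could be plugged in (hypothetical helper, not part of this commit):

#include <cstdint>
#include <cstring>

// Hypothetical: decode an IEEE 754 binary16 bit pattern into a float.
float HalfBitsToFloat(uint16_t h) {
  const uint32_t sign = static_cast<uint32_t>(h & 0x8000u) << 16;
  const uint32_t exp = (h >> 10) & 0x1Fu;
  uint32_t mant = h & 0x3FFu;
  uint32_t bits;
  if (exp == 0x1Fu) {                        // inf / NaN
    bits = sign | 0x7F800000u | (mant << 13);
  } else if (exp != 0) {                     // normal number
    bits = sign | ((exp + 112u) << 23) | (mant << 13);
  } else if (mant == 0) {                    // signed zero
    bits = sign;
  } else {                                   // subnormal: renormalize
    uint32_t shift = 0;
    while ((mant & 0x400u) == 0) {
      mant <<= 1;
      ++shift;
    }
    mant &= 0x3FFu;
    bits = sign | ((113u - shift) << 23) | (mant << 13);
  }
  float out;
  std::memcpy(&out, &bits, sizeof(out));
  return out;
}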
@@ -917,16 +995,14 @@ int Benchmark::Init() {
    return RET_ERROR;
  }

  if (flags_->time_profiling_ || flags_->perf_profiling_) {
    if (flags_->time_profiling_ && flags_->perf_profiling_) {
      MS_LOG(INFO) << "time_profiling is enabled, will not run perf_profiling.";
    }
    auto status = InitCallbackParameter();
    if (status != RET_OK) {
      MS_LOG(ERROR) << "Init callback Parameter failed.";
      std::cerr << "Init callback Parameter failed." << std::endl;
      return RET_ERROR;
    }
  if (flags_->time_profiling_ && flags_->perf_profiling_) {
    MS_LOG(INFO) << "time_profiling is enabled, will not run perf_profiling.";
  }
  auto status = InitCallbackParameter();
  if (status != RET_OK) {
    MS_LOG(ERROR) << "Init callback Parameter failed.";
    std::cerr << "Init callback Parameter failed." << std::endl;
    return RET_ERROR;
  }

  return RET_OK;
@@ -113,9 +113,6 @@ class MS_API BenchmarkFlags : public virtual FlagParser {
  int num_threads_ = 2;
  bool enable_fp16_ = false;
  int warm_up_loop_count_ = 3;
  bool time_profiling_ = false;
  bool perf_profiling_ = false;
  std::string perf_event_ = "CYCLE";
  // MarkAccuracy
  std::string benchmark_data_file_;
  std::string benchmark_data_type_ = "FLOAT";

@@ -125,6 +122,10 @@ class MS_API BenchmarkFlags : public virtual FlagParser {
  std::vector<std::vector<int>> resize_dims_;

  std::string device_ = "CPU";
  bool time_profiling_ = false;
  bool perf_profiling_ = false;
  std::string perf_event_ = "CYCLE";
  bool dump_profiling_ = false;
};

class MS_API Benchmark {
@@ -163,9 +164,13 @@ class MS_API Benchmark {
                      int *total_size);

  int InitCallbackParameter();

  int InitTimeProfilingCallbackParameter();

  int InitPerfProfilingCallbackParameter();

  int InitDumpProfilingCallbackParameter();

  int PrintResult(const std::vector<std::string> &title, const std::map<std::string, std::pair<int, float>> &result);

#ifdef ENABLE_ARM64

@@ -289,8 +294,8 @@ class MS_API Benchmark {
  std::map<std::string, std::pair<int, struct PerfCount>> op_perf_by_type_;
  std::map<std::string, std::pair<int, struct PerfCount>> op_perf_by_name_;
#endif
  KernelCallBack before_call_back_;
  KernelCallBack after_call_back_;
  KernelCallBack before_call_back_ = nullptr;
  KernelCallBack after_call_back_ = nullptr;
  std::mt19937 random_engine_;
};
@@ -193,7 +193,7 @@ STATUS InferShapePass::GetCNodeInputTensors(const CNodePtr &cnode, std::vector<l
    tensor::TensorPtr tensor_info;
    auto status = GetTensorInfoFromAbstract(&tensor_info, cnode, i);
    if (status != RET_OK) {
      MS_LOG(ERROR) << "get tensor info failed.";
      MS_LOG(DEBUG) << "get tensor info failed.";
      return RET_ERROR;
    }
    std::unique_ptr<lite::Tensor> tensor = nullptr;