!19987 [MS][LITE]fix bug and add new api benchmark

Merge pull request !19987 from 张学同/api13
This commit is contained in:
i-robot 2021-07-12 01:36:58 +00:00 committed by Gitee
commit ec2ec77666
13 changed files with 1986 additions and 929 deletions

View File

@ -41,79 +41,9 @@ CreateTrainSessionProto *CreateTrainSessionCallbackHolder(CreateTrainSessionProt
return proto_;
}
lite::CpuBindMode ModelImpl::GetCpuBindMode() {
  // Map the public-API thread-affinity mode onto the lite runtime's bind
  // mode. Any unrecognized value falls back to NO_BIND.
  const auto affinity_mode = context_->GetThreadAffinityMode();
  if (affinity_mode == 1) {
    return lite::HIGHER_CPU;
  }
  if (affinity_mode == 2) {
    return lite::MID_CPU;
  }
  return lite::NO_BIND;
}
Status ModelImpl::ConverterContext(const std::shared_ptr<Context> &context, lite::Context *model_context) {
// Translates the public C++ API Context into the internal lite::Context:
// copies the threading settings, then builds the device list — CPU is
// mandatory and first, with an optional GPU or NPU as the second entry.
auto device_list = context->MutableDeviceInfo();
if (device_list.size() == 0) {
MS_LOG(ERROR) << "Invalid device list.";
return kLiteInputParamInvalid;
}
// At most two devices are supported: CPU alone, or CPU + (GPU | NPU).
if (device_list.size() > 2) {
MS_LOG(ERROR) << "Only CPU/CPU & GPU/CPU & NPU mode is supported.";
return kLiteInputParamInvalid;
}
model_context->thread_num_ = context->GetThreadNum();
model_context->enable_parallel_ = context->GetEnableParallel();
model_context->affinity_core_list_ = context->GetThreadAffinityCoreList();
model_context->device_list_.clear();
// CPU must occupy slot 0; the branches below rely on this ordering.
if (device_list[0]->GetDeviceType() != kCPU) {
MS_LOG(ERROR) << "CPU context must be enabled and in the first place of device list.";
return kLiteInputParamInvalid;
}
// NOTE(review): Cast<> results are not null-checked here — presumably
// guaranteed valid by the GetDeviceType() checks; confirm.
auto cpu_context = device_list[0]->Cast<CPUDeviceInfo>();
model_context->allocator = cpu_context->GetAllocator();
// Lazily create a default allocator and store it back on the CPU device
// info so subsequent calls reuse the same instance.
if (model_context->allocator == nullptr) {
model_context->allocator = Allocator::Create();
if (model_context->allocator == nullptr) {
MS_LOG(ERROR) << "Create Allocator failed.";
return kLiteNullptr;
}
MS_LOG(DEBUG) << "Set new allocator.";
cpu_context->SetAllocator(model_context->allocator);
}
lite::CpuBindMode mode = GetCpuBindMode();
lite::DeviceInfo cpu_info = {0};
cpu_info.cpu_device_info_ = {cpu_context->GetEnableFP16(), mode};
model_context->device_list_.push_back({lite::DT_CPU, cpu_info, cpu_context->GetProvider(),
cpu_context->GetProviderDevice(), cpu_context->GetAllocator()});
// Optional second device: Mali GPU or Kirin NPU.
if (device_list.size() == 2) {
lite::DeviceInfo device_info = {0};
if (device_list[1]->GetDeviceType() == kMaliGPU) {
auto gpu_context = device_list[1]->Cast<MaliGPUDeviceInfo>();
device_info.gpu_device_info_ = {gpu_context->GetEnableFP16()};
model_context->device_list_.push_back({lite::DT_GPU, device_info, gpu_context->GetProvider(),
gpu_context->GetProviderDevice(), gpu_context->GetAllocator()});
} else if (device_list[1]->GetDeviceType() == kKirinNPU) {
auto npu_context = device_list[1]->Cast<KirinNPUDeviceInfo>();
device_info.npu_device_info_ = {npu_context->GetFrequency()};
model_context->device_list_.push_back({lite::DT_NPU, device_info});
} else {
MS_LOG(ERROR) << "Invalid device.";
return kLiteInputParamInvalid;
}
}
model_context->delegate = context->GetDelegate();
return kSuccess;
}
Status ModelImpl::Build(const void *model_data, size_t data_size, ModelType model_type,
const std::shared_ptr<Context> &ms_context) {
context_ = ms_context;
lite::Context lite_context;
auto status = A2L_ConvertContext(ms_context.get(), &lite_context);
if (status != kSuccess) {

View File

@ -100,8 +100,6 @@ class ModelImpl {
void SetGraph(const std::shared_ptr<Graph> &graph) { graph_ = graph; }
void SetContext(const std::shared_ptr<Context> &context) { context_ = context; }
void SetConfig(const std::shared_ptr<TrainCfg> cfg) { cfg_ = cfg; }
lite::CpuBindMode GetCpuBindMode();
Status ConverterContext(const std::shared_ptr<Context> &context, lite::Context *model_context);
Status RunGraph(const MSKernelCallBack &before, const MSKernelCallBack &after);
};
} // namespace mindspore

View File

@ -331,6 +331,7 @@ if(MSLITE_ENABLE_CONVERTER)
${TEST_SRC}
${TEST_DIR}/st/converter_test.cc
${TEST_DIR}/st/mindrt_parallel_test.cc
${TEST_DIR}/st/graph_test.cc
${TEST_DIR}/st/sub_graph_test.cc
${TEST_DIR}/common/import_from_meta_graphT.cc
${TEST_DIR}/ut/tools/optimizer/fusion/conv_biasadd_fusion_test.cc
@ -384,6 +385,8 @@ if(ENABLE_FP16 AND SUPPORT_TRAIN)
list(APPEND TEST_SRC ${TEST_CASE_KERNEL_FP16_SRC_GRAD})
endif()
file(GLOB_RECURSE API_SRC ${LITE_DI}/src/cxx_api/*.cc)
set(TEST_SRC ${TEST_SRC} ${API_SRC})
add_executable(lite-test ${TEST_SRC})
add_dependencies(lite-test fbs_src)

View File

@ -7,7 +7,10 @@ set(COMMON_SRC
if(NOT TARGET_HIMIX200)
add_executable(benchmark
${CMAKE_CURRENT_SOURCE_DIR}/main.cc
${CMAKE_CURRENT_SOURCE_DIR}/run_benchmark.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark_base.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark_unified_api.cc
${COMMON_SRC})
add_dependencies(benchmark fbs_src)

View File

@ -42,69 +42,6 @@
namespace mindspore {
namespace lite {
namespace {
constexpr int kNumPrintMin = 5;
}
static const char *DELIM_COLON = ":";
static const char *DELIM_COMMA = ",";
static const char *DELIM_SLASH = "/";
static const std::unordered_map<TypeId, std::string> TYPE_ID_MAP{
{kNumberTypeFloat16, "Float16"}, {kNumberTypeFloat, "Float32"}, {kNumberTypeFloat32, "Float32"},
{kNumberTypeInt8, "Int8"}, {kNumberTypeInt16, "Int16"}, {kNumberTypeInt, "Int32"},
{kNumberTypeInt32, "Int32"}, {kNumberTypeUInt8, "UInt8"}, {kNumberTypeUInt16, "UInt16"},
{kNumberTypeUInt, "UInt32"}, {kNumberTypeUInt32, "UInt32"}, {kObjectTypeString, "String"},
{kNumberTypeBool, "Bool"}, {kObjectTypeTensorType, "Tensor"}};
static const std::unordered_map<schema::Format, std::string> TENSOR_FORMAT_MAP{
{schema::Format_NCHW, "NCHW"}, {schema::Format_NHWC, "NHWC"}, {schema::Format_NHWC4, "NHWC4"},
{schema::Format_HWKC, "HWKC"}, {schema::Format_HWCK, "HWCK"}, {schema::Format_KCHW, "KCHW"},
{schema::Format_CKHW, "CKHW"}, {schema::Format_KHWC, "KHWC"}, {schema::Format_CHWK, "CHWK"},
{schema::Format_HW, "HW"}, {schema::Format_HW4, "HW4"}, {schema::Format_NC, "NC"},
{schema::Format_NC4, "NC4"}, {schema::Format_NC4HW4, "NC4HW4"}, {schema::Format_NCDHW, "NCDHW"}};
namespace dump {
constexpr auto kConfigPath = "MINDSPORE_DUMP_CONFIG";
constexpr auto kSettings = "common_dump_settings";
constexpr auto kMode = "dump_mode";
constexpr auto kPath = "path";
constexpr auto kNetName = "net_name";
constexpr auto kInputOutput = "input_output";
constexpr auto kKernels = "kernels";
} // namespace dump
int Benchmark::GenerateRandomData(size_t size, void *data, TypeId data_type) {
  // Fills `data` (of byte/element count `size`) with random values drawn
  // from a range appropriate for `data_type`. Unknown types get a
  // deterministic byte ramp. Always returns RET_OK.
  MS_ASSERT(data != nullptr);
  switch (data_type) {
    case kNumberTypeFloat32:
    case kNumberTypeFloat:
      FillInputData<float>(size, data, std::uniform_real_distribution<float>(0.1f, 1.0f));
      break;
    case kNumberTypeFloat64:
      FillInputData<double>(size, data, std::uniform_real_distribution<double>(0.1, 1.0));
      break;
    case kNumberTypeInt64:
      FillInputData<int64_t>(size, data, std::uniform_int_distribution<int64_t>(0, 1));
      break;
    case kNumberTypeInt:
    case kNumberTypeInt32:
      FillInputData<int32_t>(size, data, std::uniform_int_distribution<int32_t>(0, 1));
      break;
    case kNumberTypeInt16:
      FillInputData<int16_t>(size, data, std::uniform_int_distribution<int16_t>(0, 1));
      break;
    case kNumberTypeInt8:
      // Fix: std::uniform_int_distribution over char-sized types (int8_t,
      // uint8_t) is undefined behavior per the C++ standard. Generate with a
      // 16-bit distribution; FillInputData's separate Distribution template
      // parameter narrows the values to the target element type.
      FillInputData<int8_t>(size, data, std::uniform_int_distribution<int16_t>(-127, 127));
      break;
    case kNumberTypeUInt8:
      FillInputData<uint8_t>(size, data, std::uniform_int_distribution<uint16_t>(0, 254));
      break;
    default: {
      // Unknown element type: fill with a repeating 0..255 byte ramp so the
      // input is at least deterministic.
      char *casted_data = static_cast<char *>(data);
      for (size_t i = 0; i < size; i++) {
        casted_data[i] = static_cast<char>(i);
      }
    }
  }
  return RET_OK;
}
int Benchmark::GenerateInputData() {
for (auto tensor : ms_inputs_) {
@ -118,7 +55,7 @@ int Benchmark::GenerateInputData() {
if (tensor->data_type() == kObjectTypeString) {
status = StringsToMSTensor({"you're the best."}, tensor);
} else {
status = GenerateRandomData(tensor->Size(), input_data, tensor->data_type());
status = GenerateRandomData(tensor->Size(), input_data, static_cast<float>(tensor->data_type()));
}
if (status != RET_OK) {
std::cerr << "GenerateRandomData for inTensor failed: " << status << std::endl;
@ -129,25 +66,6 @@ int Benchmark::GenerateInputData() {
return RET_OK;
}
int Benchmark::LoadInput() {
if (flags_->in_data_file_.empty()) {
auto status = GenerateInputData();
if (status != 0) {
std::cerr << "Generate input data error " << status << std::endl;
MS_LOG(ERROR) << "Generate input data error " << status;
return status;
}
} else {
auto status = ReadInputFile();
if (status != 0) {
std::cerr << "ReadInputFile error, " << status << std::endl;
MS_LOG(ERROR) << "ReadInputFile error, " << status;
return status;
}
}
return RET_OK;
}
int Benchmark::ReadInputFile() {
if (ms_inputs_.empty()) {
return RET_OK;
@ -196,49 +114,6 @@ int Benchmark::ReadInputFile() {
return RET_OK;
}
// calibData is FP32
int Benchmark::ReadCalibData() {
  // Parses the benchmark (calibration) data file. Each record is a header
  // line "<tensor_name> <rank> <dim0> <dim1> ..." followed by the tensor
  // payload, which ReadTensorData consumes from the same stream.
  const char *calib_data_path = flags_->benchmark_data_file_.c_str();
  // read calib data
  std::ifstream in_file(calib_data_path);
  if (!in_file.good()) {
    std::cerr << "file: " << calib_data_path << " is not exist" << std::endl;
    MS_LOG(ERROR) << "file: " << calib_data_path << " is not exist";
    return RET_ERROR;
  }
  if (!in_file.is_open()) {
    std::cerr << "file: " << calib_data_path << " open failed" << std::endl;
    MS_LOG(ERROR) << "file: " << calib_data_path << " open failed";
    in_file.close();
    return RET_ERROR;
  }
  MS_LOG(INFO) << "Start reading calibData file";
  std::string line;
  std::string tensor_name;
  // Fix: loop on getline() success instead of `while (!in_file.eof())`.
  // The old pattern ran one extra iteration on the trailing empty line,
  // re-using the stale tensor_name with rank 0 and calling ReadTensorData
  // with bogus arguments. Blank lines are skipped for the same reason.
  while (getline(in_file, line)) {
    if (line.empty()) {
      continue;
    }
    std::stringstream string_line1(line);
    size_t dim = 0;
    string_line1 >> tensor_name >> dim;
    std::vector<size_t> dims;
    for (size_t i = 0; i < dim; i++) {
      size_t tmp_dim;
      string_line1 >> tmp_dim;
      dims.push_back(tmp_dim);
    }
    auto ret = ReadTensorData(in_file, tensor_name, dims);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Read tensor data failed, tensor name: " << tensor_name;
      return RET_ERROR;
    }
  }
  in_file.close();
  MS_LOG(INFO) << "Finish reading calibData file";
  return RET_OK;
}
int Benchmark::ReadTensorData(std::ifstream &in_file_stream, const std::string &tensor_name,
const std::vector<size_t> &dims) {
std::string line;
@ -379,28 +254,6 @@ tensor::MSTensor *Benchmark::GetTensorByNameOrShape(const std::string &node_or_t
return tensor;
}
int Benchmark::CompareStringData(const std::string &name, tensor::MSTensor *tensor) {
auto iter = this->benchmark_data_.find(name);
if (iter != this->benchmark_data_.end()) {
std::vector<std::string> calib_strings = iter->second->strings_data;
std::vector<std::string> output_strings = MSTensorToStrings(tensor);
size_t compare_num = std::min(calib_strings.size(), output_strings.size());
size_t print_num = std::min(compare_num, static_cast<size_t>(kNumPrintMin));
std::cout << "Data of node " << name << " : " << std::endl;
for (size_t i = 0; i < compare_num; i++) {
if (i < print_num) {
std::cout << " " << output_strings[i] << std::endl;
}
if (calib_strings[i] != output_strings[i]) {
MS_LOG(ERROR) << "Compare failed, index: " << i;
return RET_ERROR;
}
}
}
return RET_OK;
}
int Benchmark::CompareDataGetTotalBiasAndSize(const std::string &name, tensor::MSTensor *tensor, float *total_bias,
int *total_size) {
float bias = 0;
@ -698,36 +551,6 @@ int Benchmark::RunBenchmark() {
return RET_OK;
}
void BenchmarkFlags::InitInputDataList() {
char *input_list = new char[this->in_data_file_.length() + 1];
snprintf(input_list, this->in_data_file_.length() + 1, "%s", this->in_data_file_.c_str());
char *cur_input;
const char *split_c = ",";
cur_input = strtok(input_list, split_c);
while (cur_input != nullptr) {
input_data_list_.emplace_back(cur_input);
cur_input = strtok(nullptr, split_c);
}
delete[] input_list;
}
void BenchmarkFlags::InitResizeDimsList() {
  // Parses the inputShapes flag into resize_dims_: shapes are separated by
  // ':' and dims within a shape by ',' (e.g. "1,32,32,32:1,1,32,32,1").
  std::string content = this->resize_dims_in_;
  auto shape_strs = StringSplit(content, std::string(DELIM_COLON));
  for (const auto &shape_str : shape_strs) {
    // Declare the shape per iteration instead of clear()-ing a shared one.
    std::vector<int> shape;
    auto dim_strs = StringSplit(shape_str, std::string(DELIM_COMMA));
    std::cout << "Resize Dims: ";
    for (const auto &dim_str : dim_strs) {
      std::cout << dim_str << " ";
      // std::stoi already returns int; the old static_cast<int> was redundant.
      shape.emplace_back(std::stoi(dim_str));
    }
    std::cout << std::endl;
    this->resize_dims_.emplace_back(std::move(shape));
  }
}
int Benchmark::InitTimeProfilingCallbackParameter() {
// before callback
before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
@ -1022,444 +845,7 @@ int Benchmark::InitDumpTensorDataCallbackParameter() {
return RET_OK;
}
int Benchmark::CheckThreadNumValid() {
  // Validates the numThreads flag: it must be positive, and subgraph-parallel
  // execution needs at least two threads.
  // Fix: a stray `Benchmark::~Benchmark() { delete (session_); }` definition
  // had been pasted into the middle of this function body, which cannot
  // compile; the destructor is defined separately below and has been removed
  // from here.
  if (this->flags_->num_threads_ < 1) {
    MS_LOG(ERROR) << "numThreads:" << this->flags_->num_threads_ << " must be greater than 0";
    std::cerr << "numThreads:" << this->flags_->num_threads_ << " must be greater than 0" << std::endl;
    return RET_ERROR;
  }
  if (flags_->enable_parallel_) {
    if (flags_->num_threads_ < 2) {
      MS_LOG(ERROR) << "enable parallel need more than 1 thread.";
      std::cerr << "enable parallel need more than 1 thread." << std::endl;
      return RET_ERROR;
    }
  }
  return RET_OK;
}
int Benchmark::InitDumpConfigFromJson(char *path) {
  // Loads and validates the MINDSPORE_DUMP_CONFIG json file, fills in
  // defaults for optional keys, and creates the dump output directory
  // "<path>/<net_name>".
  auto real_path = RealPath(path);
  std::ifstream ifs(real_path);
  if (!ifs.good()) {
    MS_LOG(ERROR) << "file: " << real_path << " is not exist";
    return RET_ERROR;
  }
  if (!ifs.is_open()) {
    MS_LOG(ERROR) << "file: " << real_path << " open failed";
    return RET_ERROR;
  }
  try {
    dump_cfg_json_ = nlohmann::json::parse(ifs);
  } catch (const nlohmann::json::parse_error &error) {
    MS_LOG(ERROR) << "parse json file failed, please check your file.";
    return RET_ERROR;
  }
  // "common_dump_settings", "dump_mode" and "path" are mandatory.
  if (dump_cfg_json_[dump::kSettings] == nullptr) {
    MS_LOG(ERROR) << "\"common_dump_settings\" is required.";
    return RET_ERROR;
  }
  if (dump_cfg_json_[dump::kSettings][dump::kMode] == nullptr) {
    MS_LOG(ERROR) << "\"dump_mode\" is required.";
    return RET_ERROR;
  }
  if (dump_cfg_json_[dump::kSettings][dump::kPath] == nullptr) {
    MS_LOG(ERROR) << "\"path\" is required.";
    return RET_ERROR;
  }
  // Optional keys get defaults.
  if (dump_cfg_json_[dump::kSettings][dump::kNetName] == nullptr) {
    dump_cfg_json_[dump::kSettings][dump::kNetName] = "Default";
  }
  if (dump_cfg_json_[dump::kSettings][dump::kInputOutput] == nullptr) {
    dump_cfg_json_[dump::kSettings][dump::kInputOutput] = 0;
  }
  // A non-empty kernel list only makes sense in selective mode (dump_mode 1).
  if (dump_cfg_json_[dump::kSettings][dump::kKernels] != nullptr &&
      !dump_cfg_json_[dump::kSettings][dump::kKernels].empty()) {
    if (dump_cfg_json_[dump::kSettings][dump::kMode] == 0) {
      MS_LOG(ERROR) << R"("dump_mode" should be 1 when "kernels" isn't empty.)";
      return RET_ERROR;
    }
  }
  auto abs_path = dump_cfg_json_[dump::kSettings][dump::kPath].get<std::string>();
  auto net_name = dump_cfg_json_[dump::kSettings][dump::kNetName].get<std::string>();
  // Fix: guard before calling back() — std::string::back() on an empty
  // string is undefined behavior, and "path" may legally be "".
  if (abs_path.empty()) {
    MS_LOG(ERROR) << "\"path\" is empty.";
    return RET_ERROR;
  }
  if (abs_path.back() == '\\' || abs_path.back() == '/') {
    dump_file_output_dir_ = abs_path + net_name;
  } else {
#ifdef _WIN32
    dump_file_output_dir_ = abs_path + "\\" + net_name;
#else
    dump_file_output_dir_ = abs_path + "/" + net_name;
#endif
  }
  auto status = CreateOutputDir(&dump_file_output_dir_);
  if (status != RET_OK) {
    MS_LOG(ERROR) << "create data output directory failed.";
    return RET_ERROR;
  }
  return RET_OK;
}
int Benchmark::InitCallbackParameter() {
  // Install at most one callback pair, chosen by flag priority:
  // time profiling > perf profiling > tensor printing > tensor dumping.
  if (flags_->time_profiling_) {
    return InitTimeProfilingCallbackParameter();
  }
  if (flags_->perf_profiling_) {
    return InitPerfProfilingCallbackParameter();
  }
  if (flags_->print_tensor_data_) {
    return InitPrintTensorDataCallbackParameter();
  }
  if (flags_->dump_tensor_data_) {
    return InitDumpTensorDataCallbackParameter();
  }
  return RET_OK;
}
int Benchmark::Init() {
if (this->flags_ == nullptr) {
return 1;
}
MS_LOG(INFO) << "ModelPath = " << this->flags_->model_file_;
MS_LOG(INFO) << "InDataPath = " << this->flags_->in_data_file_;
MS_LOG(INFO) << "InDataType = " << this->flags_->in_data_type_in_;
MS_LOG(INFO) << "LoopCount = " << this->flags_->loop_count_;
MS_LOG(INFO) << "DeviceType = " << this->flags_->device_;
MS_LOG(INFO) << "AccuracyThreshold = " << this->flags_->accuracy_threshold_;
MS_LOG(INFO) << "WarmUpLoopCount = " << this->flags_->warm_up_loop_count_;
MS_LOG(INFO) << "NumThreads = " << this->flags_->num_threads_;
MS_LOG(INFO) << "Fp16Priority = " << this->flags_->enable_fp16_;
MS_LOG(INFO) << "EnableParallel = " << this->flags_->enable_parallel_;
MS_LOG(INFO) << "calibDataPath = " << this->flags_->benchmark_data_file_;
std::cout << "ModelPath = " << this->flags_->model_file_ << std::endl;
std::cout << "InDataPath = " << this->flags_->in_data_file_ << std::endl;
std::cout << "InDataType = " << this->flags_->in_data_type_in_ << std::endl;
std::cout << "LoopCount = " << this->flags_->loop_count_ << std::endl;
std::cout << "DeviceType = " << this->flags_->device_ << std::endl;
std::cout << "AccuracyThreshold = " << this->flags_->accuracy_threshold_ << std::endl;
std::cout << "WarmUpLoopCount = " << this->flags_->warm_up_loop_count_ << std::endl;
std::cout << "NumThreads = " << this->flags_->num_threads_ << std::endl;
std::cout << "Fp16Priority = " << this->flags_->enable_fp16_ << std::endl;
std::cout << "EnableParallel = " << this->flags_->enable_parallel_ << std::endl;
std::cout << "calibDataPath = " << this->flags_->benchmark_data_file_ << std::endl;
if (this->flags_->loop_count_ < 1) {
MS_LOG(ERROR) << "LoopCount:" << this->flags_->loop_count_ << " must be greater than 0";
std::cerr << "LoopCount:" << this->flags_->loop_count_ << " must be greater than 0" << std::endl;
return RET_ERROR;
}
auto thread_ret = CheckThreadNumValid();
if (thread_ret != RET_OK) {
MS_LOG(ERROR) << "Invalid numThreads.";
std::cerr << "Invalid numThreads." << std::endl;
return RET_ERROR;
}
static std::vector<std::string> CPU_BIND_MODE_MAP = {"NO_BIND", "HIGHER_CPU", "MID_CPU"};
if (this->flags_->cpu_bind_mode_ >= 1) {
MS_LOG(INFO) << "cpuBindMode = " << CPU_BIND_MODE_MAP[this->flags_->cpu_bind_mode_];
std::cout << "cpuBindMode = " << CPU_BIND_MODE_MAP[this->flags_->cpu_bind_mode_] << std::endl;
} else {
MS_LOG(INFO) << "cpuBindMode = NO_BIND";
std::cout << "cpuBindMode = NO_BIND" << std::endl;
}
this->flags_->in_data_type_ = this->flags_->in_data_type_in_ == "img" ? kImage : kBinary;
if (!flags_->benchmark_data_type_.empty()) {
if (data_type_map_.find(flags_->benchmark_data_type_) == data_type_map_.end()) {
MS_LOG(ERROR) << "CalibDataType not supported: " << flags_->benchmark_data_type_.c_str();
return RET_ERROR;
}
msCalibDataType = data_type_map_.at(flags_->benchmark_data_type_);
MS_LOG(INFO) << "CalibDataType = " << flags_->benchmark_data_type_.c_str();
std::cout << "CalibDataType = " << flags_->benchmark_data_type_.c_str() << std::endl;
}
if (flags_->model_file_.empty()) {
MS_LOG(ERROR) << "modelPath is required";
std::cerr << "modelPath is required" << std::endl;
return 1;
}
flags_->InitInputDataList();
flags_->InitResizeDimsList();
if (!flags_->resize_dims_.empty() && !flags_->input_data_list_.empty() &&
flags_->resize_dims_.size() != flags_->input_data_list_.size()) {
MS_LOG(ERROR) << "Size of input resizeDims should be equal to size of input inDataPath";
std::cerr << "Size of input resizeDims should be equal to size of input inDataPath" << std::endl;
return RET_ERROR;
}
if (flags_->device_ != "CPU" && flags_->device_ != "GPU" && flags_->device_ != "NPU") {
MS_LOG(ERROR) << "Device type:" << flags_->device_ << " is not supported.";
std::cerr << "Device type:" << flags_->device_ << " is not supported." << std::endl;
return RET_ERROR;
}
if (flags_->time_profiling_ && flags_->perf_profiling_) {
MS_LOG(INFO) << "time_profiling is enabled, will not run perf_profiling.";
}
// get dump data output path
auto dump_cfg_path = std::getenv(dump::kConfigPath);
if (dump_cfg_path != nullptr) {
flags_->dump_tensor_data_ = true;
if (InitDumpConfigFromJson(dump_cfg_path) != RET_OK) {
MS_LOG(ERROR) << "parse dump config file failed.";
return RET_ERROR;
}
} else {
MS_LOG(INFO) << "No MINDSPORE_DUMP_CONFIG in env, don't need to dump data";
}
auto status = InitCallbackParameter();
if (status != RET_OK) {
MS_LOG(ERROR) << "Init callback Parameter failed.";
std::cerr << "Init callback Parameter failed." << std::endl;
return RET_ERROR;
}
return RET_OK;
}
int Benchmark::PrintResult(const std::vector<std::string> &title,
const std::map<std::string, std::pair<int, float>> &result) {
// Prints a five-column profiling table: op name, average time per loop,
// fraction of total op cost, call count, and accumulated time.
// Column widths are grown to fit the widest cell (plus 4 spaces padding).
std::vector<size_t> columnLenMax(5);
std::vector<std::vector<std::string>> rows;
for (auto &iter : result) {
char stringBuf[5][100] = {};
std::vector<std::string> columns;
size_t len = 0;
// Column 0: op/node name.
len = iter.first.size();
if (len > columnLenMax.at(0)) {
columnLenMax.at(0) = len + 4;
}
columns.push_back(iter.first);
// Column 1: average cost per loop iteration.
// NOTE(review): snprintf returns int; storing into size_t would wrap on a
// (theoretical) negative return — harmless with these fixed formats.
len =
snprintf(stringBuf[1], sizeof(stringBuf[1]), "%f", iter.second.second / static_cast<float>(flags_->loop_count_));
if (len > columnLenMax.at(1)) {
columnLenMax.at(1) = len + 4;
}
columns.emplace_back(stringBuf[1]);
// Column 2: share of the total op cost.
len = snprintf(stringBuf[2], sizeof(stringBuf[2]), "%f", iter.second.second / op_cost_total_);
if (len > columnLenMax.at(2)) {
columnLenMax.at(2) = len + 4;
}
columns.emplace_back(stringBuf[2]);
// Column 3: number of calls.
len = snprintf(stringBuf[3], sizeof(stringBuf[3]), "%d", iter.second.first);
if (len > columnLenMax.at(3)) {
columnLenMax.at(3) = len + 4;
}
columns.emplace_back(stringBuf[3]);
// Column 4: total accumulated time.
len = snprintf(stringBuf[4], sizeof(stringBuf[4]), "%f", iter.second.second);
if (len > columnLenMax.at(4)) {
columnLenMax.at(4) = len + 4;
}
columns.emplace_back(stringBuf[4]);
rows.push_back(columns);
}
printf("-------------------------------------------------------------------------\n");
// Header row, padded to the final column widths.
for (int i = 0; i < 5; i++) {
auto printBuf = title[i];
if (printBuf.size() > columnLenMax.at(i)) {
columnLenMax.at(i) = printBuf.size();
}
printBuf.resize(columnLenMax.at(i), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
// Data rows.
for (auto &row : rows) {
for (int j = 0; j < 5; j++) {
auto printBuf = row[j];
printBuf.resize(columnLenMax.at(j), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
}
return RET_OK;
}
#ifdef ENABLE_ARM64
int Benchmark::PrintPerfResult(const std::vector<std::string> &title,
const std::map<std::string, std::pair<int, struct PerfCount>> &result) {
// ARM64-only variant of PrintResult for perf-event counters: op name,
// per-loop count of event 0 (in thousands), its share of the total, then
// the same pair for event 1. Layout logic mirrors PrintResult.
std::vector<size_t> columnLenMax(5);
std::vector<std::vector<std::string>> rows;
for (auto &iter : result) {
char stringBuf[5][100] = {};
std::vector<std::string> columns;
size_t len = 0;
// Column 0: op/node name.
len = iter.first.size();
if (len > columnLenMax.at(0)) {
columnLenMax.at(0) = len + 4;
}
columns.push_back(iter.first);
// Column 1: event 0 per loop, scaled by thread count, in units of 1e3.
float tmp = float_t(flags_->num_threads_) * iter.second.second.value[0] / float_t(flags_->loop_count_) / 1000.0f;
len = snprintf(stringBuf[1], sizeof(stringBuf[1]), "%.2f", tmp);
if (len > columnLenMax.at(1)) {
columnLenMax.at(1) = len + 4;
}
columns.emplace_back(stringBuf[1]);
// Column 2: event 0 share of the first total.
len = snprintf(stringBuf[2], sizeof(stringBuf[2]), "%f", iter.second.second.value[0] / op_cost_total_);
if (len > columnLenMax.at(2)) {
columnLenMax.at(2) = len + 4;
}
columns.emplace_back(stringBuf[2]);
// Columns 3-4: same computation for event 1.
tmp = float_t(flags_->num_threads_) * iter.second.second.value[1] / float_t(flags_->loop_count_) / 1000.0f;
len = snprintf(stringBuf[3], sizeof(stringBuf[3]), "%.2f", tmp);
if (len > columnLenMax.at(3)) {
columnLenMax.at(3) = len + 4;
}
columns.emplace_back(stringBuf[3]);
len = snprintf(stringBuf[4], sizeof(stringBuf[4]), "%f", iter.second.second.value[1] / op_cost2_total_);
if (len > columnLenMax.at(4)) {
columnLenMax.at(4) = len + 4;
}
columns.emplace_back(stringBuf[4]);
rows.push_back(columns);
}
printf("-------------------------------------------------------------------------\n");
// Header row, padded to the final column widths.
for (int i = 0; i < 5; i++) {
auto printBuf = title[i];
if (printBuf.size() > columnLenMax.at(i)) {
columnLenMax.at(i) = printBuf.size();
}
printBuf.resize(columnLenMax.at(i), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
// Data rows.
for (auto &row : rows) {
for (int j = 0; j < 5; j++) {
auto printBuf = row[j];
printBuf.resize(columnLenMax.at(j), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
}
return RET_OK;
}
#endif
#ifdef SUPPORT_NNIE
int SvpSysInit() {
// One-time bring-up of the HiSilicon MPI stack for NNIE inference:
// reset SYS/VB state, configure a video-buffer pool, then init VB and SYS.
HI_S32 ret = HI_SUCCESS;
VB_CONFIG_S struVbConf;
// Tear down any previous state before re-initializing (return values
// deliberately ignored — a fresh boot has nothing to exit).
HI_MPI_SYS_Exit();
HI_MPI_VB_Exit();
memset(&struVbConf, 0, sizeof(VB_CONFIG_S));
struVbConf.u32MaxPoolCnt = 2;
// NOTE(review): only pool index 1 is configured while u32MaxPoolCnt is 2;
// pool 0 stays zero-sized — presumably intentional, but confirm against
// the HiSilicon MPI documentation.
struVbConf.astCommPool[1].u64BlkSize = 768 * 576 * 2;
struVbConf.astCommPool[1].u32BlkCnt = 1;
ret = HI_MPI_VB_SetConfig((const VB_CONFIG_S *)&struVbConf);
if (HI_SUCCESS != ret) {
MS_LOG(ERROR) << "Error:HI_MPI_VB_SetConf failed!";
return RET_ERROR;
}
ret = HI_MPI_VB_Init();
if (HI_SUCCESS != ret) {
MS_LOG(ERROR) << "Error:HI_MPI_VB_Init failed!";
return RET_ERROR;
}
ret = HI_MPI_SYS_Init();
if (HI_SUCCESS != ret) {
MS_LOG(ERROR) << "Error:HI_MPI_SYS_Init failed!";
return RET_ERROR;
}
return RET_OK;
}
int SvpSysExit() {
  // Tear down the HiSilicon MPI system module, then the video-buffer module.
  if (HI_MPI_SYS_Exit() != HI_SUCCESS) {
    MS_LOG(ERROR) << "Error:HI_MPI_SYS_Exit failed!";
    return RET_ERROR;
  }
  if (HI_MPI_VB_Exit() != HI_SUCCESS) {
    MS_LOG(ERROR) << "Error:HI_MPI_VB_Exit failed!";
    return RET_ERROR;
  }
  return RET_OK;
}
#endif
Benchmark::~Benchmark() {
  // Free the heap-owned calibration entries, then the session; on NNIE
  // builds also shut down the SVP system.
  for (auto &entry : this->benchmark_data_) {
    delete entry.second;
  }
  this->benchmark_data_.clear();
  delete session_;
#ifdef SUPPORT_NNIE
  SvpSysExit();
#endif
}
int RunBenchmark(int argc, const char **argv) {
BenchmarkFlags flags;
Option<std::string> err = flags.ParseFlags(argc, argv);
#ifdef SUPPORT_NNIE
SvpSysInit();
#endif
if (err.IsSome()) {
std::cerr << err.Get() << std::endl;
std::cerr << flags.Usage() << std::endl;
return RET_ERROR;
}
if (flags.help) {
std::cerr << flags.Usage() << std::endl;
return RET_OK;
}
Benchmark benchmark(&flags);
auto status = benchmark.Init();
if (status != 0) {
MS_LOG(ERROR) << "Benchmark init Error : " << status;
std::cerr << "Benchmark init Error : " << status << std::endl;
return RET_ERROR;
}
status = benchmark.RunBenchmark();
if (status != 0) {
MS_LOG(ERROR) << "Run Benchmark "
<< flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
<< " Failed : " << status;
std::cerr << "Run Benchmark " << flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
<< " Failed : " << status << std::endl;
return RET_ERROR;
}
MS_LOG(INFO) << "Run Benchmark " << flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
<< " Success.";
std::cout << "Run Benchmark " << flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
<< " Success." << std::endl;
return RET_OK;
}
} // namespace lite
} // namespace mindspore

View File

@ -31,6 +31,7 @@
#include <cfloat>
#include <utility>
#include <nlohmann/json.hpp>
#include "tools/benchmark/benchmark_base.h"
#include "include/model.h"
#include "tools/common/flag_parser.h"
#include "src/common/file_utils.h"
@ -38,283 +39,57 @@
#include "include/lite_session.h"
namespace mindspore::lite {
enum MS_API InDataType { kImage = 0, kBinary = 1 };
constexpr float relativeTolerance = 1e-5;
constexpr float absoluteTolerance = 1e-8;
#ifdef ENABLE_ARM64
struct PerfResult {
int64_t nr;
struct {
int64_t value;
int64_t id;
} values[2];
};
struct PerfCount {
int64_t value[2];
};
#endif
struct MS_API CheckTensor {
CheckTensor(const std::vector<size_t> &shape, const std::vector<float> &data,
const std::vector<std::string> &strings_data = {""}) {
this->shape = shape;
this->data = data;
this->strings_data = strings_data;
}
std::vector<size_t> shape;
std::vector<float> data;
std::vector<std::string> strings_data;
};
class MS_API BenchmarkFlags : public virtual FlagParser {
class MS_API Benchmark : public BenchmarkBase {
public:
BenchmarkFlags() {
// common
AddFlag(&BenchmarkFlags::model_file_, "modelFile", "Input model file", "");
AddFlag(&BenchmarkFlags::in_data_file_, "inDataFile", "Input data file, if not set, use random input", "");
AddFlag(&BenchmarkFlags::device_, "device", "CPU | GPU | NPU", "CPU");
AddFlag(&BenchmarkFlags::cpu_bind_mode_, "cpuBindMode",
"Input 0 for NO_BIND, 1 for HIGHER_CPU, 2 for MID_CPU, default value: 1", 1);
// MarkPerformance
AddFlag(&BenchmarkFlags::loop_count_, "loopCount", "Run loop count", 10);
AddFlag(&BenchmarkFlags::num_threads_, "numThreads", "Run threads number", 2);
AddFlag(&BenchmarkFlags::enable_fp16_, "enableFp16", "Enable float16", false);
AddFlag(&BenchmarkFlags::enable_parallel_, "enableParallel", "Enable subgraph parallel : true | false", false);
AddFlag(&BenchmarkFlags::warm_up_loop_count_, "warmUpLoopCount", "Run warm up loop", 3);
AddFlag(&BenchmarkFlags::time_profiling_, "timeProfiling", "Run time profiling", false);
AddFlag(&BenchmarkFlags::perf_profiling_, "perfProfiling",
"Perf event profiling(only instructions statics enabled currently)", false);
AddFlag(&BenchmarkFlags::perf_event_, "perfEvent", "CYCLE|CACHE|STALL", "CYCLE");
// MarkAccuracy
AddFlag(&BenchmarkFlags::benchmark_data_file_, "benchmarkDataFile", "Benchmark data file path", "");
AddFlag(&BenchmarkFlags::benchmark_data_type_, "benchmarkDataType",
"Benchmark data type. FLOAT | INT32 | INT8 | UINT8", "FLOAT");
AddFlag(&BenchmarkFlags::accuracy_threshold_, "accuracyThreshold", "Threshold of accuracy", 0.5);
AddFlag(&BenchmarkFlags::resize_dims_in_, "inputShapes",
"Shape of input data, the format should be NHWC. e.g. 1,32,32,32:1,1,32,32,1", "");
}
~BenchmarkFlags() override = default;
void InitInputDataList();
void InitResizeDimsList();
public:
// common
std::string model_file_;
std::string in_data_file_;
std::vector<std::string> input_data_list_;
InDataType in_data_type_ = kBinary;
std::string in_data_type_in_ = "bin";
int cpu_bind_mode_ = 1;
// MarkPerformance
int loop_count_ = 10;
int num_threads_ = 2;
bool enable_fp16_ = false;
bool enable_parallel_ = false;
int warm_up_loop_count_ = 3;
// MarkAccuracy
std::string benchmark_data_file_;
std::string benchmark_data_type_ = "FLOAT";
float accuracy_threshold_ = 0.5;
// Resize
std::string resize_dims_in_;
std::vector<std::vector<int>> resize_dims_;
std::string device_ = "CPU";
bool time_profiling_ = false;
bool perf_profiling_ = false;
std::string perf_event_ = "CYCLE";
bool dump_tensor_data_ = false;
bool print_tensor_data_ = false;
};
class MS_API Benchmark {
public:
explicit Benchmark(BenchmarkFlags *flags) : flags_(flags) {}
explicit Benchmark(BenchmarkFlags *flags) : BenchmarkBase(flags) {}
virtual ~Benchmark();
int Init();
int RunBenchmark();
private:
// call GenerateInputData or ReadInputFile to init inputTensors
int LoadInput();
int RunBenchmark() override;
protected:
// call GenerateRandomData to fill inputTensors
int GenerateInputData();
int GenerateInputData() override;
int GenerateRandomData(size_t size, void *data, TypeId data_type);
int ReadInputFile() override;
int ReadInputFile();
int ReadCalibData();
int ReadTensorData(std::ifstream &in_file_stream, const std::string &tensor_name, const std::vector<size_t> &dims);
int ReadTensorData(std::ifstream &in_file_stream, const std::string &tensor_name,
const std::vector<size_t> &dims) override;
void InitContext(const std::shared_ptr<Context> &context);
int CompareOutput();
int CompareOutput() override;
tensor::MSTensor *GetTensorByNameOrShape(const std::string &node_or_tensor_name, const std::vector<size_t> &dims);
tensor::MSTensor *GetTensorByNodeShape(const std::vector<size_t> &node_shape);
int CompareStringData(const std::string &name, tensor::MSTensor *tensor);
int CompareDataGetTotalBiasAndSize(const std::string &name, tensor::MSTensor *tensor, float *total_bias,
int *total_size);
int InitDumpConfigFromJson(char *path);
int InitTimeProfilingCallbackParameter() override;
int InitCallbackParameter();
int InitPerfProfilingCallbackParameter() override;
int InitTimeProfilingCallbackParameter();
int InitDumpTensorDataCallbackParameter() override;
int InitPerfProfilingCallbackParameter();
int InitDumpTensorDataCallbackParameter();
int InitPrintTensorDataCallbackParameter();
int PrintResult(const std::vector<std::string> &title, const std::map<std::string, std::pair<int, float>> &result);
#ifdef ENABLE_ARM64
int PrintPerfResult(const std::vector<std::string> &title,
const std::map<std::string, std::pair<int, struct PerfCount>> &result);
#endif
int InitPrintTensorDataCallbackParameter() override;
int PrintInputData();
// tensorData need to be converter first
template <typename T>
float CompareData(const std::string &nodeName, const std::vector<int> &msShape, const void *tensor_data) {
  // Compares one output tensor (elements of type T) against the calibration
  // entry recorded under nodeName. Returns the mean relative bias on
  // success. NOTE(review): failures return RET_ERROR (an int error code)
  // through a float return type — callers must treat large values as
  // failure; confirm.
  const T *msTensorData = static_cast<const T *>(tensor_data);
  auto iter = this->benchmark_data_.find(nodeName);
  if (iter == this->benchmark_data_.end()) {
    // Fix: the original wrote `<< "%s is not in Source Model output",
    // nodeName.c_str()` — the comma operator discarded the name and logged
    // the literal printf format string instead.
    MS_LOG(INFO) << nodeName.c_str() << " is not in Source Model output";
    return RET_ERROR;
  }
  // The shapes must match exactly (element-wise, after casting to size_t).
  std::vector<size_t> castedMSShape;
  size_t shapeSize = 1;
  for (int64_t dim : msShape) {
    castedMSShape.push_back(size_t(dim));
    shapeSize *= dim;
  }
  CheckTensor *calibTensor = iter->second;
  if (calibTensor->shape != castedMSShape) {
    std::ostringstream oss;
    oss << "Shape of mslite output(";
    for (auto dim : castedMSShape) {
      oss << dim << ",";
    }
    oss << ") and shape source model output(";
    for (auto dim : calibTensor->shape) {
      oss << dim << ",";
    }
    oss << ") are different";
    std::cerr << oss.str() << std::endl;
    MS_LOG(ERROR) << oss.str().c_str();
    return RET_ERROR;
  }
  size_t errorCount = 0;
  float meanError = 0;
  std::cout << "Data of node " << nodeName << " : ";
  for (size_t j = 0; j < shapeSize; j++) {
    // Echo at most the first 50 values for manual inspection.
    if (j < 50) {
      std::cout << static_cast<float>(msTensorData[j]) << " ";
    }
    if (std::isnan(msTensorData[j]) || std::isinf(msTensorData[j])) {
      std::cerr << "Output tensor has nan or inf data, compare fail" << std::endl;
      MS_LOG(ERROR) << "Output tensor has nan or inf data, compare fail";
      return RET_ERROR;
    }
    auto tolerance = absoluteTolerance + relativeTolerance * fabs(calibTensor->data.at(j));
    auto absoluteError = std::fabs(msTensorData[j] - calibTensor->data.at(j));
    if (absoluteError > tolerance) {
      if (fabs(calibTensor->data.at(j) - 0.0f) < FLT_EPSILON) {
        // Calibration value is (near) zero: relative error is meaningless,
        // so only count absolute errors above 1e-5.
        if (absoluteError > 1e-5) {
          meanError += absoluteError;
          errorCount++;
        } else {
          continue;
        }
      } else {
        // just assume that atol = rtol
        meanError += absoluteError / (fabs(calibTensor->data.at(j)) + FLT_MIN);
        errorCount++;
      }
    }
  }
  std::cout << std::endl;
  if (meanError > 0.0f) {
    meanError /= errorCount;
  }
  if (meanError <= 0.0000001) {
    std::cout << "Mean bias of node/tensor " << nodeName << " : 0%" << std::endl;
  } else {
    std::cout << "Mean bias of node/tensor " << nodeName << " : " << meanError * 100 << "%" << std::endl;
  }
  return meanError;
}
// Fills `size` bytes at `data` with values drawn from `distribution`,
// narrowed to the element type T.
template <typename T, typename Distribution>
void FillInputData(int size, void *data, Distribution distribution) {
  MS_ASSERT(data != nullptr);
  T *typed_data = static_cast<T *>(data);
  const int element_count = size / static_cast<int>(sizeof(T));
  for (int i = 0; i < element_count; ++i) {
    typed_data[i] = static_cast<T>(distribution(random_engine_));
  }
}
// Runs the timed benchmark loops (warm-up + measured iterations).
int MarkPerformance();
// Runs one inference and compares outputs against the calibration data.
int MarkAccuracy();
int CheckThreadNumValid();
private:
BenchmarkFlags *flags_;  // not owned; parsed command-line flags
session::LiteSession *session_{nullptr};
std::vector<mindspore::tensor::MSTensor *> ms_inputs_;
std::unordered_map<std::string, std::vector<mindspore::tensor::MSTensor *>> ms_outputs_;
// Golden outputs keyed by node/tensor name; entries are owned and freed in
// the destructor.
std::unordered_map<std::string, CheckTensor *> benchmark_data_;
std::unordered_map<std::string, TypeId> data_type_map_{{"FLOAT", TypeId::kNumberTypeFloat},
{"INT8", TypeId::kNumberTypeInt8},
{"INT32", TypeId::kNumberTypeInt32},
{"UINT8", TypeId::kNumberTypeUInt8}};
TypeId msCalibDataType = TypeId::kNumberTypeFloat;
// callback parameters
uint64_t op_begin_ = 0;
int op_call_times_total_ = 0;
float op_cost_total_ = 0.0f;
std::map<std::string, std::pair<int, float>> op_times_by_type_;
std::map<std::string, std::pair<int, float>> op_times_by_name_;
// dump data
nlohmann::json dump_cfg_json_;
std::string dump_file_output_dir_;
#ifdef ENABLE_ARM64
// perf_event file descriptors and per-op counter accumulators
int perf_fd = 0;
int perf_fd2 = 0;
float op_cost2_total_ = 0.0f;
std::map<std::string, std::pair<int, struct PerfCount>> op_perf_by_type_;
std::map<std::string, std::pair<int, struct PerfCount>> op_perf_by_name_;
#endif
KernelCallBack before_call_back_ = nullptr;
KernelCallBack after_call_back_ = nullptr;
std::mt19937 random_engine_;
};
int MS_API RunBenchmark(int argc, const char **argv);
} // namespace mindspore::lite
#endif // MINNIE_BENCHMARK_BENCHMARK_H_

View File

@ -0,0 +1,606 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tools/benchmark/benchmark_base.h"
#define __STDC_FORMAT_MACROS
#include <cinttypes>
#undef __STDC_FORMAT_MACROS
#include <algorithm>
#include <utility>
#include <functional>
#include "include/context.h"
#include "include/ms_tensor.h"
#include "include/version.h"
#include "schema/model_generated.h"
#include "src/common/common.h"
#include "src/tensor.h"
#ifdef ENABLE_ARM64
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <asm/unistd.h>
#include <unistd.h>
#endif
#ifdef SUPPORT_NNIE
#include "include/hi_common.h"
#include "include/hi_comm_vb.h"
#include "include/mpi_sys.h"
#include "include/mpi_vb.h"
#endif
namespace mindspore {
namespace lite {
// Human-readable names for tensor element types, used in logs and dump files.
const std::unordered_map<int, std::string> TYPE_ID_MAP{
{kNumberTypeFloat16, "Float16"}, {kNumberTypeFloat, "Float32"}, {kNumberTypeFloat32, "Float32"},
{kNumberTypeInt8, "Int8"}, {kNumberTypeInt16, "Int16"}, {kNumberTypeInt, "Int32"},
{kNumberTypeInt32, "Int32"}, {kNumberTypeUInt8, "UInt8"}, {kNumberTypeUInt16, "UInt16"},
{kNumberTypeUInt, "UInt32"}, {kNumberTypeUInt32, "UInt32"}, {kObjectTypeString, "String"},
{kNumberTypeBool, "Bool"}, {kObjectTypeTensorType, "Tensor"}};
// Human-readable names for tensor layouts/formats.
const std::unordered_map<schema::Format, std::string> TENSOR_FORMAT_MAP{
{schema::Format_NCHW, "NCHW"}, {schema::Format_NHWC, "NHWC"}, {schema::Format_NHWC4, "NHWC4"},
{schema::Format_HWKC, "HWKC"}, {schema::Format_HWCK, "HWCK"}, {schema::Format_KCHW, "KCHW"},
{schema::Format_CKHW, "CKHW"}, {schema::Format_KHWC, "KHWC"}, {schema::Format_CHWK, "CHWK"},
{schema::Format_HW, "HW"}, {schema::Format_HW4, "HW4"}, {schema::Format_NC, "NC"},
{schema::Format_NC4, "NC4"}, {schema::Format_NC4HW4, "NC4HW4"}, {schema::Format_NCDHW, "NCDHW"}};
// Fills `size` bytes at `data` with random values appropriate for
// `data_type`. Unknown types fall back to a deterministic 0,1,2,... byte
// pattern. Always returns RET_OK.
int BenchmarkBase::GenerateRandomData(size_t size, void *data, int data_type) {
  MS_ASSERT(data != nullptr);
  switch (data_type) {
    case kNumberTypeFloat32:
    case kNumberTypeFloat:
      FillInputData<float>(size, data, std::uniform_real_distribution<float>(0.1f, 1.0f));
      break;
    case kNumberTypeFloat64:
      FillInputData<double>(size, data, std::uniform_real_distribution<double>(0.1, 1.0));
      break;
    case kNumberTypeInt64:
      FillInputData<int64_t>(size, data, std::uniform_int_distribution<int64_t>(0, 1));
      break;
    case kNumberTypeInt:
    case kNumberTypeInt32:
      FillInputData<int32_t>(size, data, std::uniform_int_distribution<int32_t>(0, 1));
      break;
    case kNumberTypeInt16:
      FillInputData<int16_t>(size, data, std::uniform_int_distribution<int16_t>(0, 1));
      break;
    case kNumberTypeInt8:
      // std::uniform_int_distribution<int8_t> is undefined behavior: the
      // standard forbids character-sized IntType. Draw int16_t in the same
      // range and let FillInputData narrow to int8_t.
      FillInputData<int8_t>(size, data, std::uniform_int_distribution<int16_t>(-127, 127));
      break;
    case kNumberTypeUInt8:
      // Same UB for uint8_t; draw uint16_t and narrow.
      FillInputData<uint8_t>(size, data, std::uniform_int_distribution<uint16_t>(0, 254));
      break;
    default:
      char *casted_data = static_cast<char *>(data);
      for (size_t i = 0; i < size; i++) {
        casted_data[i] = static_cast<char>(i);
      }
  }
  return RET_OK;
}
int BenchmarkBase::LoadInput() {
if (flags_->in_data_file_.empty()) {
auto status = GenerateInputData();
if (status != 0) {
std::cerr << "Generate input data error " << status << std::endl;
MS_LOG(ERROR) << "Generate input data error " << status;
return status;
}
} else {
auto status = ReadInputFile();
if (status != 0) {
std::cerr << "ReadInputFile error, " << status << std::endl;
MS_LOG(ERROR) << "ReadInputFile error, " << status;
return status;
}
}
return RET_OK;
}
// calibData is FP32
// Reads the calibration (golden output) file: each entry is a header line
// "name dim_count dim0 dim1 ..." followed by data lines consumed by the
// subclass's ReadTensorData(). Calibration values are stored as FP32.
// Returns RET_OK on success, RET_ERROR on I/O or parse failure.
int BenchmarkBase::ReadCalibData() {
  const char *calib_data_path = flags_->benchmark_data_file_.c_str();
  // read calib data
  std::ifstream in_file(calib_data_path);
  if (!in_file.good()) {
    std::cerr << "file: " << calib_data_path << " is not exist" << std::endl;
    MS_LOG(ERROR) << "file: " << calib_data_path << " is not exist";
    return RET_ERROR;
  }
  if (!in_file.is_open()) {
    std::cerr << "file: " << calib_data_path << " open failed" << std::endl;
    MS_LOG(ERROR) << "file: " << calib_data_path << " open failed";
    in_file.close();
    return RET_ERROR;
  }
  MS_LOG(INFO) << "Start reading calibData file";
  std::string line;
  std::string tensor_name;
  // Fixed: the original looped on `!in_file.eof()` with an unchecked getline,
  // which processed a stale/empty line after the last record. Loop on the
  // getline() result instead.
  while (std::getline(in_file, line)) {
    std::stringstream string_line1(line);
    size_t dim = 0;
    string_line1 >> tensor_name >> dim;
    std::vector<size_t> dims;
    for (size_t i = 0; i < dim; i++) {
      size_t tmp_dim = 0;  // stays 0 (instead of indeterminate) if the header is malformed
      string_line1 >> tmp_dim;
      dims.push_back(tmp_dim);
    }
    auto ret = ReadTensorData(in_file, tensor_name, dims);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Read tensor data failed, tensor name: " << tensor_name;
      return RET_ERROR;
    }
  }
  in_file.close();
  MS_LOG(INFO) << "Finish reading calibData file";
  return RET_OK;
}
int BenchmarkBase::CompareStringData(const std::string &name, tensor::MSTensor *tensor) {
auto iter = this->benchmark_data_.find(name);
if (iter != this->benchmark_data_.end()) {
std::vector<std::string> calib_strings = iter->second->strings_data;
std::vector<std::string> output_strings = MSTensorToStrings(tensor);
size_t compare_num = std::min(calib_strings.size(), output_strings.size());
size_t print_num = std::min(compare_num, static_cast<size_t>(kNumPrintMin));
std::cout << "Data of node " << name << " : " << std::endl;
for (size_t i = 0; i < compare_num; i++) {
if (i < print_num) {
std::cout << " " << output_strings[i] << std::endl;
}
if (calib_strings[i] != output_strings[i]) {
MS_LOG(ERROR) << "Compare failed, index: " << i;
return RET_ERROR;
}
}
}
return RET_OK;
}
void BenchmarkFlags::InitInputDataList() {
char *input_list = new char[this->in_data_file_.length() + 1];
snprintf(input_list, this->in_data_file_.length() + 1, "%s", this->in_data_file_.c_str());
char *cur_input;
const char *split_c = ",";
cur_input = strtok(input_list, split_c);
while (cur_input != nullptr) {
input_data_list_.emplace_back(cur_input);
cur_input = strtok(nullptr, split_c);
}
delete[] input_list;
}
// Parses the -inputShapes flag ("n,h,w,c:n,h,w,c") into resize_dims_,
// echoing each parsed shape to stdout.
void BenchmarkFlags::InitResizeDimsList() {
  const std::string &content = this->resize_dims_in_;
  for (const auto &shape_str : StringSplit(content, std::string(DELIM_COLON))) {
    std::vector<int> shape;
    std::cout << "Resize Dims: ";
    for (const auto &dim_str : StringSplit(shape_str, std::string(DELIM_COMMA))) {
      std::cout << dim_str << " ";
      shape.emplace_back(static_cast<int>(std::stoi(dim_str)));
    }
    std::cout << std::endl;
    this->resize_dims_.emplace_back(std::move(shape));
  }
}
int BenchmarkBase::CheckThreadNumValid() {
if (this->flags_->num_threads_ < 1) {
MS_LOG(ERROR) << "numThreads:" << this->flags_->num_threads_ << " must be greater than 0";
std::cerr << "numThreads:" << this->flags_->num_threads_ << " must be greater than 0" << std::endl;
return RET_ERROR;
}
if (flags_->enable_parallel_) {
if (flags_->num_threads_ < 2) {
MS_LOG(ERROR) << "enable parallel need more than 1 thread.";
std::cerr << "enable parallel need more than 1 thread." << std::endl;
return RET_ERROR;
}
}
return RET_OK;
}
// Loads and validates the dump configuration JSON pointed to by `path` (the
// value of the MINDSPORE_DUMP_CONFIG env var). Required keys:
// common_dump_settings.{dump_mode, path}; optional keys net_name and
// input_output get defaults. Derives and creates dump_file_output_dir_.
// Returns RET_OK on success, RET_ERROR on any validation failure.
int BenchmarkBase::InitDumpConfigFromJson(char *path) {
auto real_path = RealPath(path);
std::ifstream ifs(real_path);
if (!ifs.good()) {
MS_LOG(ERROR) << "file: " << real_path << " is not exist";
return RET_ERROR;
}
if (!ifs.is_open()) {
MS_LOG(ERROR) << "file: " << real_path << " open failed";
return RET_ERROR;
}
try {
dump_cfg_json_ = nlohmann::json::parse(ifs);
} catch (const nlohmann::json::parse_error &error) {
MS_LOG(ERROR) << "parse json file failed, please check your file.";
return RET_ERROR;
}
// NOTE(review): operator[] on a non-const nlohmann::json inserts a null
// member when the key is missing, so the == nullptr checks below also
// mutate dump_cfg_json_; json::contains() would avoid that — confirm the
// insertion is harmless here.
if (dump_cfg_json_[dump::kSettings] == nullptr) {
MS_LOG(ERROR) << "\"common_dump_settings\" is required.";
return RET_ERROR;
}
if (dump_cfg_json_[dump::kSettings][dump::kMode] == nullptr) {
MS_LOG(ERROR) << "\"dump_mode\" is required.";
return RET_ERROR;
}
if (dump_cfg_json_[dump::kSettings][dump::kPath] == nullptr) {
MS_LOG(ERROR) << "\"path\" is required.";
return RET_ERROR;
}
// Optional keys: fill in documented defaults.
if (dump_cfg_json_[dump::kSettings][dump::kNetName] == nullptr) {
dump_cfg_json_[dump::kSettings][dump::kNetName] = "Default";
}
if (dump_cfg_json_[dump::kSettings][dump::kInputOutput] == nullptr) {
dump_cfg_json_[dump::kSettings][dump::kInputOutput] = 0;
}
// A non-empty kernel list only makes sense in selective-dump mode (1).
if (dump_cfg_json_[dump::kSettings][dump::kKernels] != nullptr &&
!dump_cfg_json_[dump::kSettings][dump::kKernels].empty()) {
if (dump_cfg_json_[dump::kSettings][dump::kMode] == 0) {
MS_LOG(ERROR) << R"("dump_mode" should be 1 when "kernels" isn't empty.)";
return RET_ERROR;
}
}
// Output directory is <path>/<net_name>, inserting a separator if needed.
auto abs_path = dump_cfg_json_[dump::kSettings][dump::kPath].get<std::string>();
auto net_name = dump_cfg_json_[dump::kSettings][dump::kNetName].get<std::string>();
if (abs_path.back() == '\\' || abs_path.back() == '/') {
dump_file_output_dir_ = abs_path + net_name;
} else {
#ifdef _WIN32
dump_file_output_dir_ = abs_path + "\\" + net_name;
#else
dump_file_output_dir_ = abs_path + "/" + net_name;
#endif
}
auto status = CreateOutputDir(&dump_file_output_dir_);
if (status != RET_OK) {
MS_LOG(ERROR) << "create data output directory failed.";
return RET_ERROR;
}
return RET_OK;
}
// Installs at most one callback pair, chosen by flag priority:
// time profiling > perf profiling > tensor printing > tensor dumping.
int BenchmarkBase::InitCallbackParameter() {
  if (flags_->time_profiling_) {
    return InitTimeProfilingCallbackParameter();
  }
  if (flags_->perf_profiling_) {
    return InitPerfProfilingCallbackParameter();
  }
  if (flags_->print_tensor_data_) {
    return InitPrintTensorDataCallbackParameter();
  }
  if (flags_->dump_tensor_data_) {
    return InitDumpTensorDataCallbackParameter();
  }
  return RET_OK;
}
// Validates the parsed flags, echoes the effective configuration to the log
// and stdout, and initializes derived state: input data type, calibration
// data type, input/resize lists, optional dump configuration (from the
// MINDSPORE_DUMP_CONFIG env var) and session callbacks.
// Returns RET_OK on success, a non-zero value otherwise.
int BenchmarkBase::Init() {
  if (this->flags_ == nullptr) {
    return 1;
  }
  MS_LOG(INFO) << "ModelPath = " << this->flags_->model_file_;
  MS_LOG(INFO) << "InDataPath = " << this->flags_->in_data_file_;
  MS_LOG(INFO) << "InDataType = " << this->flags_->in_data_type_in_;
  MS_LOG(INFO) << "LoopCount = " << this->flags_->loop_count_;
  MS_LOG(INFO) << "DeviceType = " << this->flags_->device_;
  MS_LOG(INFO) << "AccuracyThreshold = " << this->flags_->accuracy_threshold_;
  MS_LOG(INFO) << "WarmUpLoopCount = " << this->flags_->warm_up_loop_count_;
  MS_LOG(INFO) << "NumThreads = " << this->flags_->num_threads_;
  MS_LOG(INFO) << "Fp16Priority = " << this->flags_->enable_fp16_;
  MS_LOG(INFO) << "EnableParallel = " << this->flags_->enable_parallel_;
  MS_LOG(INFO) << "calibDataPath = " << this->flags_->benchmark_data_file_;
  std::cout << "ModelPath = " << this->flags_->model_file_ << std::endl;
  std::cout << "InDataPath = " << this->flags_->in_data_file_ << std::endl;
  std::cout << "InDataType = " << this->flags_->in_data_type_in_ << std::endl;
  std::cout << "LoopCount = " << this->flags_->loop_count_ << std::endl;
  std::cout << "DeviceType = " << this->flags_->device_ << std::endl;
  std::cout << "AccuracyThreshold = " << this->flags_->accuracy_threshold_ << std::endl;
  std::cout << "WarmUpLoopCount = " << this->flags_->warm_up_loop_count_ << std::endl;
  std::cout << "NumThreads = " << this->flags_->num_threads_ << std::endl;
  std::cout << "Fp16Priority = " << this->flags_->enable_fp16_ << std::endl;
  std::cout << "EnableParallel = " << this->flags_->enable_parallel_ << std::endl;
  std::cout << "calibDataPath = " << this->flags_->benchmark_data_file_ << std::endl;
  if (this->flags_->loop_count_ < 1) {
    MS_LOG(ERROR) << "LoopCount:" << this->flags_->loop_count_ << " must be greater than 0";
    std::cerr << "LoopCount:" << this->flags_->loop_count_ << " must be greater than 0" << std::endl;
    return RET_ERROR;
  }
  auto thread_ret = CheckThreadNumValid();
  if (thread_ret != RET_OK) {
    MS_LOG(ERROR) << "Invalid numThreads.";
    std::cerr << "Invalid numThreads." << std::endl;
    return RET_ERROR;
  }
  static std::vector<std::string> CPU_BIND_MODE_MAP = {"NO_BIND", "HIGHER_CPU", "MID_CPU"};
  // Fixed: cpu_bind_mode_ comes straight from the command line and the
  // original only guarded the lower bound, so any value > 2 indexed past the
  // end of CPU_BIND_MODE_MAP (undefined behavior). Guard both bounds and
  // fall back to the NO_BIND message for out-of-range values.
  if (this->flags_->cpu_bind_mode_ >= 1 &&
      this->flags_->cpu_bind_mode_ < static_cast<int>(CPU_BIND_MODE_MAP.size())) {
    MS_LOG(INFO) << "cpuBindMode = " << CPU_BIND_MODE_MAP[this->flags_->cpu_bind_mode_];
    std::cout << "cpuBindMode = " << CPU_BIND_MODE_MAP[this->flags_->cpu_bind_mode_] << std::endl;
  } else {
    MS_LOG(INFO) << "cpuBindMode = NO_BIND";
    std::cout << "cpuBindMode = NO_BIND" << std::endl;
  }
  this->flags_->in_data_type_ = this->flags_->in_data_type_in_ == "img" ? kImage : kBinary;
  if (!flags_->benchmark_data_type_.empty()) {
    if (data_type_map_.find(flags_->benchmark_data_type_) == data_type_map_.end()) {
      MS_LOG(ERROR) << "CalibDataType not supported: " << flags_->benchmark_data_type_.c_str();
      return RET_ERROR;
    }
    msCalibDataType = data_type_map_.at(flags_->benchmark_data_type_);
    MS_LOG(INFO) << "CalibDataType = " << flags_->benchmark_data_type_.c_str();
    std::cout << "CalibDataType = " << flags_->benchmark_data_type_.c_str() << std::endl;
  }
  if (flags_->model_file_.empty()) {
    MS_LOG(ERROR) << "modelPath is required";
    std::cerr << "modelPath is required" << std::endl;
    return 1;  // NOTE(review): other failures return RET_ERROR; kept as 1 for existing callers
  }
  flags_->InitInputDataList();
  flags_->InitResizeDimsList();
  if (!flags_->resize_dims_.empty() && !flags_->input_data_list_.empty() &&
      flags_->resize_dims_.size() != flags_->input_data_list_.size()) {
    MS_LOG(ERROR) << "Size of input resizeDims should be equal to size of input inDataPath";
    std::cerr << "Size of input resizeDims should be equal to size of input inDataPath" << std::endl;
    return RET_ERROR;
  }
  if (flags_->device_ != "CPU" && flags_->device_ != "GPU" && flags_->device_ != "NPU") {
    MS_LOG(ERROR) << "Device type:" << flags_->device_ << " is not supported.";
    std::cerr << "Device type:" << flags_->device_ << " is not supported." << std::endl;
    return RET_ERROR;
  }
  if (flags_->time_profiling_ && flags_->perf_profiling_) {
    MS_LOG(INFO) << "time_profiling is enabled, will not run perf_profiling.";
  }
  // get dump data output path
  auto dump_cfg_path = std::getenv(dump::kConfigPath);
  if (dump_cfg_path != nullptr) {
    flags_->dump_tensor_data_ = true;
    if (InitDumpConfigFromJson(dump_cfg_path) != RET_OK) {
      MS_LOG(ERROR) << "parse dump config file failed.";
      return RET_ERROR;
    }
  } else {
    MS_LOG(INFO) << "No MINDSPORE_DUMP_CONFIG in env, don't need to dump data";
  }
  auto status = InitCallbackParameter();
  if (status != RET_OK) {
    MS_LOG(ERROR) << "Init callback Parameter failed.";
    std::cerr << "Init callback Parameter failed." << std::endl;
    return RET_ERROR;
  }
  return RET_OK;
}
// Renders a time-profiling summary table: per-entry average time, share of
// total cost, call count and total time. Column widths adapt to the data.
// NOTE(review): assumes `title` has at least 5 entries and that loop_count_
// and op_cost_total_ are non-zero — confirm at call sites.
int BenchmarkBase::PrintResult(const std::vector<std::string> &title,
const std::map<std::string, std::pair<int, float>> &result) {
std::vector<size_t> columnLenMax(5);
std::vector<std::vector<std::string>> rows;
for (auto &iter : result) {
char stringBuf[5][100] = {};
std::vector<std::string> columns;
size_t len = 0;
len = iter.first.size();
if (len > columnLenMax.at(0)) {
columnLenMax.at(0) = len + 4;
}
columns.push_back(iter.first);
// average cost per loop iteration
len =
snprintf(stringBuf[1], sizeof(stringBuf[1]), "%f", iter.second.second / static_cast<float>(flags_->loop_count_));
if (len > columnLenMax.at(1)) {
columnLenMax.at(1) = len + 4;
}
columns.emplace_back(stringBuf[1]);
// share of the total measured cost
len = snprintf(stringBuf[2], sizeof(stringBuf[2]), "%f", iter.second.second / op_cost_total_);
if (len > columnLenMax.at(2)) {
columnLenMax.at(2) = len + 4;
}
columns.emplace_back(stringBuf[2]);
// call count
len = snprintf(stringBuf[3], sizeof(stringBuf[3]), "%d", iter.second.first);
if (len > columnLenMax.at(3)) {
columnLenMax.at(3) = len + 4;
}
columns.emplace_back(stringBuf[3]);
// total cost
len = snprintf(stringBuf[4], sizeof(stringBuf[4]), "%f", iter.second.second);
if (len > columnLenMax.at(4)) {
columnLenMax.at(4) = len + 4;
}
columns.emplace_back(stringBuf[4]);
rows.push_back(columns);
}
printf("-------------------------------------------------------------------------\n");
for (int i = 0; i < 5; i++) {
auto printBuf = title[i];
if (printBuf.size() > columnLenMax.at(i)) {
columnLenMax.at(i) = printBuf.size();
}
printBuf.resize(columnLenMax.at(i), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
for (auto &row : rows) {
for (int j = 0; j < 5; j++) {
auto printBuf = row[j];
printBuf.resize(columnLenMax.at(j), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
}
return RET_OK;
}
#ifdef ENABLE_ARM64
// Renders the perf-event profiling table (ARM64 only): two hardware-counter
// columns, each shown as a per-loop value (scaled by thread count, /1000)
// and as a share of the run total.
// NOTE(review): assumes `title` has at least 5 entries and op_cost_total_ /
// op_cost2_total_ are non-zero — confirm at call sites.
int BenchmarkBase::PrintPerfResult(const std::vector<std::string> &title,
const std::map<std::string, std::pair<int, struct PerfCount>> &result) {
std::vector<size_t> columnLenMax(5);
std::vector<std::vector<std::string>> rows;
for (auto &iter : result) {
char stringBuf[5][100] = {};
std::vector<std::string> columns;
size_t len = 0;
len = iter.first.size();
if (len > columnLenMax.at(0)) {
columnLenMax.at(0) = len + 4;
}
columns.push_back(iter.first);
// counter 0: per-loop value, normalized by thread count (in thousands)
float tmp = float_t(flags_->num_threads_) * iter.second.second.value[0] / float_t(flags_->loop_count_) / 1000.0f;
len = snprintf(stringBuf[1], sizeof(stringBuf[1]), "%.2f", tmp);
if (len > columnLenMax.at(1)) {
columnLenMax.at(1) = len + 4;
}
columns.emplace_back(stringBuf[1]);
len = snprintf(stringBuf[2], sizeof(stringBuf[2]), "%f", iter.second.second.value[0] / op_cost_total_);
if (len > columnLenMax.at(2)) {
columnLenMax.at(2) = len + 4;
}
columns.emplace_back(stringBuf[2]);
// counter 1: same normalization as counter 0
tmp = float_t(flags_->num_threads_) * iter.second.second.value[1] / float_t(flags_->loop_count_) / 1000.0f;
len = snprintf(stringBuf[3], sizeof(stringBuf[3]), "%.2f", tmp);
if (len > columnLenMax.at(3)) {
columnLenMax.at(3) = len + 4;
}
columns.emplace_back(stringBuf[3]);
len = snprintf(stringBuf[4], sizeof(stringBuf[4]), "%f", iter.second.second.value[1] / op_cost2_total_);
if (len > columnLenMax.at(4)) {
columnLenMax.at(4) = len + 4;
}
columns.emplace_back(stringBuf[4]);
rows.push_back(columns);
}
printf("-------------------------------------------------------------------------\n");
for (int i = 0; i < 5; i++) {
auto printBuf = title[i];
if (printBuf.size() > columnLenMax.at(i)) {
columnLenMax.at(i) = printBuf.size();
}
printBuf.resize(columnLenMax.at(i), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
for (auto &row : rows) {
for (int j = 0; j < 5; j++) {
auto printBuf = row[j];
printBuf.resize(columnLenMax.at(j), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
}
return RET_OK;
}
#endif
#ifdef SUPPORT_NNIE
// Initializes the HiSilicon SVP/MPP runtime for NNIE targets: resets any
// previous SYS/VB state, configures the video-buffer pool, then brings VB
// and SYS back up. Returns RET_OK on success.
int SvpSysInit() {
HI_S32 ret = HI_SUCCESS;
VB_CONFIG_S struVbConf;
// Tear down any leftover state from a previous run before reconfiguring.
HI_MPI_SYS_Exit();
HI_MPI_VB_Exit();
memset(&struVbConf, 0, sizeof(VB_CONFIG_S));
// NOTE(review): u32MaxPoolCnt is 2 but only astCommPool[1] is configured;
// pool 0 stays zeroed from the memset — confirm this is intentional.
struVbConf.u32MaxPoolCnt = 2;
struVbConf.astCommPool[1].u64BlkSize = 768 * 576 * 2;
struVbConf.astCommPool[1].u32BlkCnt = 1;
ret = HI_MPI_VB_SetConfig((const VB_CONFIG_S *)&struVbConf);
if (HI_SUCCESS != ret) {
MS_LOG(ERROR) << "Error:HI_MPI_VB_SetConf failed!";
return RET_ERROR;
}
ret = HI_MPI_VB_Init();
if (HI_SUCCESS != ret) {
MS_LOG(ERROR) << "Error:HI_MPI_VB_Init failed!";
return RET_ERROR;
}
ret = HI_MPI_SYS_Init();
if (HI_SUCCESS != ret) {
MS_LOG(ERROR) << "Error:HI_MPI_SYS_Init failed!";
return RET_ERROR;
}
return RET_OK;
}
// Shuts down the HiSilicon MPP system and video-buffer modules.
// Returns RET_OK on success, RET_ERROR on the first failing call.
int SvpSysExit() {
  if (HI_MPI_SYS_Exit() != HI_SUCCESS) {
    MS_LOG(ERROR) << "Error:HI_MPI_SYS_Exit failed!";
    return RET_ERROR;
  }
  if (HI_MPI_VB_Exit() != HI_SUCCESS) {
    MS_LOG(ERROR) << "Error:HI_MPI_VB_Exit failed!";
    return RET_ERROR;
  }
  return RET_OK;
}
#endif
// Releases the owned calibration tensors and, on NNIE targets, shuts down
// the SVP runtime.
BenchmarkBase::~BenchmarkBase() {
  for (auto &entry : benchmark_data_) {
    delete entry.second;
  }
  benchmark_data_.clear();
#ifdef SUPPORT_NNIE
  SvpSysExit();
#endif
}
} // namespace lite
} // namespace mindspore

View File

@ -0,0 +1,316 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINNIE_BENCHMARK_BENCHMARK_BASE_H_
#define MINNIE_BENCHMARK_BENCHMARK_BASE_H_
#include <getopt.h>
#include <signal.h>
#include <random>
#include <unordered_map>
#include <fstream>
#include <iostream>
#include <map>
#include <cmath>
#include <string>
#include <vector>
#include <memory>
#include <cfloat>
#include <utility>
#include <nlohmann/json.hpp>
#include "include/model.h"
#include "tools/common/flag_parser.h"
#include "src/common/file_utils.h"
#include "src/common/utils.h"
#include "ir/dtype/type_id.h"
#include "schema/model_generated.h"
namespace mindspore::lite {
// How -inDataFile contents are interpreted: raw image vs. raw binary blob.
enum MS_API InDataType { kImage = 0, kBinary = 1 };
// Tolerances used by CompareData: an element passes when its absolute error
// is within absoluteTolerance + relativeTolerance * |expected|.
constexpr float relativeTolerance = 1e-5;
constexpr float absoluteTolerance = 1e-8;
// Maximum number of string-tensor entries echoed to stdout.
constexpr int kNumPrintMin = 5;
// Separators used when parsing the command-line flag values.
constexpr const char *DELIM_COLON = ":";
constexpr const char *DELIM_COMMA = ",";
constexpr const char *DELIM_SLASH = "/";
extern const std::unordered_map<int, std::string> TYPE_ID_MAP;
extern const std::unordered_map<schema::Format, std::string> TENSOR_FORMAT_MAP;
// Keys of the dump-configuration JSON referenced by MINDSPORE_DUMP_CONFIG.
namespace dump {
constexpr auto kConfigPath = "MINDSPORE_DUMP_CONFIG";
constexpr auto kSettings = "common_dump_settings";
constexpr auto kMode = "dump_mode";
constexpr auto kPath = "path";
constexpr auto kNetName = "net_name";
constexpr auto kInputOutput = "input_output";
constexpr auto kKernels = "kernels";
}  // namespace dump
#ifdef ENABLE_ARM64
// Raw layout returned by a perf_event read of a 2-counter group.
struct PerfResult {
int64_t nr;
struct {
int64_t value;
int64_t id;
} values[2];
};
// Accumulated values of the two perf counters for one op.
struct PerfCount {
int64_t value[2];
};
#endif
struct MS_API CheckTensor {
CheckTensor(const std::vector<size_t> &shape, const std::vector<float> &data,
const std::vector<std::string> &strings_data = {""}) {
this->shape = shape;
this->data = data;
this->strings_data = strings_data;
}
std::vector<size_t> shape;
std::vector<float> data;
std::vector<std::string> strings_data;
};
// Command-line flags for the benchmark tool. The constructor registers every
// flag with the shared FlagParser; the public fields below hold the parsed
// values plus the lists derived by InitInputDataList/InitResizeDimsList.
class MS_API BenchmarkFlags : public virtual FlagParser {
public:
BenchmarkFlags() {
// common
AddFlag(&BenchmarkFlags::model_file_, "modelFile", "Input model file", "");
AddFlag(&BenchmarkFlags::in_data_file_, "inDataFile", "Input data file, if not set, use random input", "");
AddFlag(&BenchmarkFlags::device_, "device", "CPU | GPU | NPU", "CPU");
AddFlag(&BenchmarkFlags::cpu_bind_mode_, "cpuBindMode",
"Input 0 for NO_BIND, 1 for HIGHER_CPU, 2 for MID_CPU, default value: 1", 1);
// MarkPerformance
AddFlag(&BenchmarkFlags::loop_count_, "loopCount", "Run loop count", 10);
AddFlag(&BenchmarkFlags::num_threads_, "numThreads", "Run threads number", 2);
AddFlag(&BenchmarkFlags::enable_fp16_, "enableFp16", "Enable float16", false);
AddFlag(&BenchmarkFlags::enable_parallel_, "enableParallel", "Enable subgraph parallel : true | false", false);
AddFlag(&BenchmarkFlags::warm_up_loop_count_, "warmUpLoopCount", "Run warm up loop", 3);
AddFlag(&BenchmarkFlags::time_profiling_, "timeProfiling", "Run time profiling", false);
AddFlag(&BenchmarkFlags::perf_profiling_, "perfProfiling",
"Perf event profiling(only instructions statics enabled currently)", false);
AddFlag(&BenchmarkFlags::perf_event_, "perfEvent", "CYCLE|CACHE|STALL", "CYCLE");
// MarkAccuracy
AddFlag(&BenchmarkFlags::benchmark_data_file_, "benchmarkDataFile", "Benchmark data file path", "");
AddFlag(&BenchmarkFlags::benchmark_data_type_, "benchmarkDataType",
"Benchmark data type. FLOAT | INT32 | INT8 | UINT8", "FLOAT");
AddFlag(&BenchmarkFlags::accuracy_threshold_, "accuracyThreshold", "Threshold of accuracy", 0.5);
AddFlag(&BenchmarkFlags::resize_dims_in_, "inputShapes",
"Shape of input data, the format should be NHWC. e.g. 1,32,32,32:1,1,32,32,1", "");
}
~BenchmarkFlags() override = default;
// Splits the comma-separated in_data_file_ into input_data_list_.
void InitInputDataList();
// Parses resize_dims_in_ ("n,h,w,c:...") into resize_dims_.
void InitResizeDimsList();
public:
// common
std::string model_file_;
std::string in_data_file_;
std::vector<std::string> input_data_list_;
InDataType in_data_type_ = kBinary;
std::string in_data_type_in_ = "bin";
int cpu_bind_mode_ = 1;
// MarkPerformance
int loop_count_ = 10;
int num_threads_ = 2;
bool enable_fp16_ = false;
bool enable_parallel_ = false;
int warm_up_loop_count_ = 3;
// MarkAccuracy
std::string benchmark_data_file_;
std::string benchmark_data_type_ = "FLOAT";
float accuracy_threshold_ = 0.5;
// Resize
std::string resize_dims_in_;
std::vector<std::vector<int>> resize_dims_;
std::string device_ = "CPU";
bool time_profiling_ = false;
bool perf_profiling_ = false;
std::string perf_event_ = "CYCLE";
// set from the MINDSPORE_DUMP_CONFIG env var, not from a flag
bool dump_tensor_data_ = false;
bool print_tensor_data_ = false;
};
// Abstract benchmark driver. Owns flag validation, calibration-data loading
// and result comparison; subclasses bind a concrete runtime (LiteSession or
// the unified C++ API) through the pure-virtual hooks below.
class MS_API BenchmarkBase {
 public:
  explicit BenchmarkBase(BenchmarkFlags *flags) : flags_(flags) {}

  virtual ~BenchmarkBase();

  // Validates flags, prints the effective configuration and prepares
  // callbacks and the dump configuration. Must be called before RunBenchmark.
  int Init();
  virtual int RunBenchmark() = 0;

 protected:
  // Fills model inputs from file(s) or random data, depending on flags.
  int LoadInput();
  virtual int GenerateInputData() = 0;
  // Fills `size` bytes at `data` with random values suited to `data_type`.
  int GenerateRandomData(size_t size, void *data, int data_type);
  virtual int ReadInputFile() = 0;
  // Reads the golden-output file; per-tensor payloads go to ReadTensorData.
  int ReadCalibData();
  virtual int ReadTensorData(std::ifstream &in_file_stream, const std::string &tensor_name,
                             const std::vector<size_t> &dims) = 0;
  virtual int CompareOutput() = 0;
  // Compares a string tensor against the calibration strings for `name`.
  int CompareStringData(const std::string &name, tensor::MSTensor *tensor);
  int InitDumpConfigFromJson(char *path);
  // Installs at most one callback pair, chosen by flag priority.
  int InitCallbackParameter();
  virtual int InitTimeProfilingCallbackParameter() = 0;
  virtual int InitPerfProfilingCallbackParameter() = 0;
  virtual int InitDumpTensorDataCallbackParameter() = 0;
  virtual int InitPrintTensorDataCallbackParameter() = 0;
  int PrintResult(const std::vector<std::string> &title, const std::map<std::string, std::pair<int, float>> &result);
#ifdef ENABLE_ARM64
  int PrintPerfResult(const std::vector<std::string> &title,
                      const std::map<std::string, std::pair<int, struct PerfCount>> &result);
#endif

  // Compares one output tensor element-wise against the calibration baseline
  // for `nodeName` (tensor data must be converted to the calib layout first).
  // Returns the mean bias, or RET_ERROR cast to float on shape mismatch,
  // NaN/Inf output, or an unknown node.
  template <typename T, typename ST>
  float CompareData(const std::string &nodeName, const std::vector<ST> &msShape, const void *tensor_data) {
    const T *msTensorData = static_cast<const T *>(tensor_data);
    auto iter = this->benchmark_data_.find(nodeName);
    if (iter == this->benchmark_data_.end()) {
      // Fixed: the original `MS_LOG(INFO) << "%s is not ...", nodeName.c_str();`
      // used the comma operator — it logged the literal "%s" and discarded
      // the node name.
      MS_LOG(INFO) << nodeName << " is not in Source Model output";
      return RET_ERROR;
    }
    std::vector<size_t> castedMSShape;
    size_t shapeSize = 1;
    for (int64_t dim : msShape) {
      castedMSShape.push_back(size_t(dim));
      shapeSize *= dim;
    }
    CheckTensor *calibTensor = iter->second;
    if (calibTensor->shape != castedMSShape) {
      std::ostringstream oss;
      oss << "Shape of mslite output(";
      for (auto dim : castedMSShape) {
        oss << dim << ",";
      }
      oss << ") and shape source model output(";
      for (auto dim : calibTensor->shape) {
        oss << dim << ",";
      }
      oss << ") are different";
      std::cerr << oss.str() << std::endl;
      MS_LOG(ERROR) << oss.str().c_str();
      return RET_ERROR;
    }
    size_t errorCount = 0;
    float meanError = 0;
    std::cout << "Data of node " << nodeName << " : ";
    for (size_t j = 0; j < shapeSize; j++) {
      if (j < 50) {  // echo only the first 50 elements
        std::cout << static_cast<float>(msTensorData[j]) << " ";
      }
      if (std::isnan(msTensorData[j]) || std::isinf(msTensorData[j])) {
        std::cerr << "Output tensor has nan or inf data, compare fail" << std::endl;
        MS_LOG(ERROR) << "Output tensor has nan or inf data, compare fail";
        return RET_ERROR;
      }
      auto tolerance = absoluteTolerance + relativeTolerance * fabs(calibTensor->data.at(j));
      auto absoluteError = std::fabs(msTensorData[j] - calibTensor->data.at(j));
      if (absoluteError > tolerance) {
        if (fabs(calibTensor->data.at(j) - 0.0f) < FLT_EPSILON) {
          // Expected value is (near) zero: relative error is meaningless,
          // use a fixed absolute threshold instead.
          if (absoluteError > 1e-5) {
            meanError += absoluteError;
            errorCount++;
          } else {
            continue;
          }
        } else {
          // just assume that atol = rtol
          meanError += absoluteError / (fabs(calibTensor->data.at(j)) + FLT_MIN);
          errorCount++;
        }
      }
    }
    std::cout << std::endl;
    if (meanError > 0.0f) {
      meanError /= errorCount;
    }
    if (meanError <= 0.0000001) {
      std::cout << "Mean bias of node/tensor " << nodeName << " : 0%" << std::endl;
    } else {
      std::cout << "Mean bias of node/tensor " << nodeName << " : " << meanError * 100 << "%" << std::endl;
    }
    return meanError;
  }

  // Fills `size` bytes at `data` with values drawn from `distribution`,
  // narrowed to element type T.
  template <typename T, typename Distribution>
  void FillInputData(int size, void *data, Distribution distribution) {
    MS_ASSERT(data != nullptr);
    int elements_num = size / sizeof(T);
    (void)std::generate_n(static_cast<T *>(data), elements_num,
                          [&]() { return static_cast<T>(distribution(random_engine_)); });
  }

  int CheckThreadNumValid();

 protected:
  BenchmarkFlags *flags_;  // not owned
  // Golden outputs keyed by node/tensor name; owned, freed in the destructor.
  std::unordered_map<std::string, CheckTensor *> benchmark_data_;
  std::unordered_map<std::string, int> data_type_map_{
      {"FLOAT", kNumberTypeFloat}, {"INT8", kNumberTypeInt8}, {"INT32", kNumberTypeInt32}, {"UINT8", kNumberTypeUInt8}};
  int msCalibDataType = kNumberTypeFloat;
  // callback parameters
  uint64_t op_begin_ = 0;
  int op_call_times_total_ = 0;
  float op_cost_total_ = 0.0f;
  std::map<std::string, std::pair<int, float>> op_times_by_type_;
  std::map<std::string, std::pair<int, float>> op_times_by_name_;
  // dump data
  nlohmann::json dump_cfg_json_;
  std::string dump_file_output_dir_;
#ifdef ENABLE_ARM64
  int perf_fd = 0;
  int perf_fd2 = 0;
  float op_cost2_total_ = 0.0f;
  std::map<std::string, std::pair<int, struct PerfCount>> op_perf_by_type_;
  std::map<std::string, std::pair<int, struct PerfCount>> op_perf_by_name_;
#endif
  std::mt19937 random_engine_;
};
} // namespace mindspore::lite
#endif // MINNIE_BENCHMARK_BENCHMARK_BASE_H_

View File

@ -0,0 +1,828 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tools/benchmark/benchmark_unified_api.h"
#define __STDC_FORMAT_MACROS
#include <cinttypes>
#undef __STDC_FORMAT_MACROS
#include <algorithm>
#include <utility>
#include <functional>
#include "include/context.h"
#include "include/ms_tensor.h"
#include "include/version.h"
#include "schema/model_generated.h"
#include "src/common/common.h"
#include "src/tensor.h"
#ifdef ENABLE_ARM64
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <asm/unistd.h>
#include <unistd.h>
#endif
#ifdef SUPPORT_NNIE
#include "include/hi_common.h"
#include "include/hi_comm_vb.h"
#include "include/mpi_sys.h"
#include "include/mpi_vb.h"
#endif
namespace mindspore {
namespace lite {
int BenchmarkUnifiedApi::GenerateInputData() {
  // Populate every model input with random values of the tensor's own dtype.
  // String tensors are rejected: there is no random generator for them here.
  for (auto in_tensor : ms_inputs_for_api_) {
    MS_ASSERT(in_tensor != nullptr);
    auto *buf = in_tensor.MutableData();
    if (buf == nullptr) {
      MS_LOG(ERROR) << "MallocData for inTensor failed";
      return RET_ERROR;
    }
    if (static_cast<int>(in_tensor.DataType()) == kObjectTypeString) {
      std::cerr << "Unsupported kObjectTypeString:" << std::endl;
      MS_LOG(ERROR) << "Unsupported kObjectTypeString:";
      return RET_ERROR;
      // status = StringsToMSTensor({"you're the best."}, tensor);
    }
    int status = GenerateRandomData(in_tensor.DataSize(), buf, static_cast<int>(in_tensor.DataType()));
    if (status != RET_OK) {
      std::cerr << "GenerateRandomData for inTensor failed: " << status << std::endl;
      MS_LOG(ERROR) << "GenerateRandomData for inTensor failed:" << status;
      return status;
    }
  }
  return RET_OK;
}
int BenchmarkUnifiedApi::ReadInputFile() {
if (ms_inputs_for_api_.empty()) {
return RET_OK;
}
if (this->flags_->in_data_type_ == kImage) {
MS_LOG(ERROR) << "Not supported image input";
return RET_ERROR;
} else {
for (size_t i = 0; i < flags_->input_data_list_.size(); i++) {
auto cur_tensor = ms_inputs_for_api_.at(i);
MS_ASSERT(cur_tensor != nullptr);
size_t size;
char *bin_buf = ReadFile(flags_->input_data_list_[i].c_str(), &size);
if (bin_buf == nullptr) {
MS_LOG(ERROR) << "ReadFile return nullptr";
return RET_ERROR;
}
if (static_cast<int>(cur_tensor.DataType()) == kObjectTypeString) {
std::cerr << "Unsupported kObjectTypeString:" << std::endl;
MS_LOG(ERROR) << "Unsupported kObjectTypeString:";
return RET_ERROR;
} else {
auto tensor_data_size = cur_tensor.DataSize();
if (size != tensor_data_size) {
std::cerr << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size
<< std::endl;
MS_LOG(ERROR) << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size;
delete[] bin_buf;
return RET_ERROR;
}
auto input_data = cur_tensor.MutableData();
if (input_data == nullptr) {
MS_LOG(ERROR) << "input_data is nullptr.";
return RET_ERROR;
}
memcpy(input_data, bin_buf, tensor_data_size);
}
delete[] bin_buf;
}
}
return RET_OK;
}
int BenchmarkUnifiedApi::ReadTensorData(std::ifstream &in_file_stream, const std::string &tensor_name,
                                        const std::vector<size_t> &dims) {
  // Parse one calibration record from the open calib-data stream and cache it
  // in benchmark_data_ keyed by tensor name.
  std::string line;
  // One data line is consumed unconditionally, so the stream position advances
  // even when the tensor is already cached and we return early below.
  getline(in_file_stream, line);
  std::stringstream line_stream(line);
  if (this->benchmark_data_.find(tensor_name) != this->benchmark_data_.end()) {
    return RET_OK;
  }
  mindspore::MSTensor tensor = GetMSTensorByNameOrShape(tensor_name, dims);
  if (tensor == nullptr) {
    MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
    return RET_ERROR;
  }
  std::vector<float> data;
  std::vector<std::string> strings_data;
  // Total element count implied by the recorded dims.
  size_t shape_size = std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<size_t>());
  if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
    // String tensors: one line per element; the line read above is element 0.
    strings_data.push_back(line);
    for (size_t i = 1; i < shape_size; i++) {
      getline(in_file_stream, line);
      strings_data.push_back(line);
    }
  } else {
    // Numeric tensors: all values sit whitespace-separated on the single line.
    for (size_t i = 0; i < shape_size; i++) {
      float tmp_data;
      line_stream >> tmp_data;
      data.push_back(tmp_data);
    }
  }
  auto *check_tensor = new (std::nothrow) CheckTensor(dims, data, strings_data);
  if (check_tensor == nullptr) {
    MS_LOG(ERROR) << "New CheckTensor failed, tensor name: " << tensor_name;
    return RET_ERROR;
  }
  // Ownership transfers to benchmark_data_; entries are deleted in RunBenchmark
  // after MarkAccuracy finishes.
  this->benchmark_data_.insert(std::make_pair(tensor_name, check_tensor));
  return RET_OK;
}
void BenchmarkUnifiedApi::InitMSContext(const std::shared_ptr<mindspore::Context> &context) {
  // Thread and affinity settings come straight from the command-line flags.
  context->SetThreadNum(flags_->num_threads_);
  context->SetEnableParallel(flags_->enable_parallel_);
  context->SetThreadAffinity(flags_->cpu_bind_mode_);

  // CPU is always registered and stays first in the device list; GPU/NPU are
  // appended behind it when requested via --device.
  auto &devices = context->MutableDeviceInfo();
  auto cpu_info = std::make_shared<CPUDeviceInfo>();
  cpu_info->SetEnableFP16(flags_->enable_fp16_);
  devices.push_back(cpu_info);
  if (flags_->device_ == "GPU") {
    auto gpu_info = std::make_shared<MaliGPUDeviceInfo>();
    gpu_info->SetEnableFP16(flags_->enable_fp16_);
    devices.push_back(gpu_info);
  }
  if (flags_->device_ == "NPU") {
    auto npu_info = std::make_shared<KirinNPUDeviceInfo>();
    npu_info->SetFrequency(3);  // NOTE(review): 3 presumably selects high frequency — confirm enum meaning
    devices.push_back(npu_info);
  }
}
int BenchmarkUnifiedApi::CompareOutput() {
std::cout << "================ Comparing Output data ================" << std::endl;
float total_bias = 0;
int total_size = 0;
for (const auto &calib_tensor : benchmark_data_) {
std::string node_or_tensor_name = calib_tensor.first;
mindspore::MSTensor tensor = GetMSTensorByNameOrShape(node_or_tensor_name, calib_tensor.second->shape);
if (tensor == nullptr) {
MS_LOG(ERROR) << "Get tensor failed, tensor name: " << node_or_tensor_name;
return RET_ERROR;
}
int ret;
if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
std::cerr << "Unsupported kObjectTypeString:" << std::endl;
MS_LOG(ERROR) << "Unsupported kObjectTypeString:";
return RET_ERROR;
// ret = CompareStringData(node_or_tensor_name, tensor);
} else {
ret = CompareDataGetTotalBiasAndSize(node_or_tensor_name, &tensor, &total_bias, &total_size);
}
if (ret != RET_OK) {
MS_LOG(ERROR) << "Error in CompareData";
std::cerr << "Error in CompareData" << std::endl;
std::cout << "=======================================================" << std::endl << std::endl;
return ret;
}
}
float mean_bias;
if (total_size != 0) {
mean_bias = total_bias / float_t(total_size) * 100;
} else {
mean_bias = 0;
}
std::cout << "Mean bias of all nodes/tensors: " << mean_bias << "%" << std::endl;
std::cout << "=======================================================" << std::endl << std::endl;
if (mean_bias > this->flags_->accuracy_threshold_) {
MS_LOG(ERROR) << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%";
std::cerr << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%" << std::endl;
return RET_ERROR;
}
return RET_OK;
}
mindspore::MSTensor BenchmarkUnifiedApi::GetMSTensorByNodeShape(const std::vector<size_t> &node_shape) {
  // Resolve a model output tensor by matching its shape against `node_shape`.
  // Used as a last resort when name lookup fails.
  std::vector<mindspore::MSTensor> match_tensors;
  std::vector<int64_t> shape_vector = ConverterToInt64Vector<size_t>(node_shape);
  auto tensors = ms_model_.GetOutputs();
  for (auto &out_tensor_pair : tensors) {
    if (out_tensor_pair.Shape() == shape_vector) {
      match_tensors.emplace_back(out_tensor_pair);
    }
  }
  // fix: front() on an empty vector was undefined behavior when no output matched.
  if (match_tensors.empty()) {
    MS_LOG(ERROR) << "Cannot find output tensor with the given shape.";
    return mindspore::MSTensor();  // null tensor; callers already check `== nullptr`
  }
  if (match_tensors.size() > 1) {
    MS_LOG(WARNING) << "More than one output tensor matches the shape; using the first one.";
  }
  return match_tensors.front();
}
mindspore::MSTensor BenchmarkUnifiedApi::GetMSTensorByNameOrShape(const std::string &node_or_tensor_name,
                                                                  const std::vector<size_t> &dims) {
  // Resolution order: node name (only when it has exactly one output tensor),
  // then tensor name, then plain shape matching as a last resort.
  auto node_outputs = ms_model_.GetOutputsByNodeName(node_or_tensor_name);
  if (node_outputs.size() == 1) {
    return node_outputs.front();
  }
  MS_LOG(INFO) << "Cannot find output node: " << node_or_tensor_name
               << " or node has more than one output tensor, switch to GetOutputByTensorName";
  auto tensor = ms_model_.GetOutputByTensorName(node_or_tensor_name);
  if (tensor == nullptr) {
    return GetMSTensorByNodeShape(dims);
  }
  return tensor;
}
int BenchmarkUnifiedApi::CompareDataGetTotalBiasAndSize(const std::string &name, mindspore::MSTensor *tensor,
                                                        float *total_bias, int *total_size) {
  // Compare one output tensor against its calibration record (via the typed
  // CompareData template) and accumulate its bias into *total_bias, bumping
  // *total_size by one so the caller can compute the mean.
  float bias = 0;
  auto mutableData = tensor->MutableData();
  if (mutableData == nullptr) {
    MS_LOG(ERROR) << "mutableData is nullptr.";
    return RET_ERROR;
  }
  // Dispatch on the runtime dtype; each branch reinterprets the raw buffer
  // with the matching element type.
  switch (static_cast<int>(tensor->DataType())) {
    case TypeId::kNumberTypeFloat:
    case TypeId::kNumberTypeFloat32: {
      bias = CompareData<float>(name, tensor->Shape(), mutableData);
      break;
    }
    case TypeId::kNumberTypeInt8: {
      bias = CompareData<int8_t>(name, tensor->Shape(), mutableData);
      break;
    }
    case TypeId::kNumberTypeUInt8: {
      bias = CompareData<uint8_t>(name, tensor->Shape(), mutableData);
      break;
    }
    case TypeId::kNumberTypeInt32: {
      bias = CompareData<int32_t>(name, tensor->Shape(), mutableData);
      break;
    }
    case TypeId::kNumberTypeInt16: {
      bias = CompareData<int16_t>(name, tensor->Shape(), mutableData);
      break;
    }
    case TypeId::kNumberTypeBool: {
      bias = CompareData<bool>(name, tensor->Shape(), mutableData);
      break;
    }
    default:
      MS_LOG(ERROR) << "Datatype " << static_cast<int>(tensor->DataType()) << " is not supported.";
      return RET_ERROR;
  }
  // CompareData reports a negative bias to signal a comparison failure.
  if (bias < 0) {
    MS_LOG(ERROR) << "CompareData failed, name: " << name;
    return RET_ERROR;
  }
  *total_bias += bias;
  *total_size += 1;
  return RET_OK;
}
int BenchmarkUnifiedApi::MarkPerformance() {
  // Warm up, then time `loop_count_` inference runs and report min/max/avg
  // latency plus optional per-op time or perf-counter profiles.
  MS_LOG(INFO) << "Running warm up loops...";
  std::cout << "Running warm up loops..." << std::endl;
  std::vector<MSTensor> outputs;
  for (int i = 0; i < flags_->warm_up_loop_count_; i++) {
    auto status = ms_model_.Predict(ms_inputs_for_api_, &outputs);
    if (status != kSuccess) {
      MS_LOG(ERROR) << "Inference error ";
      std::cerr << "Inference error " << std::endl;
      return RET_ERROR;
    }
  }

  MS_LOG(INFO) << "Running benchmark loops...";
  std::cout << "Running benchmark loops..." << std::endl;
  // fix: time_min was seeded with 1000000 us, silently capping the reported
  // MinRunTime at 1 second for models slower than that.
  uint64_t time_min = UINT64_MAX;
  uint64_t time_max = 0;
  uint64_t time_avg = 0;
  for (int i = 0; i < flags_->loop_count_; i++) {
    auto inputs = ms_model_.GetInputs();
    for (auto tensor : inputs) {
      tensor.MutableData();  // prepare data
    }
    auto start = GetTimeUs();
    auto status = ms_model_.Predict(ms_inputs_for_api_, &outputs, ms_before_call_back_, ms_after_call_back_);
    if (status != kSuccess) {
      MS_LOG(ERROR) << "Inference error ";
      std::cerr << "Inference error ";
      return RET_ERROR;
    }
    auto end = GetTimeUs();
    auto time = end - start;
    time_min = std::min(time_min, time);
    time_max = std::max(time_max, time);
    time_avg += time;
  }

  if (flags_->time_profiling_) {
    const std::vector<std::string> per_op_name = {"opName", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
    const std::vector<std::string> per_op_type = {"opType", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
    PrintResult(per_op_name, op_times_by_name_);
    PrintResult(per_op_type, op_times_by_type_);
#ifdef ENABLE_ARM64
  } else if (flags_->perf_profiling_) {
    if (flags_->perf_event_ == "CACHE") {
      const std::vector<std::string> per_op_name = {"opName", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
      const std::vector<std::string> per_op_type = {"opType", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
      PrintPerfResult(per_op_name, op_perf_by_name_);
      PrintPerfResult(per_op_type, op_perf_by_type_);
    } else if (flags_->perf_event_ == "STALL") {
      const std::vector<std::string> per_op_name = {"opName", "frontend(k)", "frontend(%)", "backendend(k)",
                                                    "backendend(%)"};
      const std::vector<std::string> per_op_type = {"opType", "frontend(k)", "frontend(%)", "backendend(k)",
                                                    "backendend(%)"};
      PrintPerfResult(per_op_name, op_perf_by_name_);
      PrintPerfResult(per_op_type, op_perf_by_type_);
    } else {
      const std::vector<std::string> per_op_name = {"opName", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
      const std::vector<std::string> per_op_type = {"opType", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
      PrintPerfResult(per_op_name, op_perf_by_name_);
      PrintPerfResult(per_op_type, op_perf_by_type_);
    }
#endif
  }

  // Print the latency summary only when at least one timed loop actually ran.
  if (flags_->loop_count_ > 0) {
    time_avg /= flags_->loop_count_;
    MS_LOG(INFO) << "Model = " << flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
                 << ", NumThreads = " << flags_->num_threads_ << ", MinRunTime = " << time_min / 1000.0f
                 << ", MaxRuntime = " << time_max / 1000.0f << ", AvgRunTime = " << time_avg / 1000.0f;
    printf("Model = %s, NumThreads = %d, MinRunTime = %f ms, MaxRuntime = %f ms, AvgRunTime = %f ms\n",
           flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1).c_str(), flags_->num_threads_,
           time_min / 1000.0f, time_max / 1000.0f, time_avg / 1000.0f);
  }
  return RET_OK;
}
int BenchmarkUnifiedApi::MarkAccuracy() {
  // Accuracy path: echo the inputs, run one inference with the configured
  // callbacks, then compare outputs against the calibration data.
  MS_LOG(INFO) << "MarkAccuracy";
  std::cout << "MarkAccuracy" << std::endl;
  int status = PrintInputData();
  if (status != RET_OK) {
    MS_LOG(ERROR) << "PrintInputData error " << status;
    std::cerr << "PrintInputData error " << status << std::endl;
    return status;
  }
  std::vector<MSTensor> outputs;
  if (ms_model_.Predict(ms_inputs_for_api_, &outputs, ms_before_call_back_, ms_after_call_back_) != kSuccess) {
    MS_LOG(ERROR) << "Inference error ";
    std::cerr << "Inference error " << std::endl;
    return RET_ERROR;
  }
  status = ReadCalibData();
  if (status != RET_OK) {
    MS_LOG(ERROR) << "Read calib data error " << status;
    std::cerr << "Read calib data error " << status << std::endl;
    return status;
  }
  status = CompareOutput();
  if (status != RET_OK) {
    MS_LOG(ERROR) << "Compare output error " << status;
    std::cerr << "Compare output error " << status << std::endl;
    return status;
  }
  return RET_OK;
}
int BenchmarkUnifiedApi::PrintInputData() {
  // Echo the first (up to 20) elements of every input tensor to stdout so a
  // failing accuracy run can be reproduced by eye.
  for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
    auto input = ms_inputs_for_api_[i];
    MS_ASSERT(input != nullptr);
    auto tensor_data_type = static_cast<int>(input.DataType());
    std::cout << "InData" << i << ": ";
    if (tensor_data_type == TypeId::kObjectTypeString) {
      std::cerr << "Unsupported kObjectTypeString:" << std::endl;
      MS_LOG(ERROR) << "Unsupported kObjectTypeString:";
      return RET_ERROR;
    }
    size_t print_num = std::min(static_cast<int>(input.ElementNum()), 20);
    const void *in_data = input.MutableData();
    if (in_data == nullptr) {
      MS_LOG(ERROR) << "in_data is nullptr.";
      return RET_ERROR;
    }
    for (size_t j = 0; j < print_num; j++) {
      switch (tensor_data_type) {
        case TypeId::kNumberTypeFloat:
        case TypeId::kNumberTypeFloat32:
          std::cout << static_cast<const float *>(in_data)[j] << " ";
          break;
        case TypeId::kNumberTypeInt8:
          std::cout << static_cast<const int8_t *>(in_data)[j] << " ";
          break;
        case TypeId::kNumberTypeUInt8:
          std::cout << static_cast<const uint8_t *>(in_data)[j] << " ";
          break;
        case TypeId::kNumberTypeInt32:
          std::cout << static_cast<const int32_t *>(in_data)[j] << " ";
          break;
        case TypeId::kNumberTypeInt64:
          std::cout << static_cast<const int64_t *>(in_data)[j] << " ";
          break;
        case TypeId::kNumberTypeBool:
          std::cout << static_cast<const bool *>(in_data)[j] << " ";
          break;
        default:
          MS_LOG(ERROR) << "Datatype: " << tensor_data_type << " is not supported.";
          return RET_ERROR;
      }
    }
    std::cout << std::endl;
  }
  return RET_OK;
}
int BenchmarkUnifiedApi::RunBenchmark() {
auto start_prepare_time = GetTimeUs();
// Load graph
std::string model_name = flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1);
MS_LOG(INFO) << "start reading model file";
std::cout << "start reading model file" << std::endl;
size_t size = 0;
char *graph_buf = ReadFile(flags_->model_file_.c_str(), &size);
if (graph_buf == nullptr) {
MS_LOG(ERROR) << "Read model file failed while running " << model_name.c_str();
std::cerr << "Read model file failed while running " << model_name.c_str() << std::endl;
return RET_ERROR;
}
auto context = std::make_shared<mindspore::Context>();
if (context == nullptr) {
MS_LOG(ERROR) << "New context failed while running " << model_name.c_str();
std::cerr << "New context failed while running " << model_name.c_str() << std::endl;
return RET_ERROR;
}
(void)InitMSContext(context);
auto ret = ms_model_.Build(graph_buf, size, kMindIR, context);
if (ret != kSuccess) {
MS_LOG(ERROR) << "ms_model_.Build failed while running ", model_name.c_str();
std::cout << "ms_model_.Build failed while running ", model_name.c_str();
return RET_ERROR;
}
if (!flags_->resize_dims_.empty()) {
std::vector<std::vector<int64_t>> resize_dims;
(void)std::transform(flags_->resize_dims_.begin(), flags_->resize_dims_.end(), std::back_inserter(resize_dims),
[&](auto &shapes) { return this->ConverterToInt64Vector<int>(shapes); });
ret = ms_model_.Resize(ms_model_.GetInputs(), resize_dims);
if (ret != kSuccess) {
MS_LOG(ERROR) << "Input tensor resize failed.";
std::cout << "Input tensor resize failed.";
return RET_ERROR;
}
}
ms_inputs_for_api_ = ms_model_.GetInputs();
auto end_prepare_time = GetTimeUs();
MS_LOG(INFO) << "PrepareTime = " << (end_prepare_time - start_prepare_time) / 1000 << " ms";
std::cout << "PrepareTime = " << (end_prepare_time - start_prepare_time) / 1000 << " ms" << std::endl;
// Load input
MS_LOG(INFO) << "start generate input data";
auto status = LoadInput();
if (status != 0) {
MS_LOG(ERROR) << "Generate input data error";
return status;
}
if (!flags_->benchmark_data_file_.empty()) {
status = MarkAccuracy();
for (auto &data : benchmark_data_) {
data.second->shape.clear();
data.second->data.clear();
delete data.second;
data.second = nullptr;
}
benchmark_data_.clear();
if (status != 0) {
MS_LOG(ERROR) << "Run MarkAccuracy error: " << status;
std::cout << "Run MarkAccuracy error: " << status << std::endl;
return status;
}
} else {
status = MarkPerformance();
if (status != 0) {
MS_LOG(ERROR) << "Run MarkPerformance error: " << status;
std::cout << "Run MarkPerformance error: " << status << std::endl;
return status;
}
}
if (flags_->dump_tensor_data_) {
std::cout << "Dumped file is saved to : " + dump_file_output_dir_ << std::endl;
}
return RET_OK;
}
int BenchmarkUnifiedApi::InitTimeProfilingCallbackParameter() {
  // Install per-op timing callbacks: the before-hook stamps a start time, the
  // after-hook accumulates call counts and cost (ms) per op type and per op name.
  // before callback
  ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
                             const std::vector<mindspore::MSTensor> &before_outputs,
                             const MSCallBackParam &call_param) {
    if (before_inputs.empty()) {
      MS_LOG(INFO) << "The num of beforeInputs is empty";
    }
    if (before_outputs.empty()) {
      MS_LOG(INFO) << "The num of beforeOutputs is empty";
    }
    // First sighting of a type/name gets a zeroed (count, total-ms) slot.
    if (op_times_by_type_.find(call_param.node_type_) == op_times_by_type_.end()) {
      op_times_by_type_.insert(std::make_pair(call_param.node_type_, std::make_pair(0, 0.0f)));
    }
    if (op_times_by_name_.find(call_param.node_name_) == op_times_by_name_.end()) {
      op_times_by_name_.insert(std::make_pair(call_param.node_name_, std::make_pair(0, 0.0f)));
    }
    op_call_times_total_++;
    op_begin_ = GetTimeUs();
    return true;
  };
  // after callback
  ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
                            const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
    uint64_t opEnd = GetTimeUs();
    if (after_inputs.empty()) {
      MS_LOG(INFO) << "The num of after inputs is empty";
    }
    if (after_outputs.empty()) {
      MS_LOG(INFO) << "The num of after outputs is empty";
    }
    // Wall-clock cost in milliseconds (GetTimeUs returns microseconds).
    float cost = static_cast<float>(opEnd - op_begin_) / 1000.0f;
    if (flags_->device_ == "GPU") {
      // NOTE(review): assumes the runtime hands a GPUCallBackParam here when
      // the device is GPU — the cast is unchecked; confirm against the GPU executor.
      auto gpu_param = reinterpret_cast<const GPUCallBackParam &>(call_param);
      cost = static_cast<float>(gpu_param.execute_time);
    }
    op_cost_total_ += cost;
    op_times_by_type_[call_param.node_type_].first++;
    op_times_by_type_[call_param.node_type_].second += cost;
    op_times_by_name_[call_param.node_name_].first++;
    op_times_by_name_[call_param.node_name_].second += cost;
    return true;
  };
  return RET_OK;
}
int BenchmarkUnifiedApi::InitPerfProfilingCallbackParameter() {
#ifndef ENABLE_ARM64
  MS_LOG(ERROR) << "Only support perf_profiling on arm64.";
  return RET_ERROR;
#else
  // Open a perf-event group of two hardware counters (selected by --perfEvent)
  // and install callbacks that sample the group around every op.
  struct perf_event_attr pe, pe2;
  memset(&pe, 0, sizeof(struct perf_event_attr));
  memset(&pe2, 0, sizeof(struct perf_event_attr));
  pe.type = PERF_TYPE_HARDWARE;
  pe2.type = PERF_TYPE_HARDWARE;
  pe.size = sizeof(struct perf_event_attr);
  pe2.size = sizeof(struct perf_event_attr);
  pe.disabled = 1;
  pe2.disabled = 1;
  pe.exclude_kernel = 1;   // don't count kernel
  pe2.exclude_kernel = 1;  // don't count kernel
  pe.exclude_hv = 1;       // don't count hypervisor
  pe2.exclude_hv = 1;      // don't count hypervisor
  pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
  pe2.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
  if (flags_->perf_event_ == "CACHE") {
    pe.config = PERF_COUNT_HW_CACHE_REFERENCES;
    pe2.config = PERF_COUNT_HW_CACHE_MISSES;
  } else if (flags_->perf_event_ == "STALL") {
    pe.config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND;
    pe2.config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND;
  } else {
    pe.config = PERF_COUNT_HW_CPU_CYCLES;
    pe2.config = PERF_COUNT_HW_INSTRUCTIONS;
  }
  // fix: perf_event_open takes a POINTER to the attr struct; passing the
  // struct by value through the syscall varargs handed the kernel garbage.
  perf_fd = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
  if (perf_fd == -1) {
    MS_LOG(ERROR) << "Failed to open perf event " << pe.config;
    return RET_ERROR;
  }
  perf_fd2 = syscall(__NR_perf_event_open, &pe2, 0, -1, perf_fd, 0);
  if (perf_fd2 == -1) {
    MS_LOG(ERROR) << "Failed to open perf event " << pe2.config;
    return RET_ERROR;
  }
  struct PerfCount zero;
  zero.value[0] = 0;
  zero.value[1] = 0;
  // before callback
  // fix: `zero` is now captured by VALUE — the stored callback outlives this
  // stack frame, so the previous by-reference capture dangled.
  ms_before_call_back_ = [&, zero](const std::vector<mindspore::MSTensor> &before_inputs,
                                   const std::vector<mindspore::MSTensor> &before_outputs,
                                   const MSCallBackParam &call_param) {
    if (before_inputs.empty()) {
      MS_LOG(INFO) << "The num of beforeInputs is empty";
    }
    if (before_outputs.empty()) {
      MS_LOG(INFO) << "The num of beforeOutputs is empty";
    }
    if (op_perf_by_type_.find(call_param.node_type_) == op_perf_by_type_.end()) {
      op_perf_by_type_.insert(std::make_pair(call_param.node_type_, std::make_pair(0, zero)));
    }
    if (op_perf_by_name_.find(call_param.node_name_) == op_perf_by_name_.end()) {
      op_perf_by_name_.insert(std::make_pair(call_param.node_name_, std::make_pair(0, zero)));
    }
    op_call_times_total_++;
    // Reset and start the counter group just before the op executes.
    ioctl(perf_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
    ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
    return true;
  };
  // after callback
  ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
                            const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
    struct PerfResult res;
    ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
    // fix: the read() result was ignored; on a short or failed read `res`
    // stayed uninitialized and the accumulated numbers were garbage.
    if (read(perf_fd, &res, sizeof(struct PerfResult)) != static_cast<ssize_t>(sizeof(struct PerfResult))) {
      MS_LOG(ERROR) << "Read perf result failed.";
      return false;
    }
    if (after_inputs.empty()) {
      MS_LOG(INFO) << "The num of after inputs is empty";
    }
    if (after_outputs.empty()) {
      MS_LOG(INFO) << "The num of after outputs is empty";
    }
    float cost1 = static_cast<float>(res.values[0].value);
    float cost2 = static_cast<float>(res.values[1].value);
    op_cost_total_ += cost1;
    op_cost2_total_ += cost2;
    op_perf_by_type_[call_param.node_type_].first++;
    op_perf_by_type_[call_param.node_type_].second.value[0] += cost1;
    op_perf_by_type_[call_param.node_type_].second.value[1] += cost2;
    op_perf_by_name_[call_param.node_name_].first++;
    op_perf_by_name_[call_param.node_name_].second.value[0] += cost1;
    op_perf_by_name_[call_param.node_name_].second.value[1] += cost2;
    return true;
  };
#endif
  return RET_OK;
}
namespace {
template <typename T>
std::string DataToString(void *data, size_t data_number) {
  // Render up to the first 40 elements of `data` as a space-prefixed list.
  if (data == nullptr) {
    return "Data of tensor is nullptr";
  }
  std::ostringstream out;
  const T *values = static_cast<T *>(data);
  const size_t limit = std::min<size_t>(data_number, 40);
  for (size_t idx = 0; idx < limit; ++idx) {
    out << " " << values[idx];
  }
  return out.str();
}
std::string DumpMSTensor(mindspore::MSTensor *tensor) {
  // Render a tensor's dtype, shape and leading values (via DataToString, which
  // caps output at 40 elements) for the print-tensor debug callback.
  if (tensor == nullptr) {
    return "Tensor is nullptr";
  }
  std::ostringstream oss;
  oss << " DataType: " << static_cast<int>(tensor->DataType());
  oss << " Shape:";
  for (auto &dim : tensor->Shape()) {
    oss << " " << dim;
  }
  oss << std::endl << " Data:";
  switch (static_cast<int>(tensor->DataType())) {
    case kNumberTypeFloat32: {
      oss << DataToString<float>(tensor->MutableData(), tensor->ElementNum());
    } break;
    case kNumberTypeFloat16: {
      // fp16 payload is printed as raw int16 bit patterns, not decoded floats.
      oss << DataToString<int16_t>(tensor->MutableData(), tensor->ElementNum());
    } break;
    case kNumberTypeInt32: {
      oss << DataToString<int32_t>(tensor->MutableData(), tensor->ElementNum());
    } break;
    case kNumberTypeInt16: {
      oss << DataToString<int16_t>(tensor->MutableData(), tensor->ElementNum());
    } break;
    case kNumberTypeInt8: {
      oss << DataToString<int8_t>(tensor->MutableData(), tensor->ElementNum());
    } break;
    default:
      oss << "Unsupported data type to print";
      break;
  }
  return oss.str();
}
std::string GenerateOutputFileName(mindspore::MSTensor *tensor, const std::string &op_name,
                                   const std::string &file_type, const size_t &idx) {
  // Build "<op>_<file_type>_<idx>_shape_<d0>_<d1>_..._<dtype>.bin", flattening
  // any '/' in the op name to '.' so the result is a single valid file name.
  std::string file_name = op_name;
  std::replace(file_name.begin(), file_name.end(), '/', '.');  // idiom: was a manual find/replace loop
  file_name += "_" + file_type + "_" + std::to_string(idx) + "_shape_";
  for (const auto &dim : tensor->Shape()) {
    file_name += std::to_string(dim) + "_";
  }
  // Single map lookup (was find() followed by at()).
  auto type_it = TYPE_ID_MAP.find(static_cast<int>(tensor->DataType()));
  if (type_it != TYPE_ID_MAP.end()) {
    file_name += type_it->second;
  }
  file_name += ".bin";  // fix: stray unary '+' before the literal
  return file_name;
}
} // namespace
int BenchmarkUnifiedApi::InitPrintTensorDataCallbackParameter() {
  // Install callbacks that pretty-print every op's input and output tensors
  // (dtype, shape, leading values) to stdout after the op runs.
  // before callback
  ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
                             const std::vector<mindspore::MSTensor> &before_outputs,
                             const MSCallBackParam &call_param) { return true; };  // no-op before the op
  // after callback
  ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
                            const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
    std::cout << "================================================================" << std::endl;
    std::cout << call_param.node_name_ << " inputs : " << std::endl;
    for (auto ms_tensor : after_inputs) {
      std::cout << DumpMSTensor(&ms_tensor) << std::endl;
    }
    std::cout << "----------------------------------------------------------------" << std::endl;
    std::cout << call_param.node_name_ << " outputs : " << std::endl;
    for (auto ms_tensor : after_outputs) {
      std::cout << DumpMSTensor(&ms_tensor) << std::endl;
    }
    std::cout << "================================================================" << std::endl;
    return true;
  };
  return RET_OK;
}
int BenchmarkUnifiedApi::InitDumpTensorDataCallbackParameter() {
  // Install callbacks that write op tensors to .bin files under
  // dump_file_output_dir_, driven by the JSON dump config:
  //   kMode == 0        -> dump every kernel, otherwise only those in kKernels;
  //   kInputOutput == 0 -> dump inputs and outputs, 1 -> inputs only, 2 -> outputs only.
  // before callback
  ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
                             const std::vector<mindspore::MSTensor> &before_outputs,
                             const MSCallBackParam &call_param) {
    auto dump_mode = dump_cfg_json_[dump::kSettings][dump::kMode].get<int>();
    auto input_output_mode = dump_cfg_json_[dump::kSettings][dump::kInputOutput].get<int>();
    auto kernels = dump_cfg_json_[dump::kSettings][dump::kKernels].get<std::vector<std::string>>();
    if (dump_mode == 0 || std::find(kernels.begin(), kernels.end(), call_param.node_name_) != kernels.end()) {
      if (input_output_mode == 0 || input_output_mode == 1) {
        for (size_t i = 0; i < before_inputs.size(); i++) {
          auto ms_tensor = before_inputs.at(i);
          auto file_name = GenerateOutputFileName(&ms_tensor, call_param.node_name_, "input", i);
          auto abs_file_path = dump_file_output_dir_ + "/" + file_name;
          if (WriteToBin(abs_file_path, ms_tensor.MutableData(), ms_tensor.DataSize()) != RET_OK) {  // save to file
            MS_LOG(ERROR) << "write tensor data to file failed.";
            return false;
          }
        }
      }
    }
    return true;
  };
  // after callback
  ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
                            const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
    auto dump_mode = dump_cfg_json_[dump::kSettings][dump::kMode].get<int>();
    auto input_output_mode = dump_cfg_json_[dump::kSettings][dump::kInputOutput].get<int>();
    auto kernels = dump_cfg_json_[dump::kSettings][dump::kKernels].get<std::vector<std::string>>();
    if (dump_mode == 0 || std::find(kernels.begin(), kernels.end(), call_param.node_name_) != kernels.end()) {
      if (input_output_mode == 0 || input_output_mode == 2) {
        for (size_t i = 0; i < after_outputs.size(); i++) {
          auto ms_tensor = after_outputs.at(i);
          auto file_name = GenerateOutputFileName(&ms_tensor, call_param.node_name_, "output", i);
          auto abs_file_path = dump_file_output_dir_ + "/" + file_name;
          if (WriteToBin(abs_file_path, ms_tensor.MutableData(), ms_tensor.DataSize()) != RET_OK) {  // save to file
            MS_LOG(ERROR) << "write tensor data to file failed.";
            return false;
          }
        }
      }
    }
    return true;
  };
  return RET_OK;
}
BenchmarkUnifiedApi::~BenchmarkUnifiedApi() = default;
} // namespace lite
} // namespace mindspore

View File

@ -0,0 +1,103 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_BENCHMARK_BENCHMARK_UNIFIED_API_H_
#define MINDSPORE_BENCHMARK_BENCHMARK_UNIFIED_API_H_
#include <getopt.h>
#include <signal.h>
#include <random>
#include <unordered_map>
#include <fstream>
#include <iostream>
#include <map>
#include <cmath>
#include <string>
#include <vector>
#include <memory>
#include <cfloat>
#include <utility>
#include <nlohmann/json.hpp>
#include "tools/benchmark/benchmark_base.h"
#include "include/model.h"
#include "tools/common/flag_parser.h"
#include "src/common/file_utils.h"
#include "src/common/utils.h"
#include "include/api/types.h"
#include "include/api/model.h"
namespace mindspore::lite {
// Benchmark driver built on the unified C++ API (mindspore::Model /
// mindspore::MSTensor), selected at runtime via ENABLE_NEW_API=true; mirrors
// the legacy lite-API Benchmark class.
class MS_API BenchmarkUnifiedApi : public BenchmarkBase {
 public:
  explicit BenchmarkUnifiedApi(BenchmarkFlags *flags) : BenchmarkBase(flags) {}
  virtual ~BenchmarkUnifiedApi();
  // Entry point: build the model, resize/load inputs, then run the accuracy
  // path (when a calib file is given) or the performance path.
  int RunBenchmark() override;
 protected:
  // Compare one output tensor against its calib record and accumulate its
  // bias into *total_bias, bumping *total_size by one.
  int CompareDataGetTotalBiasAndSize(const std::string &name, mindspore::MSTensor *tensor, float *total_bias,
                                     int *total_size);
  // NOTE(review): no definition appears in benchmark_unified_api.cc
  // (InitMSContext below is the one actually used) — confirm whether this is dead.
  void InitContext(const std::shared_ptr<mindspore::Context> &context);
  // Resolve an output tensor by matching its shape against model outputs.
  mindspore::MSTensor GetMSTensorByNodeShape(const std::vector<size_t> &node_shape);
  // Resolve an output by node name, then tensor name, then shape as fallback.
  mindspore::MSTensor GetMSTensorByNameOrShape(const std::string &node_or_tensor_name, const std::vector<size_t> &dims);
  // call GenerateRandomData to fill inputTensors
  int GenerateInputData() override;
  int ReadInputFile() override;
  int ReadTensorData(std::ifstream &in_file_stream, const std::string &tensor_name,
                     const std::vector<size_t> &dims) override;
  // Populate thread/affinity settings and the CPU/GPU/NPU device list from flags.
  void InitMSContext(const std::shared_ptr<Context> &context);
  int CompareOutput() override;
  int InitTimeProfilingCallbackParameter() override;
  int InitPerfProfilingCallbackParameter() override;
  int InitDumpTensorDataCallbackParameter() override;
  int InitPrintTensorDataCallbackParameter() override;
  int PrintInputData();
  // Widen an integral dims vector to int64_t for the unified API.
  template <typename T>
  std::vector<int64_t> ConverterToInt64Vector(const std::vector<T> &srcDims) {
    std::vector<int64_t> dims;
    for (auto shape : srcDims) {
      dims.push_back(static_cast<int64_t>(shape));
    }
    return dims;
  }
  int MarkPerformance();
  int MarkAccuracy();
 private:
  mindspore::Model ms_model_;
  std::vector<mindspore::MSTensor> ms_inputs_for_api_;  // cached ms_model_.GetInputs()
  MSKernelCallBack ms_before_call_back_ = nullptr;
  MSKernelCallBack ms_after_call_back_ = nullptr;
};
} // namespace mindspore::lite
#endif // MINNIE_BENCHMARK_BENCHMARK_H_

View File

@ -14,7 +14,7 @@
* limitations under the License.
*/
#include "tools/benchmark/benchmark.h"
#include "tools/benchmark/run_benchmark.h"
#include "include/version.h"
int main(int argc, const char **argv) {

View File

@ -0,0 +1,82 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tools/benchmark/run_benchmark.h"
#include <memory>
#include <string>
namespace mindspore {
namespace lite {
int RunBenchmark(int argc, const char **argv) {
BenchmarkFlags flags;
Option<std::string> err = flags.ParseFlags(argc, argv);
#ifdef SUPPORT_NNIE
SvpSysInit();
#endif
if (err.IsSome()) {
std::cerr << err.Get() << std::endl;
std::cerr << flags.Usage() << std::endl;
return RET_ERROR;
}
if (flags.help) {
std::cerr << flags.Usage() << std::endl;
return RET_OK;
}
BenchmarkBase *benchmark = nullptr;
// get dump data output path
auto new_api = std::getenv("ENABLE_NEW_API");
if (new_api == nullptr || std::string(new_api) != "true") {
benchmark = new Benchmark(&flags);
} else {
benchmark = new BenchmarkUnifiedApi(&flags);
}
if (benchmark == nullptr) {
MS_LOG(ERROR) << "new benchmark failed ";
std::cerr << "new benchmark failed" << std::endl;
return RET_ERROR;
}
auto status = benchmark->Init();
if (status != 0) {
MS_LOG(ERROR) << "Benchmark init Error : " << status;
std::cerr << "Benchmark init Error : " << status << std::endl;
delete benchmark;
benchmark = nullptr;
return RET_ERROR;
}
status = benchmark->RunBenchmark();
if (status != 0) {
MS_LOG(ERROR) << "Run Benchmark "
<< flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
<< " Failed : " << status;
std::cerr << "Run Benchmark " << flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
<< " Failed : " << status << std::endl;
delete benchmark;
benchmark = nullptr;
return RET_ERROR;
}
MS_LOG(INFO) << "Run Benchmark " << flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
<< " Success.";
std::cout << "Run Benchmark " << flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
<< " Success." << std::endl;
delete benchmark;
benchmark = nullptr;
return RET_OK;
}
} // namespace lite
} // namespace mindspore

View File

@ -0,0 +1,27 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINNIE_BENCHMARK_RUN_BENCHMARK_H_
#define MINNIE_BENCHMARK_RUN_BENCHMARK_H_
#include "tools/benchmark/benchmark.h"
#include "tools/benchmark/benchmark_unified_api.h"
namespace mindspore::lite {
// Entry point of the benchmark tool: parses argv flags, selects and runs a
// benchmark implementation (legacy or unified API), and returns RET_OK on
// success or RET_ERROR on failure.
int MS_API RunBenchmark(int argc, const char **argv);
} // namespace mindspore::lite
#endif // MINNIE_BENCHMARK_RUN_BENCHMARK_H_