fuse benchmark and timeprofiler

This commit is contained in:
cjh9368 2020-09-19 14:46:01 +08:00
parent 5ebdaaca26
commit 0ac7c7884d
5 changed files with 169 additions and 44 deletions

View File

@ -78,7 +78,7 @@ Model *Model::Import(const char *model_buf, size_t size) {
MS_LOG(ERROR) << "The buffer is invalid and fail to create graph.";
return nullptr;
}
Model *model = new (std::nothrow) Model();
auto *model = new (std::nothrow) Model();
if (model == nullptr) {
MS_LOG(ERROR) << "new model fail!";
return nullptr;
@ -86,14 +86,14 @@ Model *Model::Import(const char *model_buf, size_t size) {
model->buf = reinterpret_cast<char *>(malloc(size));
if (model->buf == nullptr) {
MS_LOG(ERROR) << "new inner model buf fail!";
delete(model);
delete (model);
return nullptr;
}
memcpy(model->buf, model_buf, size);
auto meta_graph = schema::GetMetaGraph(model->buf);
if (meta_graph == nullptr) {
MS_LOG(ERROR) << "meta_graph is nullptr!";
delete(model);
delete (model);
return nullptr;
}

View File

@ -265,7 +265,8 @@ int Benchmark::MarkPerformance() {
for (int i = 0; i < _flags->loopCount; i++) {
session->BindThread(true);
auto start = GetTimeUs();
auto status = session->RunGraph();
auto status =
_flags->runTimeProfiler ? session->RunGraph(before_call_back_, after_call_back_) : session->RunGraph();
if (status != 0) {
MS_LOG(ERROR) << "Inference error " << status;
std::cerr << "Inference error " << status;
@ -280,6 +281,14 @@ int Benchmark::MarkPerformance() {
session->BindThread(false);
}
if (_flags->runTimeProfiler) {
const std::vector<std::string> per_op_name = {"opName", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
const std::vector<std::string> per_op_type = {"opType", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
PrintResult(per_op_name, op_times_by_name_);
PrintResult(per_op_type, op_times_by_type_);
}
if (_flags->loopCount > 0) {
timeAvg /= _flags->loopCount;
MS_LOG(INFO) << "Model = " << _flags->modelPath.substr(_flags->modelPath.find_last_of(DELIM_SLASH) + 1).c_str()
@ -295,25 +304,25 @@ int Benchmark::MarkPerformance() {
int Benchmark::MarkAccuracy() {
MS_LOG(INFO) << "MarkAccuracy";
std::cout << "MarkAccuracy" << std::endl;
for (size_t i = 0; i < msInputs.size(); i++) {
switch (msInputs.at(i)->data_type()) {
for (auto &msInput : msInputs) {
switch (msInput->data_type()) {
case TypeId::kNumberTypeFloat:
PrintInputData<float>(msInputs.at(i));
PrintInputData<float>(msInput);
break;
case TypeId::kNumberTypeFloat32:
PrintInputData<float>(msInputs.at(i));
PrintInputData<float>(msInput);
break;
case TypeId::kNumberTypeInt8:
PrintInputData<int8_t>(msInputs.at(i));
PrintInputData<int8_t>(msInput);
break;
case TypeId::kNumberTypeUInt8:
PrintInputData<uint8_t>(msInputs.at(i));
PrintInputData<uint8_t>(msInput);
break;
case TypeId::kNumberTypeInt32:
PrintInputData<int>(msInputs.at(i));
PrintInputData<int>(msInput);
break;
default:
MS_LOG(ERROR) << "Datatype " << msInputs.at(i)->data_type() << " is not supported.";
MS_LOG(ERROR) << "Datatype " << msInput->data_type() << " is not supported.";
return RET_ERROR;
}
}
@ -340,7 +349,7 @@ int Benchmark::MarkAccuracy() {
return RET_OK;
}
int Benchmark::RunBenchmark(const std::string &deviceType) {
int Benchmark::RunBenchmark() {
auto startPrepareTime = GetTimeUs();
// Load graph
std::string modelName = _flags->modelPath.substr(_flags->modelPath.find_last_of(DELIM_SLASH) + 1);
@ -355,13 +364,12 @@ int Benchmark::RunBenchmark(const std::string &deviceType) {
return RET_ERROR;
}
auto model = lite::Model::Import(graphBuf, size);
delete[](graphBuf);
if (model == nullptr) {
MS_LOG(ERROR) << "Import model file failed while running " << modelName.c_str();
std::cerr << "Import model file failed while running " << modelName.c_str() << std::endl;
delete[](graphBuf);
return RET_ERROR;
}
delete[](graphBuf);
auto context = new (std::nothrow) lite::Context;
if (context == nullptr) {
MS_LOG(ERROR) << "New context failed while running " << modelName.c_str();
@ -372,8 +380,6 @@ int Benchmark::RunBenchmark(const std::string &deviceType) {
context->device_type_ = lite::DT_CPU;
} else if (_flags->device == "GPU") {
context->device_type_ = lite::DT_GPU;
} else {
context->device_type_ = lite::DT_NPU;
}
if (_flags->cpuBindMode == -1) {
@ -403,13 +409,8 @@ int Benchmark::RunBenchmark(const std::string &deviceType) {
model->Free();
msInputs = session->GetInputs();
auto endPrepareTime = GetTimeUs();
#if defined(__arm__)
MS_LOG(INFO) << "PrepareTime = " << (endPrepareTime - startPrepareTime) / 1000 << " ms";
printf("PrepareTime = %lld ms, ", (endPrepareTime - startPrepareTime) / 1000);
#else
MS_LOG(INFO) << "PrepareTime = " << (endPrepareTime - startPrepareTime) / 1000 << " ms ";
printf("PrepareTime = %ld ms, ", (endPrepareTime - startPrepareTime) / 1000);
#endif
std::cout << "PrepareTime = " << (endPrepareTime - startPrepareTime) / 1000 << " ms" << std::endl;
// Load input
MS_LOG(INFO) << "start generate input data";
@ -481,6 +482,54 @@ void BenchmarkFlags::InitResizeDimsList() {
}
}
int Benchmark::InitCallbackParameter() {
  // Installs the two per-kernel session callbacks used by the time profiler.
  // The "before" callback stamps the start time and makes sure both
  // accounting tables have an entry for the op; the "after" callback folds
  // the elapsed time into the per-name and per-type totals.
  before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
                          const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
                          const session::CallBackParam &param) {
    if (before_inputs.empty()) {
      MS_LOG(INFO) << "The num of beforeInputs is empty";
    }
    if (before_outputs.empty()) {
      MS_LOG(INFO) << "The num of beforeOutputs is empty";
    }
    // Seed zeroed {call count, total ms} slots the first time an op is seen.
    if (op_times_by_type_.count(param.type_callback_param) == 0) {
      op_times_by_type_.emplace(param.type_callback_param, std::make_pair(0, 0.0f));
    }
    if (op_times_by_name_.count(param.name_callback_param) == 0) {
      op_times_by_name_.emplace(param.name_callback_param, std::make_pair(0, 0.0f));
    }
    op_call_times_total_++;
    op_begin_ = GetTimeUs();
    return true;
  };
  after_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
                         const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
                         const session::CallBackParam &param) {
    const uint64_t end_time = GetTimeUs();
    if (after_inputs.empty()) {
      MS_LOG(INFO) << "The num of after inputs is empty";
    }
    if (after_outputs.empty()) {
      MS_LOG(INFO) << "The num of after outputs is empty";
    }
    // GetTimeUs() is in microseconds; convert the delta to milliseconds.
    const float op_cost = static_cast<float>(end_time - op_begin_) / 1000.0f;
    op_cost_total_ += op_cost;
    auto &type_entry = op_times_by_type_[param.type_callback_param];
    type_entry.first++;
    type_entry.second += op_cost;
    auto &name_entry = op_times_by_name_[param.name_callback_param];
    name_entry.first++;
    name_entry.second += op_cost;
    return true;
  };
  return RET_OK;
}
int Benchmark::Init() {
if (this->_flags == nullptr) {
return 1;
@ -550,6 +599,79 @@ int Benchmark::Init() {
return RET_ERROR;
}
if (_flags->runTimeProfiler) {
auto status = InitCallbackParameter();
if (status != RET_OK) {
MS_LOG(ERROR) << "Init callback Parameter failed.";
std::cerr << "Init callback Parameter failed." << std::endl;
return RET_ERROR;
}
}
return RET_OK;
}
int Benchmark::PrintResult(const std::vector<std::string> &title,
                           const std::map<std::string, std::pair<int, float>> &result) {
  // Renders the per-op profiling table: one row per entry in `result`, with
  // columns {label, avg(ms), percent, calledTimes, opTotalTime}, each column
  // padded to the widest cell seen in it.
  constexpr int kColumnNum = 5;
  constexpr size_t kColumnPad = 4;  // spacing appended after the widest cell
  std::vector<size_t> columnLenMax(kColumnNum);
  std::vector<std::vector<std::string>> rows;

  // Widens columnLenMax[col] when a cell of width `len` exceeds the current max.
  auto updateColumnLen = [&columnLenMax](size_t col, size_t len) {
    if (len > columnLenMax.at(col)) {
      columnLenMax.at(col) = len + kColumnPad;
    }
  };

  for (auto &iter : result) {
    char stringBuf[kColumnNum][100] = {};
    std::vector<std::string> columns;

    // Column 0: op name / op type.
    updateColumnLen(0, iter.first.size());
    columns.push_back(iter.first);

    // Columns 1-4 are numeric. snprintf returns a negative int on an encoding
    // error; checking the signed result before widening fixes the original
    // bug where the int was assigned straight into a size_t, so an error
    // return wrapped to a huge value and exploded the column width.
    // NOTE(review): assumes _flags->loopCount > 0 whenever `result` is
    // non-empty — callbacks only fire inside the inference loop; confirm.
    int written = snprintf(stringBuf[1], sizeof(stringBuf[1]), "%f", iter.second.second / _flags->loopCount);
    if (written > 0) {
      updateColumnLen(1, static_cast<size_t>(written));
    }
    columns.emplace_back(stringBuf[1]);

    written = snprintf(stringBuf[2], sizeof(stringBuf[2]), "%f", iter.second.second / op_cost_total_);
    if (written > 0) {
      updateColumnLen(2, static_cast<size_t>(written));
    }
    columns.emplace_back(stringBuf[2]);

    written = snprintf(stringBuf[3], sizeof(stringBuf[3]), "%d", iter.second.first);
    if (written > 0) {
      updateColumnLen(3, static_cast<size_t>(written));
    }
    columns.emplace_back(stringBuf[3]);

    written = snprintf(stringBuf[4], sizeof(stringBuf[4]), "%f", iter.second.second);
    if (written > 0) {
      updateColumnLen(4, static_cast<size_t>(written));
    }
    columns.emplace_back(stringBuf[4]);

    rows.push_back(std::move(columns));
  }

  // Header row: a title wider than every data cell widens its column
  // (without the extra pad). `.at(i)` makes a too-short title vector a
  // detectable error instead of undefined behavior.
  printf("-------------------------------------------------------------------------\n");
  for (int i = 0; i < kColumnNum; i++) {
    auto printBuf = title.at(i);
    if (printBuf.size() > columnLenMax.at(i)) {
      columnLenMax.at(i) = printBuf.size();
    }
    printBuf.resize(columnLenMax.at(i), ' ');
    printf("%s\t", printBuf.c_str());
  }
  printf("\n");
  // Data rows, padded to the final column widths.
  for (auto &row : rows) {
    for (int j = 0; j < kColumnNum; j++) {
      auto printBuf = row[j];
      printBuf.resize(columnLenMax.at(j), ' ');
      printf("%s\t", printBuf.c_str());
    }
    printf("\n");
  }
  return RET_OK;
}
@ -583,16 +705,7 @@ int RunBenchmark(int argc, const char **argv) {
return RET_ERROR;
}
if (flags.device == "GPU") {
status = mBenchmark.RunBenchmark("GPU");
} else if (flags.device == "CPU") {
status = mBenchmark.RunBenchmark("CPU");
} else {
MS_LOG(ERROR) << "Device type" << flags.device << " not support.";
std::cerr << "Device type" << flags.device << " not support." << std::endl;
return RET_ERROR;
}
status = mBenchmark.RunBenchmark();
if (status != 0) {
MS_LOG(ERROR) << "Run Benchmark " << flags.modelPath.substr(flags.modelPath.find_last_of(DELIM_SLASH) + 1).c_str()
<< " Failed : " << status;

View File

@ -28,6 +28,7 @@
#include <vector>
#include <memory>
#include <cfloat>
#include <utility>
#include "include/model.h"
#include "tools/common/flag_parser.h"
#include "src/common/file_utils.h"
@ -64,6 +65,7 @@ class MS_API BenchmarkFlags : public virtual FlagParser {
AddFlag(&BenchmarkFlags::numThreads, "numThreads", "Run threads number", 2);
AddFlag(&BenchmarkFlags::fp16Priority, "fp16Priority", "Priority float16", false);
AddFlag(&BenchmarkFlags::warmUpLoopCount, "warmUpLoopCount", "Run warm up loop", 3);
AddFlag(&BenchmarkFlags::runTimeProfiler, "runTimeProfiler", "Run time profiler", false);
// MarkAccuracy
AddFlag(&BenchmarkFlags::calibDataPath, "calibDataPath", "Calibration data file path", "");
AddFlag(&BenchmarkFlags::calibDataType, "calibDataType", "Calibration data type. FLOAT | INT32 | INT8 | UINT8",
@ -90,6 +92,7 @@ class MS_API BenchmarkFlags : public virtual FlagParser {
int numThreads;
bool fp16Priority;
int warmUpLoopCount;
bool runTimeProfiler;
// MarkAccuracy
std::string calibDataPath;
std::string calibDataType;
@ -108,7 +111,7 @@ class MS_API Benchmark {
virtual ~Benchmark();
int Init();
int RunBenchmark(const std::string &deviceType = "NPU");
int RunBenchmark();
private:
// call GenerateInputData or ReadInputFile to init inputTensors
@ -125,6 +128,10 @@ class MS_API Benchmark {
int CompareOutput();
int InitCallbackParameter();
int PrintResult(const std::vector<std::string> &title, const std::map<std::string, std::pair<int, float>> &result);
template <typename T>
void PrintInputData(tensor::MSTensor *input) {
MS_ASSERT(input != nullptr);
@ -228,6 +235,16 @@ class MS_API Benchmark {
{"INT32", TypeId::kNumberTypeInt32},
{"UINT8", TypeId::kNumberTypeUInt8}};
TypeId msCalibDataType = TypeId::kNumberTypeFloat;
// callback parameters
uint64_t op_begin_ = 0;
int op_call_times_total_ = 0;
float op_cost_total_ = 0.0f;
std::map<std::string, std::pair<int, float>> op_times_by_type_;
std::map<std::string, std::pair<int, float>> op_times_by_name_;
session::KernelCallBack before_call_back_;
session::KernelCallBack after_call_back_;
};
int MS_API RunBenchmark(int argc, const char **argv);

View File

@ -38,13 +38,13 @@ Option<std::string> FlagParser::ParseFlags(int argc, const char *const *argv, bo
}
if (flagItem.find("--") == std::string::npos) {
continue;
return Option<std::string>("Failed: flag " + flagItem + " is not valid.");
}
std::string key;
Option<std::string> value = Option<std::string>(None());
size_t pos = flagItem.find_first_of("=");
size_t pos = flagItem.find_first_of('=');
if (pos == std::string::npos && flagItem.find("--no-") != std::string::npos) {
key = flagItem.substr(FLAG_PREFIX_LEN);
} else if (pos == std::string::npos) {

View File

@ -79,13 +79,8 @@ AwareQuantizer::AwareQuantizer(schema::MetaGraphT *graph, const TypeId &inferTyp
const float stdValue = std::stof(stdValues, &sz);
sz = 0;
const float mean = std::stof(meanValues, &sz);
std::unique_ptr<InputArray> inArr = nullptr;
if (inferType == kNumberTypeFloat) {
inArr.reset(new (std::nothrow) InputArray(mean, stdValue));
} else {
inArr.reset(new (std::nothrow) InputArray(mean, stdValue, TypeId::kNumberTypeInt8));
}
mInputArray = inArr.get();
mInputArray = new (std::nothrow) InputArray(mean, stdValue);
mInputArray->dataType = inferType;
mInputArray->InitQuantParam();
}
@ -132,7 +127,7 @@ STATUS AwareQuantizer::GenerateQuantParam() {
} else {
auto status = quantParamCalcer->Calc(graph, *node);
if (status != RET_OK) {
MS_LOG(ERROR) << "quantParamCalcer failed: " << status << " node: " << node->name.c_str();
MS_LOG(WARNING) << "quantParamCalcer failed: " << status << " node: " << node->name.c_str();
node->quantType = schema::QuantType_QUANT_NONE;
} else {
node->quantType = schema::QuantType_AwareTraining;