!19987 [MS][LITE]fix bug and add new api benchmark

Merge pull request !19987 from 张学同/api13
This commit is contained in:
i-robot 2021-07-12 01:36:58 +00:00 committed by Gitee
commit ec2ec77666
13 changed files with 1986 additions and 929 deletions

View File

@ -41,79 +41,9 @@ CreateTrainSessionProto *CreateTrainSessionCallbackHolder(CreateTrainSessionProt
return proto_;
}
lite::CpuBindMode ModelImpl::GetCpuBindMode() {
  // Map the public-API thread-affinity mode onto the lite runtime's bind
  // mode. Any unrecognized value falls back to NO_BIND.
  const auto affinity_mode = context_->GetThreadAffinityMode();
  if (affinity_mode == 1) {
    return lite::HIGHER_CPU;
  }
  if (affinity_mode == 2) {
    return lite::MID_CPU;
  }
  return lite::NO_BIND;
}
Status ModelImpl::ConverterContext(const std::shared_ptr<Context> &context, lite::Context *model_context) {
// Translates the public C++ API Context into the internal lite::Context:
// copies the threading settings, then builds the device list — CPU is
// mandatory and first, with an optional GPU or NPU as the second entry.
auto device_list = context->MutableDeviceInfo();
if (device_list.size() == 0) {
MS_LOG(ERROR) << "Invalid device list.";
return kLiteInputParamInvalid;
}
// At most two devices are supported: CPU alone, or CPU + (GPU | NPU).
if (device_list.size() > 2) {
MS_LOG(ERROR) << "Only CPU/CPU & GPU/CPU & NPU mode is supported.";
return kLiteInputParamInvalid;
}
model_context->thread_num_ = context->GetThreadNum();
model_context->enable_parallel_ = context->GetEnableParallel();
model_context->affinity_core_list_ = context->GetThreadAffinityCoreList();
model_context->device_list_.clear();
// CPU must occupy slot 0; the branches below rely on this ordering.
if (device_list[0]->GetDeviceType() != kCPU) {
MS_LOG(ERROR) << "CPU context must be enabled and in the first place of device list.";
return kLiteInputParamInvalid;
}
// NOTE(review): Cast<> results are not null-checked here — presumably
// guaranteed valid by the GetDeviceType() checks; confirm.
auto cpu_context = device_list[0]->Cast<CPUDeviceInfo>();
model_context->allocator = cpu_context->GetAllocator();
// Lazily create a default allocator and store it back on the CPU device
// info so subsequent calls reuse the same instance.
if (model_context->allocator == nullptr) {
model_context->allocator = Allocator::Create();
if (model_context->allocator == nullptr) {
MS_LOG(ERROR) << "Create Allocator failed.";
return kLiteNullptr;
}
MS_LOG(DEBUG) << "Set new allocator.";
cpu_context->SetAllocator(model_context->allocator);
}
lite::CpuBindMode mode = GetCpuBindMode();
lite::DeviceInfo cpu_info = {0};
cpu_info.cpu_device_info_ = {cpu_context->GetEnableFP16(), mode};
model_context->device_list_.push_back({lite::DT_CPU, cpu_info, cpu_context->GetProvider(),
cpu_context->GetProviderDevice(), cpu_context->GetAllocator()});
// Optional second device: Mali GPU or Kirin NPU.
if (device_list.size() == 2) {
lite::DeviceInfo device_info = {0};
if (device_list[1]->GetDeviceType() == kMaliGPU) {
auto gpu_context = device_list[1]->Cast<MaliGPUDeviceInfo>();
device_info.gpu_device_info_ = {gpu_context->GetEnableFP16()};
model_context->device_list_.push_back({lite::DT_GPU, device_info, gpu_context->GetProvider(),
gpu_context->GetProviderDevice(), gpu_context->GetAllocator()});
} else if (device_list[1]->GetDeviceType() == kKirinNPU) {
auto npu_context = device_list[1]->Cast<KirinNPUDeviceInfo>();
device_info.npu_device_info_ = {npu_context->GetFrequency()};
model_context->device_list_.push_back({lite::DT_NPU, device_info});
} else {
MS_LOG(ERROR) << "Invalid device.";
return kLiteInputParamInvalid;
}
}
model_context->delegate = context->GetDelegate();
return kSuccess;
}
Status ModelImpl::Build(const void *model_data, size_t data_size, ModelType model_type,
const std::shared_ptr<Context> &ms_context) {
context_ = ms_context;
lite::Context lite_context;
auto status = A2L_ConvertContext(ms_context.get(), &lite_context);
if (status != kSuccess) {

View File

@ -100,8 +100,6 @@ class ModelImpl {
void SetGraph(const std::shared_ptr<Graph> &graph) { graph_ = graph; }
void SetContext(const std::shared_ptr<Context> &context) { context_ = context; }
void SetConfig(const std::shared_ptr<TrainCfg> cfg) { cfg_ = cfg; }
lite::CpuBindMode GetCpuBindMode();
Status ConverterContext(const std::shared_ptr<Context> &context, lite::Context *model_context);
Status RunGraph(const MSKernelCallBack &before, const MSKernelCallBack &after);
};
} // namespace mindspore

View File

@ -331,6 +331,7 @@ if(MSLITE_ENABLE_CONVERTER)
${TEST_SRC}
${TEST_DIR}/st/converter_test.cc
${TEST_DIR}/st/mindrt_parallel_test.cc
${TEST_DIR}/st/graph_test.cc
${TEST_DIR}/st/sub_graph_test.cc
${TEST_DIR}/common/import_from_meta_graphT.cc
${TEST_DIR}/ut/tools/optimizer/fusion/conv_biasadd_fusion_test.cc
@ -384,6 +385,8 @@ if(ENABLE_FP16 AND SUPPORT_TRAIN)
list(APPEND TEST_SRC ${TEST_CASE_KERNEL_FP16_SRC_GRAD})
endif()
file(GLOB_RECURSE API_SRC ${LITE_DI}/src/cxx_api/*.cc)
set(TEST_SRC ${TEST_SRC} ${API_SRC})
add_executable(lite-test ${TEST_SRC})
add_dependencies(lite-test fbs_src)

View File

@ -7,7 +7,10 @@ set(COMMON_SRC
if(NOT TARGET_HIMIX200)
add_executable(benchmark
${CMAKE_CURRENT_SOURCE_DIR}/main.cc
${CMAKE_CURRENT_SOURCE_DIR}/run_benchmark.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark_base.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark.cc
${CMAKE_CURRENT_SOURCE_DIR}/benchmark_unified_api.cc
${COMMON_SRC})
add_dependencies(benchmark fbs_src)

View File

@ -42,69 +42,6 @@
namespace mindspore {
namespace lite {
namespace {
constexpr int kNumPrintMin = 5;
}
static const char *DELIM_COLON = ":";
static const char *DELIM_COMMA = ",";
static const char *DELIM_SLASH = "/";
static const std::unordered_map<TypeId, std::string> TYPE_ID_MAP{
{kNumberTypeFloat16, "Float16"}, {kNumberTypeFloat, "Float32"}, {kNumberTypeFloat32, "Float32"},
{kNumberTypeInt8, "Int8"}, {kNumberTypeInt16, "Int16"}, {kNumberTypeInt, "Int32"},
{kNumberTypeInt32, "Int32"}, {kNumberTypeUInt8, "UInt8"}, {kNumberTypeUInt16, "UInt16"},
{kNumberTypeUInt, "UInt32"}, {kNumberTypeUInt32, "UInt32"}, {kObjectTypeString, "String"},
{kNumberTypeBool, "Bool"}, {kObjectTypeTensorType, "Tensor"}};
static const std::unordered_map<schema::Format, std::string> TENSOR_FORMAT_MAP{
{schema::Format_NCHW, "NCHW"}, {schema::Format_NHWC, "NHWC"}, {schema::Format_NHWC4, "NHWC4"},
{schema::Format_HWKC, "HWKC"}, {schema::Format_HWCK, "HWCK"}, {schema::Format_KCHW, "KCHW"},
{schema::Format_CKHW, "CKHW"}, {schema::Format_KHWC, "KHWC"}, {schema::Format_CHWK, "CHWK"},
{schema::Format_HW, "HW"}, {schema::Format_HW4, "HW4"}, {schema::Format_NC, "NC"},
{schema::Format_NC4, "NC4"}, {schema::Format_NC4HW4, "NC4HW4"}, {schema::Format_NCDHW, "NCDHW"}};
namespace dump {
constexpr auto kConfigPath = "MINDSPORE_DUMP_CONFIG";
constexpr auto kSettings = "common_dump_settings";
constexpr auto kMode = "dump_mode";
constexpr auto kPath = "path";
constexpr auto kNetName = "net_name";
constexpr auto kInputOutput = "input_output";
constexpr auto kKernels = "kernels";
} // namespace dump
int Benchmark::GenerateRandomData(size_t size, void *data, TypeId data_type) {
  // Fills `data` (of byte/element count `size`) with random values drawn
  // from a range appropriate for `data_type`. Unknown types get a
  // deterministic byte ramp. Always returns RET_OK.
  MS_ASSERT(data != nullptr);
  switch (data_type) {
    case kNumberTypeFloat32:
    case kNumberTypeFloat:
      FillInputData<float>(size, data, std::uniform_real_distribution<float>(0.1f, 1.0f));
      break;
    case kNumberTypeFloat64:
      FillInputData<double>(size, data, std::uniform_real_distribution<double>(0.1, 1.0));
      break;
    case kNumberTypeInt64:
      FillInputData<int64_t>(size, data, std::uniform_int_distribution<int64_t>(0, 1));
      break;
    case kNumberTypeInt:
    case kNumberTypeInt32:
      FillInputData<int32_t>(size, data, std::uniform_int_distribution<int32_t>(0, 1));
      break;
    case kNumberTypeInt16:
      FillInputData<int16_t>(size, data, std::uniform_int_distribution<int16_t>(0, 1));
      break;
    case kNumberTypeInt8:
      // Fix: std::uniform_int_distribution over char-sized types (int8_t,
      // uint8_t) is undefined behavior per the C++ standard. Generate with a
      // 16-bit distribution; FillInputData's separate Distribution template
      // parameter narrows the values to the target element type.
      FillInputData<int8_t>(size, data, std::uniform_int_distribution<int16_t>(-127, 127));
      break;
    case kNumberTypeUInt8:
      FillInputData<uint8_t>(size, data, std::uniform_int_distribution<uint16_t>(0, 254));
      break;
    default: {
      // Unknown element type: fill with a repeating 0..255 byte ramp so the
      // input is at least deterministic.
      char *casted_data = static_cast<char *>(data);
      for (size_t i = 0; i < size; i++) {
        casted_data[i] = static_cast<char>(i);
      }
    }
  }
  return RET_OK;
}
int Benchmark::GenerateInputData() {
for (auto tensor : ms_inputs_) {
@ -118,7 +55,7 @@ int Benchmark::GenerateInputData() {
if (tensor->data_type() == kObjectTypeString) {
status = StringsToMSTensor({"you're the best."}, tensor);
} else {
status = GenerateRandomData(tensor->Size(), input_data, tensor->data_type());
status = GenerateRandomData(tensor->Size(), input_data, static_cast<float>(tensor->data_type()));
}
if (status != RET_OK) {
std::cerr << "GenerateRandomData for inTensor failed: " << status << std::endl;
@ -129,25 +66,6 @@ int Benchmark::GenerateInputData() {
return RET_OK;
}
int Benchmark::LoadInput() {
if (flags_->in_data_file_.empty()) {
auto status = GenerateInputData();
if (status != 0) {
std::cerr << "Generate input data error " << status << std::endl;
MS_LOG(ERROR) << "Generate input data error " << status;
return status;
}
} else {
auto status = ReadInputFile();
if (status != 0) {
std::cerr << "ReadInputFile error, " << status << std::endl;
MS_LOG(ERROR) << "ReadInputFile error, " << status;
return status;
}
}
return RET_OK;
}
int Benchmark::ReadInputFile() {
if (ms_inputs_.empty()) {
return RET_OK;
@ -196,49 +114,6 @@ int Benchmark::ReadInputFile() {
return RET_OK;
}
// calibData is FP32
int Benchmark::ReadCalibData() {
  // Parses the benchmark (calibration) data file. Each record is a header
  // line "<tensor_name> <rank> <dim0> <dim1> ..." followed by the tensor
  // payload, which ReadTensorData consumes from the same stream.
  const char *calib_data_path = flags_->benchmark_data_file_.c_str();
  // read calib data
  std::ifstream in_file(calib_data_path);
  if (!in_file.good()) {
    std::cerr << "file: " << calib_data_path << " is not exist" << std::endl;
    MS_LOG(ERROR) << "file: " << calib_data_path << " is not exist";
    return RET_ERROR;
  }
  if (!in_file.is_open()) {
    std::cerr << "file: " << calib_data_path << " open failed" << std::endl;
    MS_LOG(ERROR) << "file: " << calib_data_path << " open failed";
    in_file.close();
    return RET_ERROR;
  }
  MS_LOG(INFO) << "Start reading calibData file";
  std::string line;
  std::string tensor_name;
  // Fix: loop on getline() success instead of `while (!in_file.eof())`.
  // The old pattern ran one extra iteration on the trailing empty line,
  // re-using the stale tensor_name with rank 0 and calling ReadTensorData
  // with bogus arguments. Blank lines are skipped for the same reason.
  while (getline(in_file, line)) {
    if (line.empty()) {
      continue;
    }
    std::stringstream string_line1(line);
    size_t dim = 0;
    string_line1 >> tensor_name >> dim;
    std::vector<size_t> dims;
    for (size_t i = 0; i < dim; i++) {
      size_t tmp_dim;
      string_line1 >> tmp_dim;
      dims.push_back(tmp_dim);
    }
    auto ret = ReadTensorData(in_file, tensor_name, dims);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Read tensor data failed, tensor name: " << tensor_name;
      return RET_ERROR;
    }
  }
  in_file.close();
  MS_LOG(INFO) << "Finish reading calibData file";
  return RET_OK;
}
int Benchmark::ReadTensorData(std::ifstream &in_file_stream, const std::string &tensor_name,
const std::vector<size_t> &dims) {
std::string line;
@ -379,28 +254,6 @@ tensor::MSTensor *Benchmark::GetTensorByNameOrShape(const std::string &node_or_t
return tensor;
}
int Benchmark::CompareStringData(const std::string &name, tensor::MSTensor *tensor) {
auto iter = this->benchmark_data_.find(name);
if (iter != this->benchmark_data_.end()) {
std::vector<std::string> calib_strings = iter->second->strings_data;
std::vector<std::string> output_strings = MSTensorToStrings(tensor);
size_t compare_num = std::min(calib_strings.size(), output_strings.size());
size_t print_num = std::min(compare_num, static_cast<size_t>(kNumPrintMin));
std::cout << "Data of node " << name << " : " << std::endl;
for (size_t i = 0; i < compare_num; i++) {
if (i < print_num) {
std::cout << " " << output_strings[i] << std::endl;
}
if (calib_strings[i] != output_strings[i]) {
MS_LOG(ERROR) << "Compare failed, index: " << i;
return RET_ERROR;
}
}
}
return RET_OK;
}
int Benchmark::CompareDataGetTotalBiasAndSize(const std::string &name, tensor::MSTensor *tensor, float *total_bias,
int *total_size) {
float bias = 0;
@ -698,36 +551,6 @@ int Benchmark::RunBenchmark() {
return RET_OK;
}
void BenchmarkFlags::InitInputDataList() {
char *input_list = new char[this->in_data_file_.length() + 1];
snprintf(input_list, this->in_data_file_.length() + 1, "%s", this->in_data_file_.c_str());
char *cur_input;
const char *split_c = ",";
cur_input = strtok(input_list, split_c);
while (cur_input != nullptr) {
input_data_list_.emplace_back(cur_input);
cur_input = strtok(nullptr, split_c);
}
delete[] input_list;
}
void BenchmarkFlags::InitResizeDimsList() {
  // Parses the inputShapes flag into resize_dims_: shapes are separated by
  // ':' and dims within a shape by ',' (e.g. "1,32,32,32:1,1,32,32,1").
  std::string content = this->resize_dims_in_;
  auto shape_strs = StringSplit(content, std::string(DELIM_COLON));
  for (const auto &shape_str : shape_strs) {
    // Declare the shape per iteration instead of clear()-ing a shared one.
    std::vector<int> shape;
    auto dim_strs = StringSplit(shape_str, std::string(DELIM_COMMA));
    std::cout << "Resize Dims: ";
    for (const auto &dim_str : dim_strs) {
      std::cout << dim_str << " ";
      // std::stoi already returns int; the old static_cast<int> was redundant.
      shape.emplace_back(std::stoi(dim_str));
    }
    std::cout << std::endl;
    this->resize_dims_.emplace_back(std::move(shape));
  }
}
int Benchmark::InitTimeProfilingCallbackParameter() {
// before callback
before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
@ -1022,444 +845,7 @@ int Benchmark::InitDumpTensorDataCallbackParameter() {
return RET_OK;
}
int Benchmark::CheckThreadNumValid() {
  // Validates the numThreads flag: it must be positive, and subgraph-parallel
  // execution needs at least two threads.
  // Fix: a stray `Benchmark::~Benchmark() { delete (session_); }` definition
  // had been pasted into the middle of this function body, which cannot
  // compile; the destructor is defined separately below and has been removed
  // from here.
  if (this->flags_->num_threads_ < 1) {
    MS_LOG(ERROR) << "numThreads:" << this->flags_->num_threads_ << " must be greater than 0";
    std::cerr << "numThreads:" << this->flags_->num_threads_ << " must be greater than 0" << std::endl;
    return RET_ERROR;
  }
  if (flags_->enable_parallel_) {
    if (flags_->num_threads_ < 2) {
      MS_LOG(ERROR) << "enable parallel need more than 1 thread.";
      std::cerr << "enable parallel need more than 1 thread." << std::endl;
      return RET_ERROR;
    }
  }
  return RET_OK;
}
int Benchmark::InitDumpConfigFromJson(char *path) {
  // Loads and validates the MINDSPORE_DUMP_CONFIG json file, fills in
  // defaults for optional keys, and creates the dump output directory
  // "<path>/<net_name>".
  auto real_path = RealPath(path);
  std::ifstream ifs(real_path);
  if (!ifs.good()) {
    MS_LOG(ERROR) << "file: " << real_path << " is not exist";
    return RET_ERROR;
  }
  if (!ifs.is_open()) {
    MS_LOG(ERROR) << "file: " << real_path << " open failed";
    return RET_ERROR;
  }
  try {
    dump_cfg_json_ = nlohmann::json::parse(ifs);
  } catch (const nlohmann::json::parse_error &error) {
    MS_LOG(ERROR) << "parse json file failed, please check your file.";
    return RET_ERROR;
  }
  // "common_dump_settings", "dump_mode" and "path" are mandatory.
  if (dump_cfg_json_[dump::kSettings] == nullptr) {
    MS_LOG(ERROR) << "\"common_dump_settings\" is required.";
    return RET_ERROR;
  }
  if (dump_cfg_json_[dump::kSettings][dump::kMode] == nullptr) {
    MS_LOG(ERROR) << "\"dump_mode\" is required.";
    return RET_ERROR;
  }
  if (dump_cfg_json_[dump::kSettings][dump::kPath] == nullptr) {
    MS_LOG(ERROR) << "\"path\" is required.";
    return RET_ERROR;
  }
  // Optional keys get defaults.
  if (dump_cfg_json_[dump::kSettings][dump::kNetName] == nullptr) {
    dump_cfg_json_[dump::kSettings][dump::kNetName] = "Default";
  }
  if (dump_cfg_json_[dump::kSettings][dump::kInputOutput] == nullptr) {
    dump_cfg_json_[dump::kSettings][dump::kInputOutput] = 0;
  }
  // A non-empty kernel list only makes sense in selective mode (dump_mode 1).
  if (dump_cfg_json_[dump::kSettings][dump::kKernels] != nullptr &&
      !dump_cfg_json_[dump::kSettings][dump::kKernels].empty()) {
    if (dump_cfg_json_[dump::kSettings][dump::kMode] == 0) {
      MS_LOG(ERROR) << R"("dump_mode" should be 1 when "kernels" isn't empty.)";
      return RET_ERROR;
    }
  }
  auto abs_path = dump_cfg_json_[dump::kSettings][dump::kPath].get<std::string>();
  auto net_name = dump_cfg_json_[dump::kSettings][dump::kNetName].get<std::string>();
  // Fix: guard before calling back() — std::string::back() on an empty
  // string is undefined behavior, and "path" may legally be "".
  if (abs_path.empty()) {
    MS_LOG(ERROR) << "\"path\" is empty.";
    return RET_ERROR;
  }
  if (abs_path.back() == '\\' || abs_path.back() == '/') {
    dump_file_output_dir_ = abs_path + net_name;
  } else {
#ifdef _WIN32
    dump_file_output_dir_ = abs_path + "\\" + net_name;
#else
    dump_file_output_dir_ = abs_path + "/" + net_name;
#endif
  }
  auto status = CreateOutputDir(&dump_file_output_dir_);
  if (status != RET_OK) {
    MS_LOG(ERROR) << "create data output directory failed.";
    return RET_ERROR;
  }
  return RET_OK;
}
int Benchmark::InitCallbackParameter() {
  // Install at most one callback pair, chosen by flag priority:
  // time profiling > perf profiling > tensor printing > tensor dumping.
  if (flags_->time_profiling_) {
    return InitTimeProfilingCallbackParameter();
  }
  if (flags_->perf_profiling_) {
    return InitPerfProfilingCallbackParameter();
  }
  if (flags_->print_tensor_data_) {
    return InitPrintTensorDataCallbackParameter();
  }
  if (flags_->dump_tensor_data_) {
    return InitDumpTensorDataCallbackParameter();
  }
  return RET_OK;
}
int Benchmark::Init() {
if (this->flags_ == nullptr) {
return 1;
}
MS_LOG(INFO) << "ModelPath = " << this->flags_->model_file_;
MS_LOG(INFO) << "InDataPath = " << this->flags_->in_data_file_;
MS_LOG(INFO) << "InDataType = " << this->flags_->in_data_type_in_;
MS_LOG(INFO) << "LoopCount = " << this->flags_->loop_count_;
MS_LOG(INFO) << "DeviceType = " << this->flags_->device_;
MS_LOG(INFO) << "AccuracyThreshold = " << this->flags_->accuracy_threshold_;
MS_LOG(INFO) << "WarmUpLoopCount = " << this->flags_->warm_up_loop_count_;
MS_LOG(INFO) << "NumThreads = " << this->flags_->num_threads_;
MS_LOG(INFO) << "Fp16Priority = " << this->flags_->enable_fp16_;
MS_LOG(INFO) << "EnableParallel = " << this->flags_->enable_parallel_;
MS_LOG(INFO) << "calibDataPath = " << this->flags_->benchmark_data_file_;
std::cout << "ModelPath = " << this->flags_->model_file_ << std::endl;
std::cout << "InDataPath = " << this->flags_->in_data_file_ << std::endl;
std::cout << "InDataType = " << this->flags_->in_data_type_in_ << std::endl;
std::cout << "LoopCount = " << this->flags_->loop_count_ << std::endl;
std::cout << "DeviceType = " << this->flags_->device_ << std::endl;
std::cout << "AccuracyThreshold = " << this->flags_->accuracy_threshold_ << std::endl;
std::cout << "WarmUpLoopCount = " << this->flags_->warm_up_loop_count_ << std::endl;
std::cout << "NumThreads = " << this->flags_->num_threads_ << std::endl;
std::cout << "Fp16Priority = " << this->flags_->enable_fp16_ << std::endl;
std::cout << "EnableParallel = " << this->flags_->enable_parallel_ << std::endl;
std::cout << "calibDataPath = " << this->flags_->benchmark_data_file_ << std::endl;
if (this->flags_->loop_count_ < 1) {
MS_LOG(ERROR) << "LoopCount:" << this->flags_->loop_count_ << " must be greater than 0";
std::cerr << "LoopCount:" << this->flags_->loop_count_ << " must be greater than 0" << std::endl;
return RET_ERROR;
}
auto thread_ret = CheckThreadNumValid();
if (thread_ret != RET_OK) {
MS_LOG(ERROR) << "Invalid numThreads.";
std::cerr << "Invalid numThreads." << std::endl;
return RET_ERROR;
}
static std::vector<std::string> CPU_BIND_MODE_MAP = {"NO_BIND", "HIGHER_CPU", "MID_CPU"};
if (this->flags_->cpu_bind_mode_ >= 1) {
MS_LOG(INFO) << "cpuBindMode = " << CPU_BIND_MODE_MAP[this->flags_->cpu_bind_mode_];
std::cout << "cpuBindMode = " << CPU_BIND_MODE_MAP[this->flags_->cpu_bind_mode_] << std::endl;
} else {
MS_LOG(INFO) << "cpuBindMode = NO_BIND";
std::cout << "cpuBindMode = NO_BIND" << std::endl;
}
this->flags_->in_data_type_ = this->flags_->in_data_type_in_ == "img" ? kImage : kBinary;
if (!flags_->benchmark_data_type_.empty()) {
if (data_type_map_.find(flags_->benchmark_data_type_) == data_type_map_.end()) {
MS_LOG(ERROR) << "CalibDataType not supported: " << flags_->benchmark_data_type_.c_str();
return RET_ERROR;
}
msCalibDataType = data_type_map_.at(flags_->benchmark_data_type_);
MS_LOG(INFO) << "CalibDataType = " << flags_->benchmark_data_type_.c_str();
std::cout << "CalibDataType = " << flags_->benchmark_data_type_.c_str() << std::endl;
}
if (flags_->model_file_.empty()) {
MS_LOG(ERROR) << "modelPath is required";
std::cerr << "modelPath is required" << std::endl;
return 1;
}
flags_->InitInputDataList();
flags_->InitResizeDimsList();
if (!flags_->resize_dims_.empty() && !flags_->input_data_list_.empty() &&
flags_->resize_dims_.size() != flags_->input_data_list_.size()) {
MS_LOG(ERROR) << "Size of input resizeDims should be equal to size of input inDataPath";
std::cerr << "Size of input resizeDims should be equal to size of input inDataPath" << std::endl;
return RET_ERROR;
}
if (flags_->device_ != "CPU" && flags_->device_ != "GPU" && flags_->device_ != "NPU") {
MS_LOG(ERROR) << "Device type:" << flags_->device_ << " is not supported.";
std::cerr << "Device type:" << flags_->device_ << " is not supported." << std::endl;
return RET_ERROR;
}
if (flags_->time_profiling_ && flags_->perf_profiling_) {
MS_LOG(INFO) << "time_profiling is enabled, will not run perf_profiling.";
}
// get dump data output path
auto dump_cfg_path = std::getenv(dump::kConfigPath);
if (dump_cfg_path != nullptr) {
flags_->dump_tensor_data_ = true;
if (InitDumpConfigFromJson(dump_cfg_path) != RET_OK) {
MS_LOG(ERROR) << "parse dump config file failed.";
return RET_ERROR;
}
} else {
MS_LOG(INFO) << "No MINDSPORE_DUMP_CONFIG in env, don't need to dump data";
}
auto status = InitCallbackParameter();
if (status != RET_OK) {
MS_LOG(ERROR) << "Init callback Parameter failed.";
std::cerr << "Init callback Parameter failed." << std::endl;
return RET_ERROR;
}
return RET_OK;
}
int Benchmark::PrintResult(const std::vector<std::string> &title,
const std::map<std::string, std::pair<int, float>> &result) {
// Prints a five-column profiling table: op name, average time per loop,
// fraction of total op cost, call count, and accumulated time.
// Column widths are grown to fit the widest cell (plus 4 spaces padding).
std::vector<size_t> columnLenMax(5);
std::vector<std::vector<std::string>> rows;
for (auto &iter : result) {
char stringBuf[5][100] = {};
std::vector<std::string> columns;
size_t len = 0;
// Column 0: op/node name.
len = iter.first.size();
if (len > columnLenMax.at(0)) {
columnLenMax.at(0) = len + 4;
}
columns.push_back(iter.first);
// Column 1: average cost per loop iteration.
// NOTE(review): snprintf returns int; storing into size_t would wrap on a
// (theoretical) negative return — harmless with these fixed formats.
len =
snprintf(stringBuf[1], sizeof(stringBuf[1]), "%f", iter.second.second / static_cast<float>(flags_->loop_count_));
if (len > columnLenMax.at(1)) {
columnLenMax.at(1) = len + 4;
}
columns.emplace_back(stringBuf[1]);
// Column 2: share of the total op cost.
len = snprintf(stringBuf[2], sizeof(stringBuf[2]), "%f", iter.second.second / op_cost_total_);
if (len > columnLenMax.at(2)) {
columnLenMax.at(2) = len + 4;
}
columns.emplace_back(stringBuf[2]);
// Column 3: number of calls.
len = snprintf(stringBuf[3], sizeof(stringBuf[3]), "%d", iter.second.first);
if (len > columnLenMax.at(3)) {
columnLenMax.at(3) = len + 4;
}
columns.emplace_back(stringBuf[3]);
// Column 4: total accumulated time.
len = snprintf(stringBuf[4], sizeof(stringBuf[4]), "%f", iter.second.second);
if (len > columnLenMax.at(4)) {
columnLenMax.at(4) = len + 4;
}
columns.emplace_back(stringBuf[4]);
rows.push_back(columns);
}
printf("-------------------------------------------------------------------------\n");
// Header row, padded to the final column widths.
for (int i = 0; i < 5; i++) {
auto printBuf = title[i];
if (printBuf.size() > columnLenMax.at(i)) {
columnLenMax.at(i) = printBuf.size();
}
printBuf.resize(columnLenMax.at(i), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
// Data rows.
for (auto &row : rows) {
for (int j = 0; j < 5; j++) {
auto printBuf = row[j];
printBuf.resize(columnLenMax.at(j), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
}
return RET_OK;
}
#ifdef ENABLE_ARM64
int Benchmark::PrintPerfResult(const std::vector<std::string> &title,
const std::map<std::string, std::pair<int, struct PerfCount>> &result) {
// ARM64-only variant of PrintResult for perf-event counters: op name,
// per-loop count of event 0 (in thousands), its share of the total, then
// the same pair for event 1. Layout logic mirrors PrintResult.
std::vector<size_t> columnLenMax(5);
std::vector<std::vector<std::string>> rows;
for (auto &iter : result) {
char stringBuf[5][100] = {};
std::vector<std::string> columns;
size_t len = 0;
// Column 0: op/node name.
len = iter.first.size();
if (len > columnLenMax.at(0)) {
columnLenMax.at(0) = len + 4;
}
columns.push_back(iter.first);
// Column 1: event 0 per loop, scaled by thread count, in units of 1e3.
float tmp = float_t(flags_->num_threads_) * iter.second.second.value[0] / float_t(flags_->loop_count_) / 1000.0f;
len = snprintf(stringBuf[1], sizeof(stringBuf[1]), "%.2f", tmp);
if (len > columnLenMax.at(1)) {
columnLenMax.at(1) = len + 4;
}
columns.emplace_back(stringBuf[1]);
// Column 2: event 0 share of the first total.
len = snprintf(stringBuf[2], sizeof(stringBuf[2]), "%f", iter.second.second.value[0] / op_cost_total_);
if (len > columnLenMax.at(2)) {
columnLenMax.at(2) = len + 4;
}
columns.emplace_back(stringBuf[2]);
// Columns 3-4: same computation for event 1.
tmp = float_t(flags_->num_threads_) * iter.second.second.value[1] / float_t(flags_->loop_count_) / 1000.0f;
len = snprintf(stringBuf[3], sizeof(stringBuf[3]), "%.2f", tmp);
if (len > columnLenMax.at(3)) {
columnLenMax.at(3) = len + 4;
}
columns.emplace_back(stringBuf[3]);
len = snprintf(stringBuf[4], sizeof(stringBuf[4]), "%f", iter.second.second.value[1] / op_cost2_total_);
if (len > columnLenMax.at(4)) {
columnLenMax.at(4) = len + 4;
}
columns.emplace_back(stringBuf[4]);
rows.push_back(columns);
}
printf("-------------------------------------------------------------------------\n");
// Header row, padded to the final column widths.
for (int i = 0; i < 5; i++) {
auto printBuf = title[i];
if (printBuf.size() > columnLenMax.at(i)) {
columnLenMax.at(i) = printBuf.size();
}
printBuf.resize(columnLenMax.at(i), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
// Data rows.
for (auto &row : rows) {
for (int j = 0; j < 5; j++) {
auto printBuf = row[j];
printBuf.resize(columnLenMax.at(j), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
}
return RET_OK;
}
#endif
#ifdef SUPPORT_NNIE
int SvpSysInit() {
// One-time bring-up of the HiSilicon MPI stack for NNIE inference:
// reset SYS/VB state, configure a video-buffer pool, then init VB and SYS.
HI_S32 ret = HI_SUCCESS;
VB_CONFIG_S struVbConf;
// Tear down any previous state before re-initializing (return values
// deliberately ignored — a fresh boot has nothing to exit).
HI_MPI_SYS_Exit();
HI_MPI_VB_Exit();
memset(&struVbConf, 0, sizeof(VB_CONFIG_S));
struVbConf.u32MaxPoolCnt = 2;
// NOTE(review): only pool index 1 is configured while u32MaxPoolCnt is 2;
// pool 0 stays zero-sized — presumably intentional, but confirm against
// the HiSilicon MPI documentation.
struVbConf.astCommPool[1].u64BlkSize = 768 * 576 * 2;
struVbConf.astCommPool[1].u32BlkCnt = 1;
ret = HI_MPI_VB_SetConfig((const VB_CONFIG_S *)&struVbConf);
if (HI_SUCCESS != ret) {
MS_LOG(ERROR) << "Error:HI_MPI_VB_SetConf failed!";
return RET_ERROR;
}
ret = HI_MPI_VB_Init();
if (HI_SUCCESS != ret) {
MS_LOG(ERROR) << "Error:HI_MPI_VB_Init failed!";
return RET_ERROR;
}
ret = HI_MPI_SYS_Init();
if (HI_SUCCESS != ret) {
MS_LOG(ERROR) << "Error:HI_MPI_SYS_Init failed!";
return RET_ERROR;
}
return RET_OK;
}
int SvpSysExit() {
  // Tear down the HiSilicon MPI system module, then the video-buffer module.
  if (HI_MPI_SYS_Exit() != HI_SUCCESS) {
    MS_LOG(ERROR) << "Error:HI_MPI_SYS_Exit failed!";
    return RET_ERROR;
  }
  if (HI_MPI_VB_Exit() != HI_SUCCESS) {
    MS_LOG(ERROR) << "Error:HI_MPI_VB_Exit failed!";
    return RET_ERROR;
  }
  return RET_OK;
}
#endif
Benchmark::~Benchmark() {
  // Free the heap-owned calibration entries, then the session; on NNIE
  // builds also shut down the SVP system.
  for (auto &entry : this->benchmark_data_) {
    delete entry.second;
  }
  this->benchmark_data_.clear();
  delete session_;
#ifdef SUPPORT_NNIE
  SvpSysExit();
#endif
}
int RunBenchmark(int argc, const char **argv) {
BenchmarkFlags flags;
Option<std::string> err = flags.ParseFlags(argc, argv);
#ifdef SUPPORT_NNIE
SvpSysInit();
#endif
if (err.IsSome()) {
std::cerr << err.Get() << std::endl;
std::cerr << flags.Usage() << std::endl;
return RET_ERROR;
}
if (flags.help) {
std::cerr << flags.Usage() << std::endl;
return RET_OK;
}
Benchmark benchmark(&flags);
auto status = benchmark.Init();
if (status != 0) {
MS_LOG(ERROR) << "Benchmark init Error : " << status;
std::cerr << "Benchmark init Error : " << status << std::endl;
return RET_ERROR;
}
status = benchmark.RunBenchmark();
if (status != 0) {
MS_LOG(ERROR) << "Run Benchmark "
<< flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
<< " Failed : " << status;
std::cerr << "Run Benchmark " << flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
<< " Failed : " << status << std::endl;
return RET_ERROR;
}
MS_LOG(INFO) << "Run Benchmark " << flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
<< " Success.";
std::cout << "Run Benchmark " << flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
<< " Success." << std::endl;
return RET_OK;
}
} // namespace lite
} // namespace mindspore

View File

@ -31,6 +31,7 @@
#include <cfloat>
#include <utility>
#include <nlohmann/json.hpp>
#include "tools/benchmark/benchmark_base.h"
#include "include/model.h"
#include "tools/common/flag_parser.h"
#include "src/common/file_utils.h"
@ -38,283 +39,57 @@
#include "include/lite_session.h"
namespace mindspore::lite {
enum MS_API InDataType { kImage = 0, kBinary = 1 };
constexpr float relativeTolerance = 1e-5;
constexpr float absoluteTolerance = 1e-8;
#ifdef ENABLE_ARM64
struct PerfResult {
int64_t nr;
struct {
int64_t value;
int64_t id;
} values[2];
};
struct PerfCount {
int64_t value[2];
};
#endif
struct MS_API CheckTensor {
CheckTensor(const std::vector<size_t> &shape, const std::vector<float> &data,
const std::vector<std::string> &strings_data = {""}) {
this->shape = shape;
this->data = data;
this->strings_data = strings_data;
}
std::vector<size_t> shape;
std::vector<float> data;
std::vector<std::string> strings_data;
};
class MS_API BenchmarkFlags : public virtual FlagParser {
class MS_API Benchmark : public BenchmarkBase {
public:
BenchmarkFlags() {
// common
AddFlag(&BenchmarkFlags::model_file_, "modelFile", "Input model file", "");
AddFlag(&BenchmarkFlags::in_data_file_, "inDataFile", "Input data file, if not set, use random input", "");
AddFlag(&BenchmarkFlags::device_, "device", "CPU | GPU | NPU", "CPU");
AddFlag(&BenchmarkFlags::cpu_bind_mode_, "cpuBindMode",
"Input 0 for NO_BIND, 1 for HIGHER_CPU, 2 for MID_CPU, default value: 1", 1);
// MarkPerformance
AddFlag(&BenchmarkFlags::loop_count_, "loopCount", "Run loop count", 10);
AddFlag(&BenchmarkFlags::num_threads_, "numThreads", "Run threads number", 2);
AddFlag(&BenchmarkFlags::enable_fp16_, "enableFp16", "Enable float16", false);
AddFlag(&BenchmarkFlags::enable_parallel_, "enableParallel", "Enable subgraph parallel : true | false", false);
AddFlag(&BenchmarkFlags::warm_up_loop_count_, "warmUpLoopCount", "Run warm up loop", 3);
AddFlag(&BenchmarkFlags::time_profiling_, "timeProfiling", "Run time profiling", false);
AddFlag(&BenchmarkFlags::perf_profiling_, "perfProfiling",
"Perf event profiling(only instructions statics enabled currently)", false);
AddFlag(&BenchmarkFlags::perf_event_, "perfEvent", "CYCLE|CACHE|STALL", "CYCLE");
// MarkAccuracy
AddFlag(&BenchmarkFlags::benchmark_data_file_, "benchmarkDataFile", "Benchmark data file path", "");
AddFlag(&BenchmarkFlags::benchmark_data_type_, "benchmarkDataType",
"Benchmark data type. FLOAT | INT32 | INT8 | UINT8", "FLOAT");
AddFlag(&BenchmarkFlags::accuracy_threshold_, "accuracyThreshold", "Threshold of accuracy", 0.5);
AddFlag(&BenchmarkFlags::resize_dims_in_, "inputShapes",
"Shape of input data, the format should be NHWC. e.g. 1,32,32,32:1,1,32,32,1", "");
}
~BenchmarkFlags() override = default;
void InitInputDataList();
void InitResizeDimsList();
public:
// common
std::string model_file_;
std::string in_data_file_;
std::vector<std::string> input_data_list_;
InDataType in_data_type_ = kBinary;
std::string in_data_type_in_ = "bin";
int cpu_bind_mode_ = 1;
// MarkPerformance
int loop_count_ = 10;
int num_threads_ = 2;
bool enable_fp16_ = false;
bool enable_parallel_ = false;
int warm_up_loop_count_ = 3;
// MarkAccuracy
std::string benchmark_data_file_;
std::string benchmark_data_type_ = "FLOAT";
float accuracy_threshold_ = 0.5;
// Resize
std::string resize_dims_in_;
std::vector<std::vector<int>> resize_dims_;
std::string device_ = "CPU";
bool time_profiling_ = false;
bool perf_profiling_ = false;
std::string perf_event_ = "CYCLE";
bool dump_tensor_data_ = false;
bool print_tensor_data_ = false;
};
class MS_API Benchmark {
public:
explicit Benchmark(BenchmarkFlags *flags) : flags_(flags) {}
explicit Benchmark(BenchmarkFlags *flags) : BenchmarkBase(flags) {}
virtual ~Benchmark();
int Init();
int RunBenchmark();
private:
// call GenerateInputData or ReadInputFile to init inputTensors
int LoadInput();
int RunBenchmark() override;
protected:
// call GenerateRandomData to fill inputTensors
int GenerateInputData();
int GenerateInputData() override;
int GenerateRandomData(size_t size, void *data, TypeId data_type);
int ReadInputFile() override;
int ReadInputFile();
int ReadCalibData();
int ReadTensorData(std::ifstream &in_file_stream, const std::string &tensor_name, const std::vector<size_t> &dims);
int ReadTensorData(std::ifstream &in_file_stream, const std::string &tensor_name,
const std::vector<size_t> &dims) override;
void InitContext(const std::shared_ptr<Context> &context);
int CompareOutput();
int CompareOutput() override;
tensor::MSTensor *GetTensorByNameOrShape(const std::string &node_or_tensor_name, const std::vector<size_t> &dims);
tensor::MSTensor *GetTensorByNodeShape(const std::vector<size_t> &node_shape);
int CompareStringData(const std::string &name, tensor::MSTensor *tensor);
int CompareDataGetTotalBiasAndSize(const std::string &name, tensor::MSTensor *tensor, float *total_bias,
int *total_size);
int InitDumpConfigFromJson(char *path);
int InitTimeProfilingCallbackParameter() override;
int InitCallbackParameter();
int InitPerfProfilingCallbackParameter() override;
int InitTimeProfilingCallbackParameter();
int InitDumpTensorDataCallbackParameter() override;
int InitPerfProfilingCallbackParameter();
int InitDumpTensorDataCallbackParameter();
int InitPrintTensorDataCallbackParameter();
int PrintResult(const std::vector<std::string> &title, const std::map<std::string, std::pair<int, float>> &result);
#ifdef ENABLE_ARM64
int PrintPerfResult(const std::vector<std::string> &title,
const std::map<std::string, std::pair<int, struct PerfCount>> &result);
#endif
int InitPrintTensorDataCallbackParameter() override;
int PrintInputData();
// tensorData need to be converter first
template <typename T>
float CompareData(const std::string &nodeName, const std::vector<int> &msShape, const void *tensor_data) {
  // Compares one output tensor (elements of type T) against the calibration
  // entry recorded under nodeName. Returns the mean relative bias on
  // success. NOTE(review): failures return RET_ERROR (an int error code)
  // through a float return type — callers must treat large values as
  // failure; confirm.
  const T *msTensorData = static_cast<const T *>(tensor_data);
  auto iter = this->benchmark_data_.find(nodeName);
  if (iter == this->benchmark_data_.end()) {
    // Fix: the original wrote `<< "%s is not in Source Model output",
    // nodeName.c_str()` — the comma operator discarded the name and logged
    // the literal printf format string instead.
    MS_LOG(INFO) << nodeName.c_str() << " is not in Source Model output";
    return RET_ERROR;
  }
  // The shapes must match exactly (element-wise, after casting to size_t).
  std::vector<size_t> castedMSShape;
  size_t shapeSize = 1;
  for (int64_t dim : msShape) {
    castedMSShape.push_back(size_t(dim));
    shapeSize *= dim;
  }
  CheckTensor *calibTensor = iter->second;
  if (calibTensor->shape != castedMSShape) {
    std::ostringstream oss;
    oss << "Shape of mslite output(";
    for (auto dim : castedMSShape) {
      oss << dim << ",";
    }
    oss << ") and shape source model output(";
    for (auto dim : calibTensor->shape) {
      oss << dim << ",";
    }
    oss << ") are different";
    std::cerr << oss.str() << std::endl;
    MS_LOG(ERROR) << oss.str().c_str();
    return RET_ERROR;
  }
  size_t errorCount = 0;
  float meanError = 0;
  std::cout << "Data of node " << nodeName << " : ";
  for (size_t j = 0; j < shapeSize; j++) {
    // Echo at most the first 50 values for manual inspection.
    if (j < 50) {
      std::cout << static_cast<float>(msTensorData[j]) << " ";
    }
    if (std::isnan(msTensorData[j]) || std::isinf(msTensorData[j])) {
      std::cerr << "Output tensor has nan or inf data, compare fail" << std::endl;
      MS_LOG(ERROR) << "Output tensor has nan or inf data, compare fail";
      return RET_ERROR;
    }
    auto tolerance = absoluteTolerance + relativeTolerance * fabs(calibTensor->data.at(j));
    auto absoluteError = std::fabs(msTensorData[j] - calibTensor->data.at(j));
    if (absoluteError > tolerance) {
      if (fabs(calibTensor->data.at(j) - 0.0f) < FLT_EPSILON) {
        // Calibration value is (near) zero: relative error is meaningless,
        // so only count absolute errors above 1e-5.
        if (absoluteError > 1e-5) {
          meanError += absoluteError;
          errorCount++;
        } else {
          continue;
        }
      } else {
        // just assume that atol = rtol
        meanError += absoluteError / (fabs(calibTensor->data.at(j)) + FLT_MIN);
        errorCount++;
      }
    }
  }
  std::cout << std::endl;
  if (meanError > 0.0f) {
    meanError /= errorCount;
  }
  if (meanError <= 0.0000001) {
    std::cout << "Mean bias of node/tensor " << nodeName << " : 0%" << std::endl;
  } else {
    std::cout << "Mean bias of node/tensor " << nodeName << " : " << meanError * 100 << "%" << std::endl;
  }
  return meanError;
}
// Fills `size` bytes at `data` with values drawn from `distribution`,
// narrowed to the element type T.
template <typename T, typename Distribution>
void FillInputData(int size, void *data, Distribution distribution) {
  MS_ASSERT(data != nullptr);
  T *typed_data = static_cast<T *>(data);
  const int element_count = size / static_cast<int>(sizeof(T));
  for (int i = 0; i < element_count; ++i) {
    typed_data[i] = static_cast<T>(distribution(random_engine_));
  }
}
// Runs the timed benchmark loops (warm-up + measured iterations).
int MarkPerformance();
// Runs one inference and compares outputs against the calibration data.
int MarkAccuracy();
int CheckThreadNumValid();
private:
BenchmarkFlags *flags_;  // not owned; parsed command-line flags
session::LiteSession *session_{nullptr};
std::vector<mindspore::tensor::MSTensor *> ms_inputs_;
std::unordered_map<std::string, std::vector<mindspore::tensor::MSTensor *>> ms_outputs_;
// Golden outputs keyed by node/tensor name; entries are owned and freed in
// the destructor.
std::unordered_map<std::string, CheckTensor *> benchmark_data_;
std::unordered_map<std::string, TypeId> data_type_map_{{"FLOAT", TypeId::kNumberTypeFloat},
{"INT8", TypeId::kNumberTypeInt8},
{"INT32", TypeId::kNumberTypeInt32},
{"UINT8", TypeId::kNumberTypeUInt8}};
TypeId msCalibDataType = TypeId::kNumberTypeFloat;
// callback parameters
uint64_t op_begin_ = 0;
int op_call_times_total_ = 0;
float op_cost_total_ = 0.0f;
std::map<std::string, std::pair<int, float>> op_times_by_type_;
std::map<std::string, std::pair<int, float>> op_times_by_name_;
// dump data
nlohmann::json dump_cfg_json_;
std::string dump_file_output_dir_;
#ifdef ENABLE_ARM64
// perf_event file descriptors and per-op counter accumulators
int perf_fd = 0;
int perf_fd2 = 0;
float op_cost2_total_ = 0.0f;
std::map<std::string, std::pair<int, struct PerfCount>> op_perf_by_type_;
std::map<std::string, std::pair<int, struct PerfCount>> op_perf_by_name_;
#endif
KernelCallBack before_call_back_ = nullptr;
KernelCallBack after_call_back_ = nullptr;
std::mt19937 random_engine_;
};
int MS_API RunBenchmark(int argc, const char **argv);
} // namespace mindspore::lite
#endif // MINNIE_BENCHMARK_BENCHMARK_H_

View File

@ -0,0 +1,606 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tools/benchmark/benchmark_base.h"
#define __STDC_FORMAT_MACROS
#include <cinttypes>
#undef __STDC_FORMAT_MACROS
#include <algorithm>
#include <utility>
#include <functional>
#include "include/context.h"
#include "include/ms_tensor.h"
#include "include/version.h"
#include "schema/model_generated.h"
#include "src/common/common.h"
#include "src/tensor.h"
#ifdef ENABLE_ARM64
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <asm/unistd.h>
#include <unistd.h>
#endif
#ifdef SUPPORT_NNIE
#include "include/hi_common.h"
#include "include/hi_comm_vb.h"
#include "include/mpi_sys.h"
#include "include/mpi_vb.h"
#endif
namespace mindspore {
namespace lite {
// Human-readable names for tensor element types, used in logs and dump files.
const std::unordered_map<int, std::string> TYPE_ID_MAP{
{kNumberTypeFloat16, "Float16"}, {kNumberTypeFloat, "Float32"}, {kNumberTypeFloat32, "Float32"},
{kNumberTypeInt8, "Int8"}, {kNumberTypeInt16, "Int16"}, {kNumberTypeInt, "Int32"},
{kNumberTypeInt32, "Int32"}, {kNumberTypeUInt8, "UInt8"}, {kNumberTypeUInt16, "UInt16"},
{kNumberTypeUInt, "UInt32"}, {kNumberTypeUInt32, "UInt32"}, {kObjectTypeString, "String"},
{kNumberTypeBool, "Bool"}, {kObjectTypeTensorType, "Tensor"}};
// Human-readable names for tensor layouts/formats.
const std::unordered_map<schema::Format, std::string> TENSOR_FORMAT_MAP{
{schema::Format_NCHW, "NCHW"}, {schema::Format_NHWC, "NHWC"}, {schema::Format_NHWC4, "NHWC4"},
{schema::Format_HWKC, "HWKC"}, {schema::Format_HWCK, "HWCK"}, {schema::Format_KCHW, "KCHW"},
{schema::Format_CKHW, "CKHW"}, {schema::Format_KHWC, "KHWC"}, {schema::Format_CHWK, "CHWK"},
{schema::Format_HW, "HW"}, {schema::Format_HW4, "HW4"}, {schema::Format_NC, "NC"},
{schema::Format_NC4, "NC4"}, {schema::Format_NC4HW4, "NC4HW4"}, {schema::Format_NCDHW, "NCDHW"}};
// Fills `size` bytes at `data` with random values appropriate for
// `data_type`. Unknown types fall back to a deterministic 0,1,2,... byte
// pattern. Always returns RET_OK.
int BenchmarkBase::GenerateRandomData(size_t size, void *data, int data_type) {
  MS_ASSERT(data != nullptr);
  switch (data_type) {
    case kNumberTypeFloat32:
    case kNumberTypeFloat:
      FillInputData<float>(size, data, std::uniform_real_distribution<float>(0.1f, 1.0f));
      break;
    case kNumberTypeFloat64:
      FillInputData<double>(size, data, std::uniform_real_distribution<double>(0.1, 1.0));
      break;
    case kNumberTypeInt64:
      FillInputData<int64_t>(size, data, std::uniform_int_distribution<int64_t>(0, 1));
      break;
    case kNumberTypeInt:
    case kNumberTypeInt32:
      FillInputData<int32_t>(size, data, std::uniform_int_distribution<int32_t>(0, 1));
      break;
    case kNumberTypeInt16:
      FillInputData<int16_t>(size, data, std::uniform_int_distribution<int16_t>(0, 1));
      break;
    case kNumberTypeInt8:
      // std::uniform_int_distribution<int8_t> is undefined behavior: the
      // standard forbids character-sized IntType. Draw int16_t in the same
      // range and let FillInputData narrow to int8_t.
      FillInputData<int8_t>(size, data, std::uniform_int_distribution<int16_t>(-127, 127));
      break;
    case kNumberTypeUInt8:
      // Same UB for uint8_t; draw uint16_t and narrow.
      FillInputData<uint8_t>(size, data, std::uniform_int_distribution<uint16_t>(0, 254));
      break;
    default:
      char *casted_data = static_cast<char *>(data);
      for (size_t i = 0; i < size; i++) {
        casted_data[i] = static_cast<char>(i);
      }
  }
  return RET_OK;
}
int BenchmarkBase::LoadInput() {
if (flags_->in_data_file_.empty()) {
auto status = GenerateInputData();
if (status != 0) {
std::cerr << "Generate input data error " << status << std::endl;
MS_LOG(ERROR) << "Generate input data error " << status;
return status;
}
} else {
auto status = ReadInputFile();
if (status != 0) {
std::cerr << "ReadInputFile error, " << status << std::endl;
MS_LOG(ERROR) << "ReadInputFile error, " << status;
return status;
}
}
return RET_OK;
}
// calibData is FP32
// Reads the calibration (golden output) file: each entry is a header line
// "name dim_count dim0 dim1 ..." followed by data lines consumed by the
// subclass's ReadTensorData(). Calibration values are stored as FP32.
// Returns RET_OK on success, RET_ERROR on I/O or parse failure.
int BenchmarkBase::ReadCalibData() {
  const char *calib_data_path = flags_->benchmark_data_file_.c_str();
  // read calib data
  std::ifstream in_file(calib_data_path);
  if (!in_file.good()) {
    std::cerr << "file: " << calib_data_path << " is not exist" << std::endl;
    MS_LOG(ERROR) << "file: " << calib_data_path << " is not exist";
    return RET_ERROR;
  }
  if (!in_file.is_open()) {
    std::cerr << "file: " << calib_data_path << " open failed" << std::endl;
    MS_LOG(ERROR) << "file: " << calib_data_path << " open failed";
    in_file.close();
    return RET_ERROR;
  }
  MS_LOG(INFO) << "Start reading calibData file";
  std::string line;
  std::string tensor_name;
  // Fixed: the original looped on `!in_file.eof()` with an unchecked getline,
  // which processed a stale/empty line after the last record. Loop on the
  // getline() result instead.
  while (std::getline(in_file, line)) {
    std::stringstream string_line1(line);
    size_t dim = 0;
    string_line1 >> tensor_name >> dim;
    std::vector<size_t> dims;
    for (size_t i = 0; i < dim; i++) {
      size_t tmp_dim = 0;  // stays 0 (instead of indeterminate) if the header is malformed
      string_line1 >> tmp_dim;
      dims.push_back(tmp_dim);
    }
    auto ret = ReadTensorData(in_file, tensor_name, dims);
    if (ret != RET_OK) {
      MS_LOG(ERROR) << "Read tensor data failed, tensor name: " << tensor_name;
      return RET_ERROR;
    }
  }
  in_file.close();
  MS_LOG(INFO) << "Finish reading calibData file";
  return RET_OK;
}
int BenchmarkBase::CompareStringData(const std::string &name, tensor::MSTensor *tensor) {
auto iter = this->benchmark_data_.find(name);
if (iter != this->benchmark_data_.end()) {
std::vector<std::string> calib_strings = iter->second->strings_data;
std::vector<std::string> output_strings = MSTensorToStrings(tensor);
size_t compare_num = std::min(calib_strings.size(), output_strings.size());
size_t print_num = std::min(compare_num, static_cast<size_t>(kNumPrintMin));
std::cout << "Data of node " << name << " : " << std::endl;
for (size_t i = 0; i < compare_num; i++) {
if (i < print_num) {
std::cout << " " << output_strings[i] << std::endl;
}
if (calib_strings[i] != output_strings[i]) {
MS_LOG(ERROR) << "Compare failed, index: " << i;
return RET_ERROR;
}
}
}
return RET_OK;
}
void BenchmarkFlags::InitInputDataList() {
char *input_list = new char[this->in_data_file_.length() + 1];
snprintf(input_list, this->in_data_file_.length() + 1, "%s", this->in_data_file_.c_str());
char *cur_input;
const char *split_c = ",";
cur_input = strtok(input_list, split_c);
while (cur_input != nullptr) {
input_data_list_.emplace_back(cur_input);
cur_input = strtok(nullptr, split_c);
}
delete[] input_list;
}
// Parses the -inputShapes flag ("n,h,w,c:n,h,w,c") into resize_dims_,
// echoing each parsed shape to stdout.
void BenchmarkFlags::InitResizeDimsList() {
  const std::string &content = this->resize_dims_in_;
  for (const auto &shape_str : StringSplit(content, std::string(DELIM_COLON))) {
    std::vector<int> shape;
    std::cout << "Resize Dims: ";
    for (const auto &dim_str : StringSplit(shape_str, std::string(DELIM_COMMA))) {
      std::cout << dim_str << " ";
      shape.emplace_back(static_cast<int>(std::stoi(dim_str)));
    }
    std::cout << std::endl;
    this->resize_dims_.emplace_back(std::move(shape));
  }
}
int BenchmarkBase::CheckThreadNumValid() {
if (this->flags_->num_threads_ < 1) {
MS_LOG(ERROR) << "numThreads:" << this->flags_->num_threads_ << " must be greater than 0";
std::cerr << "numThreads:" << this->flags_->num_threads_ << " must be greater than 0" << std::endl;
return RET_ERROR;
}
if (flags_->enable_parallel_) {
if (flags_->num_threads_ < 2) {
MS_LOG(ERROR) << "enable parallel need more than 1 thread.";
std::cerr << "enable parallel need more than 1 thread." << std::endl;
return RET_ERROR;
}
}
return RET_OK;
}
// Loads and validates the dump configuration JSON pointed to by `path` (the
// value of the MINDSPORE_DUMP_CONFIG env var). Required keys:
// common_dump_settings.{dump_mode, path}; optional keys net_name and
// input_output get defaults. Derives and creates dump_file_output_dir_.
// Returns RET_OK on success, RET_ERROR on any validation failure.
int BenchmarkBase::InitDumpConfigFromJson(char *path) {
auto real_path = RealPath(path);
std::ifstream ifs(real_path);
if (!ifs.good()) {
MS_LOG(ERROR) << "file: " << real_path << " is not exist";
return RET_ERROR;
}
if (!ifs.is_open()) {
MS_LOG(ERROR) << "file: " << real_path << " open failed";
return RET_ERROR;
}
try {
dump_cfg_json_ = nlohmann::json::parse(ifs);
} catch (const nlohmann::json::parse_error &error) {
MS_LOG(ERROR) << "parse json file failed, please check your file.";
return RET_ERROR;
}
// NOTE(review): operator[] on a non-const nlohmann::json inserts a null
// member when the key is missing, so the == nullptr checks below also
// mutate dump_cfg_json_; json::contains() would avoid that — confirm the
// insertion is harmless here.
if (dump_cfg_json_[dump::kSettings] == nullptr) {
MS_LOG(ERROR) << "\"common_dump_settings\" is required.";
return RET_ERROR;
}
if (dump_cfg_json_[dump::kSettings][dump::kMode] == nullptr) {
MS_LOG(ERROR) << "\"dump_mode\" is required.";
return RET_ERROR;
}
if (dump_cfg_json_[dump::kSettings][dump::kPath] == nullptr) {
MS_LOG(ERROR) << "\"path\" is required.";
return RET_ERROR;
}
// Optional keys: fill in documented defaults.
if (dump_cfg_json_[dump::kSettings][dump::kNetName] == nullptr) {
dump_cfg_json_[dump::kSettings][dump::kNetName] = "Default";
}
if (dump_cfg_json_[dump::kSettings][dump::kInputOutput] == nullptr) {
dump_cfg_json_[dump::kSettings][dump::kInputOutput] = 0;
}
// A non-empty kernel list only makes sense in selective-dump mode (1).
if (dump_cfg_json_[dump::kSettings][dump::kKernels] != nullptr &&
!dump_cfg_json_[dump::kSettings][dump::kKernels].empty()) {
if (dump_cfg_json_[dump::kSettings][dump::kMode] == 0) {
MS_LOG(ERROR) << R"("dump_mode" should be 1 when "kernels" isn't empty.)";
return RET_ERROR;
}
}
// Output directory is <path>/<net_name>, inserting a separator if needed.
auto abs_path = dump_cfg_json_[dump::kSettings][dump::kPath].get<std::string>();
auto net_name = dump_cfg_json_[dump::kSettings][dump::kNetName].get<std::string>();
if (abs_path.back() == '\\' || abs_path.back() == '/') {
dump_file_output_dir_ = abs_path + net_name;
} else {
#ifdef _WIN32
dump_file_output_dir_ = abs_path + "\\" + net_name;
#else
dump_file_output_dir_ = abs_path + "/" + net_name;
#endif
}
auto status = CreateOutputDir(&dump_file_output_dir_);
if (status != RET_OK) {
MS_LOG(ERROR) << "create data output directory failed.";
return RET_ERROR;
}
return RET_OK;
}
// Installs at most one callback pair, chosen by flag priority:
// time profiling > perf profiling > tensor printing > tensor dumping.
int BenchmarkBase::InitCallbackParameter() {
  if (flags_->time_profiling_) {
    return InitTimeProfilingCallbackParameter();
  }
  if (flags_->perf_profiling_) {
    return InitPerfProfilingCallbackParameter();
  }
  if (flags_->print_tensor_data_) {
    return InitPrintTensorDataCallbackParameter();
  }
  if (flags_->dump_tensor_data_) {
    return InitDumpTensorDataCallbackParameter();
  }
  return RET_OK;
}
// Validates the parsed flags, echoes the effective configuration to the log
// and stdout, and initializes derived state: input data type, calibration
// data type, input/resize lists, optional dump configuration (from the
// MINDSPORE_DUMP_CONFIG env var) and session callbacks.
// Returns RET_OK on success, a non-zero value otherwise.
int BenchmarkBase::Init() {
  if (this->flags_ == nullptr) {
    return 1;
  }
  MS_LOG(INFO) << "ModelPath = " << this->flags_->model_file_;
  MS_LOG(INFO) << "InDataPath = " << this->flags_->in_data_file_;
  MS_LOG(INFO) << "InDataType = " << this->flags_->in_data_type_in_;
  MS_LOG(INFO) << "LoopCount = " << this->flags_->loop_count_;
  MS_LOG(INFO) << "DeviceType = " << this->flags_->device_;
  MS_LOG(INFO) << "AccuracyThreshold = " << this->flags_->accuracy_threshold_;
  MS_LOG(INFO) << "WarmUpLoopCount = " << this->flags_->warm_up_loop_count_;
  MS_LOG(INFO) << "NumThreads = " << this->flags_->num_threads_;
  MS_LOG(INFO) << "Fp16Priority = " << this->flags_->enable_fp16_;
  MS_LOG(INFO) << "EnableParallel = " << this->flags_->enable_parallel_;
  MS_LOG(INFO) << "calibDataPath = " << this->flags_->benchmark_data_file_;
  std::cout << "ModelPath = " << this->flags_->model_file_ << std::endl;
  std::cout << "InDataPath = " << this->flags_->in_data_file_ << std::endl;
  std::cout << "InDataType = " << this->flags_->in_data_type_in_ << std::endl;
  std::cout << "LoopCount = " << this->flags_->loop_count_ << std::endl;
  std::cout << "DeviceType = " << this->flags_->device_ << std::endl;
  std::cout << "AccuracyThreshold = " << this->flags_->accuracy_threshold_ << std::endl;
  std::cout << "WarmUpLoopCount = " << this->flags_->warm_up_loop_count_ << std::endl;
  std::cout << "NumThreads = " << this->flags_->num_threads_ << std::endl;
  std::cout << "Fp16Priority = " << this->flags_->enable_fp16_ << std::endl;
  std::cout << "EnableParallel = " << this->flags_->enable_parallel_ << std::endl;
  std::cout << "calibDataPath = " << this->flags_->benchmark_data_file_ << std::endl;
  if (this->flags_->loop_count_ < 1) {
    MS_LOG(ERROR) << "LoopCount:" << this->flags_->loop_count_ << " must be greater than 0";
    std::cerr << "LoopCount:" << this->flags_->loop_count_ << " must be greater than 0" << std::endl;
    return RET_ERROR;
  }
  auto thread_ret = CheckThreadNumValid();
  if (thread_ret != RET_OK) {
    MS_LOG(ERROR) << "Invalid numThreads.";
    std::cerr << "Invalid numThreads." << std::endl;
    return RET_ERROR;
  }
  static std::vector<std::string> CPU_BIND_MODE_MAP = {"NO_BIND", "HIGHER_CPU", "MID_CPU"};
  // Fixed: cpu_bind_mode_ comes straight from the command line and the
  // original only guarded the lower bound, so any value > 2 indexed past the
  // end of CPU_BIND_MODE_MAP (undefined behavior). Guard both bounds and
  // fall back to the NO_BIND message for out-of-range values.
  if (this->flags_->cpu_bind_mode_ >= 1 &&
      this->flags_->cpu_bind_mode_ < static_cast<int>(CPU_BIND_MODE_MAP.size())) {
    MS_LOG(INFO) << "cpuBindMode = " << CPU_BIND_MODE_MAP[this->flags_->cpu_bind_mode_];
    std::cout << "cpuBindMode = " << CPU_BIND_MODE_MAP[this->flags_->cpu_bind_mode_] << std::endl;
  } else {
    MS_LOG(INFO) << "cpuBindMode = NO_BIND";
    std::cout << "cpuBindMode = NO_BIND" << std::endl;
  }
  this->flags_->in_data_type_ = this->flags_->in_data_type_in_ == "img" ? kImage : kBinary;
  if (!flags_->benchmark_data_type_.empty()) {
    if (data_type_map_.find(flags_->benchmark_data_type_) == data_type_map_.end()) {
      MS_LOG(ERROR) << "CalibDataType not supported: " << flags_->benchmark_data_type_.c_str();
      return RET_ERROR;
    }
    msCalibDataType = data_type_map_.at(flags_->benchmark_data_type_);
    MS_LOG(INFO) << "CalibDataType = " << flags_->benchmark_data_type_.c_str();
    std::cout << "CalibDataType = " << flags_->benchmark_data_type_.c_str() << std::endl;
  }
  if (flags_->model_file_.empty()) {
    MS_LOG(ERROR) << "modelPath is required";
    std::cerr << "modelPath is required" << std::endl;
    return 1;  // NOTE(review): other failures return RET_ERROR; kept as 1 for existing callers
  }
  flags_->InitInputDataList();
  flags_->InitResizeDimsList();
  if (!flags_->resize_dims_.empty() && !flags_->input_data_list_.empty() &&
      flags_->resize_dims_.size() != flags_->input_data_list_.size()) {
    MS_LOG(ERROR) << "Size of input resizeDims should be equal to size of input inDataPath";
    std::cerr << "Size of input resizeDims should be equal to size of input inDataPath" << std::endl;
    return RET_ERROR;
  }
  if (flags_->device_ != "CPU" && flags_->device_ != "GPU" && flags_->device_ != "NPU") {
    MS_LOG(ERROR) << "Device type:" << flags_->device_ << " is not supported.";
    std::cerr << "Device type:" << flags_->device_ << " is not supported." << std::endl;
    return RET_ERROR;
  }
  if (flags_->time_profiling_ && flags_->perf_profiling_) {
    MS_LOG(INFO) << "time_profiling is enabled, will not run perf_profiling.";
  }
  // get dump data output path
  auto dump_cfg_path = std::getenv(dump::kConfigPath);
  if (dump_cfg_path != nullptr) {
    flags_->dump_tensor_data_ = true;
    if (InitDumpConfigFromJson(dump_cfg_path) != RET_OK) {
      MS_LOG(ERROR) << "parse dump config file failed.";
      return RET_ERROR;
    }
  } else {
    MS_LOG(INFO) << "No MINDSPORE_DUMP_CONFIG in env, don't need to dump data";
  }
  auto status = InitCallbackParameter();
  if (status != RET_OK) {
    MS_LOG(ERROR) << "Init callback Parameter failed.";
    std::cerr << "Init callback Parameter failed." << std::endl;
    return RET_ERROR;
  }
  return RET_OK;
}
// Renders a time-profiling summary table: per-entry average time, share of
// total cost, call count and total time. Column widths adapt to the data.
// NOTE(review): assumes `title` has at least 5 entries and that loop_count_
// and op_cost_total_ are non-zero — confirm at call sites.
int BenchmarkBase::PrintResult(const std::vector<std::string> &title,
const std::map<std::string, std::pair<int, float>> &result) {
std::vector<size_t> columnLenMax(5);
std::vector<std::vector<std::string>> rows;
for (auto &iter : result) {
char stringBuf[5][100] = {};
std::vector<std::string> columns;
size_t len = 0;
len = iter.first.size();
if (len > columnLenMax.at(0)) {
columnLenMax.at(0) = len + 4;
}
columns.push_back(iter.first);
// average cost per loop iteration
len =
snprintf(stringBuf[1], sizeof(stringBuf[1]), "%f", iter.second.second / static_cast<float>(flags_->loop_count_));
if (len > columnLenMax.at(1)) {
columnLenMax.at(1) = len + 4;
}
columns.emplace_back(stringBuf[1]);
// share of the total measured cost
len = snprintf(stringBuf[2], sizeof(stringBuf[2]), "%f", iter.second.second / op_cost_total_);
if (len > columnLenMax.at(2)) {
columnLenMax.at(2) = len + 4;
}
columns.emplace_back(stringBuf[2]);
// call count
len = snprintf(stringBuf[3], sizeof(stringBuf[3]), "%d", iter.second.first);
if (len > columnLenMax.at(3)) {
columnLenMax.at(3) = len + 4;
}
columns.emplace_back(stringBuf[3]);
// total cost
len = snprintf(stringBuf[4], sizeof(stringBuf[4]), "%f", iter.second.second);
if (len > columnLenMax.at(4)) {
columnLenMax.at(4) = len + 4;
}
columns.emplace_back(stringBuf[4]);
rows.push_back(columns);
}
printf("-------------------------------------------------------------------------\n");
for (int i = 0; i < 5; i++) {
auto printBuf = title[i];
if (printBuf.size() > columnLenMax.at(i)) {
columnLenMax.at(i) = printBuf.size();
}
printBuf.resize(columnLenMax.at(i), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
for (auto &row : rows) {
for (int j = 0; j < 5; j++) {
auto printBuf = row[j];
printBuf.resize(columnLenMax.at(j), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
}
return RET_OK;
}
#ifdef ENABLE_ARM64
// Renders the perf-event profiling table (ARM64 only): two hardware-counter
// columns, each shown as a per-loop value (scaled by thread count, /1000)
// and as a share of the run total.
// NOTE(review): assumes `title` has at least 5 entries and op_cost_total_ /
// op_cost2_total_ are non-zero — confirm at call sites.
int BenchmarkBase::PrintPerfResult(const std::vector<std::string> &title,
const std::map<std::string, std::pair<int, struct PerfCount>> &result) {
std::vector<size_t> columnLenMax(5);
std::vector<std::vector<std::string>> rows;
for (auto &iter : result) {
char stringBuf[5][100] = {};
std::vector<std::string> columns;
size_t len = 0;
len = iter.first.size();
if (len > columnLenMax.at(0)) {
columnLenMax.at(0) = len + 4;
}
columns.push_back(iter.first);
// counter 0: per-loop value, normalized by thread count (in thousands)
float tmp = float_t(flags_->num_threads_) * iter.second.second.value[0] / float_t(flags_->loop_count_) / 1000.0f;
len = snprintf(stringBuf[1], sizeof(stringBuf[1]), "%.2f", tmp);
if (len > columnLenMax.at(1)) {
columnLenMax.at(1) = len + 4;
}
columns.emplace_back(stringBuf[1]);
len = snprintf(stringBuf[2], sizeof(stringBuf[2]), "%f", iter.second.second.value[0] / op_cost_total_);
if (len > columnLenMax.at(2)) {
columnLenMax.at(2) = len + 4;
}
columns.emplace_back(stringBuf[2]);
// counter 1: same normalization as counter 0
tmp = float_t(flags_->num_threads_) * iter.second.second.value[1] / float_t(flags_->loop_count_) / 1000.0f;
len = snprintf(stringBuf[3], sizeof(stringBuf[3]), "%.2f", tmp);
if (len > columnLenMax.at(3)) {
columnLenMax.at(3) = len + 4;
}
columns.emplace_back(stringBuf[3]);
len = snprintf(stringBuf[4], sizeof(stringBuf[4]), "%f", iter.second.second.value[1] / op_cost2_total_);
if (len > columnLenMax.at(4)) {
columnLenMax.at(4) = len + 4;
}
columns.emplace_back(stringBuf[4]);
rows.push_back(columns);
}
printf("-------------------------------------------------------------------------\n");
for (int i = 0; i < 5; i++) {
auto printBuf = title[i];
if (printBuf.size() > columnLenMax.at(i)) {
columnLenMax.at(i) = printBuf.size();
}
printBuf.resize(columnLenMax.at(i), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
for (auto &row : rows) {
for (int j = 0; j < 5; j++) {
auto printBuf = row[j];
printBuf.resize(columnLenMax.at(j), ' ');
printf("%s\t", printBuf.c_str());
}
printf("\n");
}
return RET_OK;
}
#endif
#ifdef SUPPORT_NNIE
// Initializes the HiSilicon SVP/MPP runtime for NNIE targets: resets any
// previous SYS/VB state, configures the video-buffer pool, then brings VB
// and SYS back up. Returns RET_OK on success.
int SvpSysInit() {
HI_S32 ret = HI_SUCCESS;
VB_CONFIG_S struVbConf;
// Tear down any leftover state from a previous run before reconfiguring.
HI_MPI_SYS_Exit();
HI_MPI_VB_Exit();
memset(&struVbConf, 0, sizeof(VB_CONFIG_S));
// NOTE(review): u32MaxPoolCnt is 2 but only astCommPool[1] is configured;
// pool 0 stays zeroed from the memset — confirm this is intentional.
struVbConf.u32MaxPoolCnt = 2;
struVbConf.astCommPool[1].u64BlkSize = 768 * 576 * 2;
struVbConf.astCommPool[1].u32BlkCnt = 1;
ret = HI_MPI_VB_SetConfig((const VB_CONFIG_S *)&struVbConf);
if (HI_SUCCESS != ret) {
MS_LOG(ERROR) << "Error:HI_MPI_VB_SetConf failed!";
return RET_ERROR;
}
ret = HI_MPI_VB_Init();
if (HI_SUCCESS != ret) {
MS_LOG(ERROR) << "Error:HI_MPI_VB_Init failed!";
return RET_ERROR;
}
ret = HI_MPI_SYS_Init();
if (HI_SUCCESS != ret) {
MS_LOG(ERROR) << "Error:HI_MPI_SYS_Init failed!";
return RET_ERROR;
}
return RET_OK;
}
// Shuts down the HiSilicon MPP system and video-buffer modules.
// Returns RET_OK on success, RET_ERROR on the first failing call.
int SvpSysExit() {
  if (HI_MPI_SYS_Exit() != HI_SUCCESS) {
    MS_LOG(ERROR) << "Error:HI_MPI_SYS_Exit failed!";
    return RET_ERROR;
  }
  if (HI_MPI_VB_Exit() != HI_SUCCESS) {
    MS_LOG(ERROR) << "Error:HI_MPI_VB_Exit failed!";
    return RET_ERROR;
  }
  return RET_OK;
}
#endif
// Releases the owned calibration tensors and, on NNIE targets, shuts down
// the SVP runtime.
BenchmarkBase::~BenchmarkBase() {
  for (auto &entry : benchmark_data_) {
    delete entry.second;
  }
  benchmark_data_.clear();
#ifdef SUPPORT_NNIE
  SvpSysExit();
#endif
}
} // namespace lite
} // namespace mindspore

View File

@ -0,0 +1,316 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINNIE_BENCHMARK_BENCHMARK_BASE_H_
#define MINNIE_BENCHMARK_BENCHMARK_BASE_H_
#include <getopt.h>
#include <signal.h>
#include <random>
#include <unordered_map>
#include <fstream>
#include <iostream>
#include <map>
#include <cmath>
#include <string>
#include <vector>
#include <memory>
#include <cfloat>
#include <utility>
#include <nlohmann/json.hpp>
#include "include/model.h"
#include "tools/common/flag_parser.h"
#include "src/common/file_utils.h"
#include "src/common/utils.h"
#include "ir/dtype/type_id.h"
#include "schema/model_generated.h"
namespace mindspore::lite {
// How -inDataFile contents are interpreted: raw image vs. raw binary blob.
enum MS_API InDataType { kImage = 0, kBinary = 1 };
// Tolerances used by CompareData: an element passes when its absolute error
// is within absoluteTolerance + relativeTolerance * |expected|.
constexpr float relativeTolerance = 1e-5;
constexpr float absoluteTolerance = 1e-8;
// Maximum number of string-tensor entries echoed to stdout.
constexpr int kNumPrintMin = 5;
// Separators used when parsing the command-line flag values.
constexpr const char *DELIM_COLON = ":";
constexpr const char *DELIM_COMMA = ",";
constexpr const char *DELIM_SLASH = "/";
extern const std::unordered_map<int, std::string> TYPE_ID_MAP;
extern const std::unordered_map<schema::Format, std::string> TENSOR_FORMAT_MAP;
// Keys of the dump-configuration JSON referenced by MINDSPORE_DUMP_CONFIG.
namespace dump {
constexpr auto kConfigPath = "MINDSPORE_DUMP_CONFIG";
constexpr auto kSettings = "common_dump_settings";
constexpr auto kMode = "dump_mode";
constexpr auto kPath = "path";
constexpr auto kNetName = "net_name";
constexpr auto kInputOutput = "input_output";
constexpr auto kKernels = "kernels";
}  // namespace dump
#ifdef ENABLE_ARM64
// Raw layout returned by a perf_event read of a 2-counter group.
struct PerfResult {
int64_t nr;
struct {
int64_t value;
int64_t id;
} values[2];
};
// Accumulated values of the two perf counters for one op.
struct PerfCount {
int64_t value[2];
};
#endif
struct MS_API CheckTensor {
CheckTensor(const std::vector<size_t> &shape, const std::vector<float> &data,
const std::vector<std::string> &strings_data = {""}) {
this->shape = shape;
this->data = data;
this->strings_data = strings_data;
}
std::vector<size_t> shape;
std::vector<float> data;
std::vector<std::string> strings_data;
};
// Command-line flags for the benchmark tool. The constructor registers every
// flag with the shared FlagParser; the public fields below hold the parsed
// values plus the lists derived by InitInputDataList/InitResizeDimsList.
class MS_API BenchmarkFlags : public virtual FlagParser {
public:
BenchmarkFlags() {
// common
AddFlag(&BenchmarkFlags::model_file_, "modelFile", "Input model file", "");
AddFlag(&BenchmarkFlags::in_data_file_, "inDataFile", "Input data file, if not set, use random input", "");
AddFlag(&BenchmarkFlags::device_, "device", "CPU | GPU | NPU", "CPU");
AddFlag(&BenchmarkFlags::cpu_bind_mode_, "cpuBindMode",
"Input 0 for NO_BIND, 1 for HIGHER_CPU, 2 for MID_CPU, default value: 1", 1);
// MarkPerformance
AddFlag(&BenchmarkFlags::loop_count_, "loopCount", "Run loop count", 10);
AddFlag(&BenchmarkFlags::num_threads_, "numThreads", "Run threads number", 2);
AddFlag(&BenchmarkFlags::enable_fp16_, "enableFp16", "Enable float16", false);
AddFlag(&BenchmarkFlags::enable_parallel_, "enableParallel", "Enable subgraph parallel : true | false", false);
AddFlag(&BenchmarkFlags::warm_up_loop_count_, "warmUpLoopCount", "Run warm up loop", 3);
AddFlag(&BenchmarkFlags::time_profiling_, "timeProfiling", "Run time profiling", false);
AddFlag(&BenchmarkFlags::perf_profiling_, "perfProfiling",
"Perf event profiling(only instructions statics enabled currently)", false);
AddFlag(&BenchmarkFlags::perf_event_, "perfEvent", "CYCLE|CACHE|STALL", "CYCLE");
// MarkAccuracy
AddFlag(&BenchmarkFlags::benchmark_data_file_, "benchmarkDataFile", "Benchmark data file path", "");
AddFlag(&BenchmarkFlags::benchmark_data_type_, "benchmarkDataType",
"Benchmark data type. FLOAT | INT32 | INT8 | UINT8", "FLOAT");
AddFlag(&BenchmarkFlags::accuracy_threshold_, "accuracyThreshold", "Threshold of accuracy", 0.5);
AddFlag(&BenchmarkFlags::resize_dims_in_, "inputShapes",
"Shape of input data, the format should be NHWC. e.g. 1,32,32,32:1,1,32,32,1", "");
}
~BenchmarkFlags() override = default;
// Splits the comma-separated in_data_file_ into input_data_list_.
void InitInputDataList();
// Parses resize_dims_in_ ("n,h,w,c:...") into resize_dims_.
void InitResizeDimsList();
public:
// common
std::string model_file_;
std::string in_data_file_;
std::vector<std::string> input_data_list_;
InDataType in_data_type_ = kBinary;
std::string in_data_type_in_ = "bin";
int cpu_bind_mode_ = 1;
// MarkPerformance
int loop_count_ = 10;
int num_threads_ = 2;
bool enable_fp16_ = false;
bool enable_parallel_ = false;
int warm_up_loop_count_ = 3;
// MarkAccuracy
std::string benchmark_data_file_;
std::string benchmark_data_type_ = "FLOAT";
float accuracy_threshold_ = 0.5;
// Resize
std::string resize_dims_in_;
std::vector<std::vector<int>> resize_dims_;
std::string device_ = "CPU";
bool time_profiling_ = false;
bool perf_profiling_ = false;
std::string perf_event_ = "CYCLE";
// set from the MINDSPORE_DUMP_CONFIG env var, not from a flag
bool dump_tensor_data_ = false;
bool print_tensor_data_ = false;
};
// Abstract benchmark driver. Owns flag validation, calibration-data loading
// and result comparison; subclasses bind a concrete runtime (LiteSession or
// the unified C++ API) through the pure-virtual hooks below.
class MS_API BenchmarkBase {
 public:
  explicit BenchmarkBase(BenchmarkFlags *flags) : flags_(flags) {}

  virtual ~BenchmarkBase();

  // Validates flags, prints the effective configuration and prepares
  // callbacks and the dump configuration. Must be called before RunBenchmark.
  int Init();
  virtual int RunBenchmark() = 0;

 protected:
  // Fills model inputs from file(s) or random data, depending on flags.
  int LoadInput();
  virtual int GenerateInputData() = 0;
  // Fills `size` bytes at `data` with random values suited to `data_type`.
  int GenerateRandomData(size_t size, void *data, int data_type);
  virtual int ReadInputFile() = 0;
  // Reads the golden-output file; per-tensor payloads go to ReadTensorData.
  int ReadCalibData();
  virtual int ReadTensorData(std::ifstream &in_file_stream, const std::string &tensor_name,
                             const std::vector<size_t> &dims) = 0;
  virtual int CompareOutput() = 0;
  // Compares a string tensor against the calibration strings for `name`.
  int CompareStringData(const std::string &name, tensor::MSTensor *tensor);
  int InitDumpConfigFromJson(char *path);
  // Installs at most one callback pair, chosen by flag priority.
  int InitCallbackParameter();
  virtual int InitTimeProfilingCallbackParameter() = 0;
  virtual int InitPerfProfilingCallbackParameter() = 0;
  virtual int InitDumpTensorDataCallbackParameter() = 0;
  virtual int InitPrintTensorDataCallbackParameter() = 0;
  int PrintResult(const std::vector<std::string> &title, const std::map<std::string, std::pair<int, float>> &result);
#ifdef ENABLE_ARM64
  int PrintPerfResult(const std::vector<std::string> &title,
                      const std::map<std::string, std::pair<int, struct PerfCount>> &result);
#endif

  // Compares one output tensor element-wise against the calibration baseline
  // for `nodeName` (tensor data must be converted to the calib layout first).
  // Returns the mean bias, or RET_ERROR cast to float on shape mismatch,
  // NaN/Inf output, or an unknown node.
  template <typename T, typename ST>
  float CompareData(const std::string &nodeName, const std::vector<ST> &msShape, const void *tensor_data) {
    const T *msTensorData = static_cast<const T *>(tensor_data);
    auto iter = this->benchmark_data_.find(nodeName);
    if (iter == this->benchmark_data_.end()) {
      // Fixed: the original `MS_LOG(INFO) << "%s is not ...", nodeName.c_str();`
      // used the comma operator — it logged the literal "%s" and discarded
      // the node name.
      MS_LOG(INFO) << nodeName << " is not in Source Model output";
      return RET_ERROR;
    }
    std::vector<size_t> castedMSShape;
    size_t shapeSize = 1;
    for (int64_t dim : msShape) {
      castedMSShape.push_back(size_t(dim));
      shapeSize *= dim;
    }
    CheckTensor *calibTensor = iter->second;
    if (calibTensor->shape != castedMSShape) {
      std::ostringstream oss;
      oss << "Shape of mslite output(";
      for (auto dim : castedMSShape) {
        oss << dim << ",";
      }
      oss << ") and shape source model output(";
      for (auto dim : calibTensor->shape) {
        oss << dim << ",";
      }
      oss << ") are different";
      std::cerr << oss.str() << std::endl;
      MS_LOG(ERROR) << oss.str().c_str();
      return RET_ERROR;
    }
    size_t errorCount = 0;
    float meanError = 0;
    std::cout << "Data of node " << nodeName << " : ";
    for (size_t j = 0; j < shapeSize; j++) {
      if (j < 50) {  // echo only the first 50 elements
        std::cout << static_cast<float>(msTensorData[j]) << " ";
      }
      if (std::isnan(msTensorData[j]) || std::isinf(msTensorData[j])) {
        std::cerr << "Output tensor has nan or inf data, compare fail" << std::endl;
        MS_LOG(ERROR) << "Output tensor has nan or inf data, compare fail";
        return RET_ERROR;
      }
      auto tolerance = absoluteTolerance + relativeTolerance * fabs(calibTensor->data.at(j));
      auto absoluteError = std::fabs(msTensorData[j] - calibTensor->data.at(j));
      if (absoluteError > tolerance) {
        if (fabs(calibTensor->data.at(j) - 0.0f) < FLT_EPSILON) {
          // Expected value is (near) zero: relative error is meaningless,
          // use a fixed absolute threshold instead.
          if (absoluteError > 1e-5) {
            meanError += absoluteError;
            errorCount++;
          } else {
            continue;
          }
        } else {
          // just assume that atol = rtol
          meanError += absoluteError / (fabs(calibTensor->data.at(j)) + FLT_MIN);
          errorCount++;
        }
      }
    }
    std::cout << std::endl;
    if (meanError > 0.0f) {
      meanError /= errorCount;
    }
    if (meanError <= 0.0000001) {
      std::cout << "Mean bias of node/tensor " << nodeName << " : 0%" << std::endl;
    } else {
      std::cout << "Mean bias of node/tensor " << nodeName << " : " << meanError * 100 << "%" << std::endl;
    }
    return meanError;
  }

  // Fills `size` bytes at `data` with values drawn from `distribution`,
  // narrowed to element type T.
  template <typename T, typename Distribution>
  void FillInputData(int size, void *data, Distribution distribution) {
    MS_ASSERT(data != nullptr);
    int elements_num = size / sizeof(T);
    (void)std::generate_n(static_cast<T *>(data), elements_num,
                          [&]() { return static_cast<T>(distribution(random_engine_)); });
  }

  int CheckThreadNumValid();

 protected:
  BenchmarkFlags *flags_;  // not owned
  // Golden outputs keyed by node/tensor name; owned, freed in the destructor.
  std::unordered_map<std::string, CheckTensor *> benchmark_data_;
  std::unordered_map<std::string, int> data_type_map_{
      {"FLOAT", kNumberTypeFloat}, {"INT8", kNumberTypeInt8}, {"INT32", kNumberTypeInt32}, {"UINT8", kNumberTypeUInt8}};
  int msCalibDataType = kNumberTypeFloat;
  // callback parameters
  uint64_t op_begin_ = 0;
  int op_call_times_total_ = 0;
  float op_cost_total_ = 0.0f;
  std::map<std::string, std::pair<int, float>> op_times_by_type_;
  std::map<std::string, std::pair<int, float>> op_times_by_name_;
  // dump data
  nlohmann::json dump_cfg_json_;
  std::string dump_file_output_dir_;
#ifdef ENABLE_ARM64
  int perf_fd = 0;
  int perf_fd2 = 0;
  float op_cost2_total_ = 0.0f;
  std::map<std::string, std::pair<int, struct PerfCount>> op_perf_by_type_;
  std::map<std::string, std::pair<int, struct PerfCount>> op_perf_by_name_;
#endif
  std::mt19937 random_engine_;
};
} // namespace mindspore::lite
#endif // MINNIE_BENCHMARK_BENCHMARK_BASE_H_

View File

@ -0,0 +1,828 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tools/benchmark/benchmark_unified_api.h"
#define __STDC_FORMAT_MACROS
#include <cinttypes>
#undef __STDC_FORMAT_MACROS
#include <algorithm>
#include <utility>
#include <functional>
#include "include/context.h"
#include "include/ms_tensor.h"
#include "include/version.h"
#include "schema/model_generated.h"
#include "src/common/common.h"
#include "src/tensor.h"
#ifdef ENABLE_ARM64
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <asm/unistd.h>
#include <unistd.h>
#endif
#ifdef SUPPORT_NNIE
#include "include/hi_common.h"
#include "include/hi_comm_vb.h"
#include "include/mpi_sys.h"
#include "include/mpi_vb.h"
#endif
namespace mindspore {
namespace lite {
int BenchmarkUnifiedApi::GenerateInputData() {
  // Populate every model input with random values of the tensor's own dtype.
  // String tensors are rejected: there is no random generator for them here.
  for (auto in_tensor : ms_inputs_for_api_) {
    MS_ASSERT(in_tensor != nullptr);
    auto *buf = in_tensor.MutableData();
    if (buf == nullptr) {
      MS_LOG(ERROR) << "MallocData for inTensor failed";
      return RET_ERROR;
    }
    if (static_cast<int>(in_tensor.DataType()) == kObjectTypeString) {
      std::cerr << "Unsupported kObjectTypeString:" << std::endl;
      MS_LOG(ERROR) << "Unsupported kObjectTypeString:";
      return RET_ERROR;
      // status = StringsToMSTensor({"you're the best."}, tensor);
    }
    int status = GenerateRandomData(in_tensor.DataSize(), buf, static_cast<int>(in_tensor.DataType()));
    if (status != RET_OK) {
      std::cerr << "GenerateRandomData for inTensor failed: " << status << std::endl;
      MS_LOG(ERROR) << "GenerateRandomData for inTensor failed:" << status;
      return status;
    }
  }
  return RET_OK;
}
int BenchmarkUnifiedApi::ReadInputFile() {
if (ms_inputs_for_api_.empty()) {
return RET_OK;
}
if (this->flags_->in_data_type_ == kImage) {
MS_LOG(ERROR) << "Not supported image input";
return RET_ERROR;
} else {
for (size_t i = 0; i < flags_->input_data_list_.size(); i++) {
auto cur_tensor = ms_inputs_for_api_.at(i);
MS_ASSERT(cur_tensor != nullptr);
size_t size;
char *bin_buf = ReadFile(flags_->input_data_list_[i].c_str(), &size);
if (bin_buf == nullptr) {
MS_LOG(ERROR) << "ReadFile return nullptr";
return RET_ERROR;
}
if (static_cast<int>(cur_tensor.DataType()) == kObjectTypeString) {
std::cerr << "Unsupported kObjectTypeString:" << std::endl;
MS_LOG(ERROR) << "Unsupported kObjectTypeString:";
return RET_ERROR;
} else {
auto tensor_data_size = cur_tensor.DataSize();
if (size != tensor_data_size) {
std::cerr << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size
<< std::endl;
MS_LOG(ERROR) << "Input binary file size error, required: " << tensor_data_size << ", in fact: " << size;
delete[] bin_buf;
return RET_ERROR;
}
auto input_data = cur_tensor.MutableData();
if (input_data == nullptr) {
MS_LOG(ERROR) << "input_data is nullptr.";
return RET_ERROR;
}
memcpy(input_data, bin_buf, tensor_data_size);
}
delete[] bin_buf;
}
}
return RET_OK;
}
int BenchmarkUnifiedApi::ReadTensorData(std::ifstream &in_file_stream, const std::string &tensor_name,
                                        const std::vector<size_t> &dims) {
  // Parse one calibration record from the open calib-data stream and cache it
  // in benchmark_data_ keyed by tensor name.
  std::string line;
  // One data line is consumed unconditionally, so the stream position advances
  // even when the tensor is already cached and we return early below.
  getline(in_file_stream, line);
  std::stringstream line_stream(line);
  if (this->benchmark_data_.find(tensor_name) != this->benchmark_data_.end()) {
    return RET_OK;
  }
  mindspore::MSTensor tensor = GetMSTensorByNameOrShape(tensor_name, dims);
  if (tensor == nullptr) {
    MS_LOG(ERROR) << "Get tensor failed, tensor name: " << tensor_name;
    return RET_ERROR;
  }
  std::vector<float> data;
  std::vector<std::string> strings_data;
  // Total element count implied by the recorded dims.
  size_t shape_size = std::accumulate(dims.begin(), dims.end(), 1, std::multiplies<size_t>());
  if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
    // String tensors: one line per element; the line read above is element 0.
    strings_data.push_back(line);
    for (size_t i = 1; i < shape_size; i++) {
      getline(in_file_stream, line);
      strings_data.push_back(line);
    }
  } else {
    // Numeric tensors: all values sit whitespace-separated on the single line.
    for (size_t i = 0; i < shape_size; i++) {
      float tmp_data;
      line_stream >> tmp_data;
      data.push_back(tmp_data);
    }
  }
  auto *check_tensor = new (std::nothrow) CheckTensor(dims, data, strings_data);
  if (check_tensor == nullptr) {
    MS_LOG(ERROR) << "New CheckTensor failed, tensor name: " << tensor_name;
    return RET_ERROR;
  }
  // Ownership transfers to benchmark_data_; entries are deleted in RunBenchmark
  // after MarkAccuracy finishes.
  this->benchmark_data_.insert(std::make_pair(tensor_name, check_tensor));
  return RET_OK;
}
void BenchmarkUnifiedApi::InitMSContext(const std::shared_ptr<mindspore::Context> &context) {
  // Thread and affinity settings come straight from the command-line flags.
  context->SetThreadNum(flags_->num_threads_);
  context->SetEnableParallel(flags_->enable_parallel_);
  context->SetThreadAffinity(flags_->cpu_bind_mode_);

  // CPU is always registered and stays first in the device list; GPU/NPU are
  // appended behind it when requested via --device.
  auto &devices = context->MutableDeviceInfo();
  auto cpu_info = std::make_shared<CPUDeviceInfo>();
  cpu_info->SetEnableFP16(flags_->enable_fp16_);
  devices.push_back(cpu_info);
  if (flags_->device_ == "GPU") {
    auto gpu_info = std::make_shared<MaliGPUDeviceInfo>();
    gpu_info->SetEnableFP16(flags_->enable_fp16_);
    devices.push_back(gpu_info);
  }
  if (flags_->device_ == "NPU") {
    auto npu_info = std::make_shared<KirinNPUDeviceInfo>();
    npu_info->SetFrequency(3);  // NOTE(review): 3 presumably selects high frequency — confirm enum meaning
    devices.push_back(npu_info);
  }
}
int BenchmarkUnifiedApi::CompareOutput() {
std::cout << "================ Comparing Output data ================" << std::endl;
float total_bias = 0;
int total_size = 0;
for (const auto &calib_tensor : benchmark_data_) {
std::string node_or_tensor_name = calib_tensor.first;
mindspore::MSTensor tensor = GetMSTensorByNameOrShape(node_or_tensor_name, calib_tensor.second->shape);
if (tensor == nullptr) {
MS_LOG(ERROR) << "Get tensor failed, tensor name: " << node_or_tensor_name;
return RET_ERROR;
}
int ret;
if (static_cast<int>(tensor.DataType()) == kObjectTypeString) {
std::cerr << "Unsupported kObjectTypeString:" << std::endl;
MS_LOG(ERROR) << "Unsupported kObjectTypeString:";
return RET_ERROR;
// ret = CompareStringData(node_or_tensor_name, tensor);
} else {
ret = CompareDataGetTotalBiasAndSize(node_or_tensor_name, &tensor, &total_bias, &total_size);
}
if (ret != RET_OK) {
MS_LOG(ERROR) << "Error in CompareData";
std::cerr << "Error in CompareData" << std::endl;
std::cout << "=======================================================" << std::endl << std::endl;
return ret;
}
}
float mean_bias;
if (total_size != 0) {
mean_bias = total_bias / float_t(total_size) * 100;
} else {
mean_bias = 0;
}
std::cout << "Mean bias of all nodes/tensors: " << mean_bias << "%" << std::endl;
std::cout << "=======================================================" << std::endl << std::endl;
if (mean_bias > this->flags_->accuracy_threshold_) {
MS_LOG(ERROR) << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%";
std::cerr << "Mean bias of all nodes/tensors is too big: " << mean_bias << "%" << std::endl;
return RET_ERROR;
}
return RET_OK;
}
mindspore::MSTensor BenchmarkUnifiedApi::GetMSTensorByNodeShape(const std::vector<size_t> &node_shape) {
  // Resolve a model output tensor by matching its shape against `node_shape`.
  // Used as a last resort when name lookup fails.
  std::vector<mindspore::MSTensor> match_tensors;
  std::vector<int64_t> shape_vector = ConverterToInt64Vector<size_t>(node_shape);
  auto tensors = ms_model_.GetOutputs();
  for (auto &out_tensor_pair : tensors) {
    if (out_tensor_pair.Shape() == shape_vector) {
      match_tensors.emplace_back(out_tensor_pair);
    }
  }
  // fix: front() on an empty vector was undefined behavior when no output matched.
  if (match_tensors.empty()) {
    MS_LOG(ERROR) << "Cannot find output tensor with the given shape.";
    return mindspore::MSTensor();  // null tensor; callers already check `== nullptr`
  }
  if (match_tensors.size() > 1) {
    MS_LOG(WARNING) << "More than one output tensor matches the shape; using the first one.";
  }
  return match_tensors.front();
}
mindspore::MSTensor BenchmarkUnifiedApi::GetMSTensorByNameOrShape(const std::string &node_or_tensor_name,
                                                                  const std::vector<size_t> &dims) {
  // Resolution order: node name (only when it has exactly one output tensor),
  // then tensor name, then plain shape matching as a last resort.
  auto node_outputs = ms_model_.GetOutputsByNodeName(node_or_tensor_name);
  if (node_outputs.size() == 1) {
    return node_outputs.front();
  }
  MS_LOG(INFO) << "Cannot find output node: " << node_or_tensor_name
               << " or node has more than one output tensor, switch to GetOutputByTensorName";
  auto tensor = ms_model_.GetOutputByTensorName(node_or_tensor_name);
  if (tensor == nullptr) {
    return GetMSTensorByNodeShape(dims);
  }
  return tensor;
}
int BenchmarkUnifiedApi::CompareDataGetTotalBiasAndSize(const std::string &name, mindspore::MSTensor *tensor,
                                                        float *total_bias, int *total_size) {
  // Compare one output tensor against its calibration record (via the typed
  // CompareData template) and accumulate its bias into *total_bias, bumping
  // *total_size by one so the caller can compute the mean.
  float bias = 0;
  auto mutableData = tensor->MutableData();
  if (mutableData == nullptr) {
    MS_LOG(ERROR) << "mutableData is nullptr.";
    return RET_ERROR;
  }
  // Dispatch on the runtime dtype; each branch reinterprets the raw buffer
  // with the matching element type.
  switch (static_cast<int>(tensor->DataType())) {
    case TypeId::kNumberTypeFloat:
    case TypeId::kNumberTypeFloat32: {
      bias = CompareData<float>(name, tensor->Shape(), mutableData);
      break;
    }
    case TypeId::kNumberTypeInt8: {
      bias = CompareData<int8_t>(name, tensor->Shape(), mutableData);
      break;
    }
    case TypeId::kNumberTypeUInt8: {
      bias = CompareData<uint8_t>(name, tensor->Shape(), mutableData);
      break;
    }
    case TypeId::kNumberTypeInt32: {
      bias = CompareData<int32_t>(name, tensor->Shape(), mutableData);
      break;
    }
    case TypeId::kNumberTypeInt16: {
      bias = CompareData<int16_t>(name, tensor->Shape(), mutableData);
      break;
    }
    case TypeId::kNumberTypeBool: {
      bias = CompareData<bool>(name, tensor->Shape(), mutableData);
      break;
    }
    default:
      MS_LOG(ERROR) << "Datatype " << static_cast<int>(tensor->DataType()) << " is not supported.";
      return RET_ERROR;
  }
  // CompareData reports a negative bias to signal a comparison failure.
  if (bias < 0) {
    MS_LOG(ERROR) << "CompareData failed, name: " << name;
    return RET_ERROR;
  }
  *total_bias += bias;
  *total_size += 1;
  return RET_OK;
}
int BenchmarkUnifiedApi::MarkPerformance() {
  // Warm up, then time `loop_count_` inference runs and report min/max/avg
  // latency plus optional per-op time or perf-counter profiles.
  MS_LOG(INFO) << "Running warm up loops...";
  std::cout << "Running warm up loops..." << std::endl;
  std::vector<MSTensor> outputs;
  for (int i = 0; i < flags_->warm_up_loop_count_; i++) {
    auto status = ms_model_.Predict(ms_inputs_for_api_, &outputs);
    if (status != kSuccess) {
      MS_LOG(ERROR) << "Inference error ";
      std::cerr << "Inference error " << std::endl;
      return RET_ERROR;
    }
  }

  MS_LOG(INFO) << "Running benchmark loops...";
  std::cout << "Running benchmark loops..." << std::endl;
  // fix: time_min was seeded with 1000000 us, silently capping the reported
  // MinRunTime at 1 second for models slower than that.
  uint64_t time_min = UINT64_MAX;
  uint64_t time_max = 0;
  uint64_t time_avg = 0;
  for (int i = 0; i < flags_->loop_count_; i++) {
    auto inputs = ms_model_.GetInputs();
    for (auto tensor : inputs) {
      tensor.MutableData();  // prepare data
    }
    auto start = GetTimeUs();
    auto status = ms_model_.Predict(ms_inputs_for_api_, &outputs, ms_before_call_back_, ms_after_call_back_);
    if (status != kSuccess) {
      MS_LOG(ERROR) << "Inference error ";
      std::cerr << "Inference error ";
      return RET_ERROR;
    }
    auto end = GetTimeUs();
    auto time = end - start;
    time_min = std::min(time_min, time);
    time_max = std::max(time_max, time);
    time_avg += time;
  }

  if (flags_->time_profiling_) {
    const std::vector<std::string> per_op_name = {"opName", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
    const std::vector<std::string> per_op_type = {"opType", "avg(ms)", "percent", "calledTimes", "opTotalTime"};
    PrintResult(per_op_name, op_times_by_name_);
    PrintResult(per_op_type, op_times_by_type_);
#ifdef ENABLE_ARM64
  } else if (flags_->perf_profiling_) {
    if (flags_->perf_event_ == "CACHE") {
      const std::vector<std::string> per_op_name = {"opName", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
      const std::vector<std::string> per_op_type = {"opType", "cache ref(k)", "cache ref(%)", "miss(k)", "miss(%)"};
      PrintPerfResult(per_op_name, op_perf_by_name_);
      PrintPerfResult(per_op_type, op_perf_by_type_);
    } else if (flags_->perf_event_ == "STALL") {
      const std::vector<std::string> per_op_name = {"opName", "frontend(k)", "frontend(%)", "backendend(k)",
                                                    "backendend(%)"};
      const std::vector<std::string> per_op_type = {"opType", "frontend(k)", "frontend(%)", "backendend(k)",
                                                    "backendend(%)"};
      PrintPerfResult(per_op_name, op_perf_by_name_);
      PrintPerfResult(per_op_type, op_perf_by_type_);
    } else {
      const std::vector<std::string> per_op_name = {"opName", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
      const std::vector<std::string> per_op_type = {"opType", "cycles(k)", "cycles(%)", "ins(k)", "ins(%)"};
      PrintPerfResult(per_op_name, op_perf_by_name_);
      PrintPerfResult(per_op_type, op_perf_by_type_);
    }
#endif
  }

  // Print the latency summary only when at least one timed loop actually ran.
  if (flags_->loop_count_ > 0) {
    time_avg /= flags_->loop_count_;
    MS_LOG(INFO) << "Model = " << flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
                 << ", NumThreads = " << flags_->num_threads_ << ", MinRunTime = " << time_min / 1000.0f
                 << ", MaxRuntime = " << time_max / 1000.0f << ", AvgRunTime = " << time_avg / 1000.0f;
    printf("Model = %s, NumThreads = %d, MinRunTime = %f ms, MaxRuntime = %f ms, AvgRunTime = %f ms\n",
           flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1).c_str(), flags_->num_threads_,
           time_min / 1000.0f, time_max / 1000.0f, time_avg / 1000.0f);
  }
  return RET_OK;
}
int BenchmarkUnifiedApi::MarkAccuracy() {
  // Accuracy path: echo the inputs, run one inference with the configured
  // callbacks, then compare outputs against the calibration data.
  MS_LOG(INFO) << "MarkAccuracy";
  std::cout << "MarkAccuracy" << std::endl;
  int status = PrintInputData();
  if (status != RET_OK) {
    MS_LOG(ERROR) << "PrintInputData error " << status;
    std::cerr << "PrintInputData error " << status << std::endl;
    return status;
  }
  std::vector<MSTensor> outputs;
  if (ms_model_.Predict(ms_inputs_for_api_, &outputs, ms_before_call_back_, ms_after_call_back_) != kSuccess) {
    MS_LOG(ERROR) << "Inference error ";
    std::cerr << "Inference error " << std::endl;
    return RET_ERROR;
  }
  status = ReadCalibData();
  if (status != RET_OK) {
    MS_LOG(ERROR) << "Read calib data error " << status;
    std::cerr << "Read calib data error " << status << std::endl;
    return status;
  }
  status = CompareOutput();
  if (status != RET_OK) {
    MS_LOG(ERROR) << "Compare output error " << status;
    std::cerr << "Compare output error " << status << std::endl;
    return status;
  }
  return RET_OK;
}
int BenchmarkUnifiedApi::PrintInputData() {
  // Echo the first (up to 20) elements of every input tensor to stdout so a
  // failing accuracy run can be reproduced by eye.
  for (size_t i = 0; i < ms_inputs_for_api_.size(); i++) {
    auto input = ms_inputs_for_api_[i];
    MS_ASSERT(input != nullptr);
    auto tensor_data_type = static_cast<int>(input.DataType());
    std::cout << "InData" << i << ": ";
    if (tensor_data_type == TypeId::kObjectTypeString) {
      std::cerr << "Unsupported kObjectTypeString:" << std::endl;
      MS_LOG(ERROR) << "Unsupported kObjectTypeString:";
      return RET_ERROR;
    }
    size_t print_num = std::min(static_cast<int>(input.ElementNum()), 20);
    const void *in_data = input.MutableData();
    if (in_data == nullptr) {
      MS_LOG(ERROR) << "in_data is nullptr.";
      return RET_ERROR;
    }
    for (size_t j = 0; j < print_num; j++) {
      switch (tensor_data_type) {
        case TypeId::kNumberTypeFloat:
        case TypeId::kNumberTypeFloat32:
          std::cout << static_cast<const float *>(in_data)[j] << " ";
          break;
        case TypeId::kNumberTypeInt8:
          std::cout << static_cast<const int8_t *>(in_data)[j] << " ";
          break;
        case TypeId::kNumberTypeUInt8:
          std::cout << static_cast<const uint8_t *>(in_data)[j] << " ";
          break;
        case TypeId::kNumberTypeInt32:
          std::cout << static_cast<const int32_t *>(in_data)[j] << " ";
          break;
        case TypeId::kNumberTypeInt64:
          std::cout << static_cast<const int64_t *>(in_data)[j] << " ";
          break;
        case TypeId::kNumberTypeBool:
          std::cout << static_cast<const bool *>(in_data)[j] << " ";
          break;
        default:
          MS_LOG(ERROR) << "Datatype: " << tensor_data_type << " is not supported.";
          return RET_ERROR;
      }
    }
    std::cout << std::endl;
  }
  return RET_OK;
}
int BenchmarkUnifiedApi::RunBenchmark() {
auto start_prepare_time = GetTimeUs();
// Load graph
std::string model_name = flags_->model_file_.substr(flags_->model_file_.find_last_of(DELIM_SLASH) + 1);
MS_LOG(INFO) << "start reading model file";
std::cout << "start reading model file" << std::endl;
size_t size = 0;
char *graph_buf = ReadFile(flags_->model_file_.c_str(), &size);
if (graph_buf == nullptr) {
MS_LOG(ERROR) << "Read model file failed while running " << model_name.c_str();
std::cerr << "Read model file failed while running " << model_name.c_str() << std::endl;
return RET_ERROR;
}
auto context = std::make_shared<mindspore::Context>();
if (context == nullptr) {
MS_LOG(ERROR) << "New context failed while running " << model_name.c_str();
std::cerr << "New context failed while running " << model_name.c_str() << std::endl;
return RET_ERROR;
}
(void)InitMSContext(context);
auto ret = ms_model_.Build(graph_buf, size, kMindIR, context);
if (ret != kSuccess) {
MS_LOG(ERROR) << "ms_model_.Build failed while running ", model_name.c_str();
std::cout << "ms_model_.Build failed while running ", model_name.c_str();
return RET_ERROR;
}
if (!flags_->resize_dims_.empty()) {
std::vector<std::vector<int64_t>> resize_dims;
(void)std::transform(flags_->resize_dims_.begin(), flags_->resize_dims_.end(), std::back_inserter(resize_dims),
[&](auto &shapes) { return this->ConverterToInt64Vector<int>(shapes); });
ret = ms_model_.Resize(ms_model_.GetInputs(), resize_dims);
if (ret != kSuccess) {
MS_LOG(ERROR) << "Input tensor resize failed.";
std::cout << "Input tensor resize failed.";
return RET_ERROR;
}
}
ms_inputs_for_api_ = ms_model_.GetInputs();
auto end_prepare_time = GetTimeUs();
MS_LOG(INFO) << "PrepareTime = " << (end_prepare_time - start_prepare_time) / 1000 << " ms";
std::cout << "PrepareTime = " << (end_prepare_time - start_prepare_time) / 1000 << " ms" << std::endl;
// Load input
MS_LOG(INFO) << "start generate input data";
auto status = LoadInput();
if (status != 0) {
MS_LOG(ERROR) << "Generate input data error";
return status;
}
if (!flags_->benchmark_data_file_.empty()) {
status = MarkAccuracy();
for (auto &data : benchmark_data_) {
data.second->shape.clear();
data.second->data.clear();
delete data.second;
data.second = nullptr;
}
benchmark_data_.clear();
if (status != 0) {
MS_LOG(ERROR) << "Run MarkAccuracy error: " << status;
std::cout << "Run MarkAccuracy error: " << status << std::endl;
return status;
}
} else {
status = MarkPerformance();
if (status != 0) {
MS_LOG(ERROR) << "Run MarkPerformance error: " << status;
std::cout << "Run MarkPerformance error: " << status << std::endl;
return status;
}
}
if (flags_->dump_tensor_data_) {
std::cout << "Dumped file is saved to : " + dump_file_output_dir_ << std::endl;
}
return RET_OK;
}
int BenchmarkUnifiedApi::InitTimeProfilingCallbackParameter() {
  // Install per-op timing callbacks: the before-hook stamps a start time, the
  // after-hook accumulates call counts and cost (ms) per op type and per op name.
  // before callback
  ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
                             const std::vector<mindspore::MSTensor> &before_outputs,
                             const MSCallBackParam &call_param) {
    if (before_inputs.empty()) {
      MS_LOG(INFO) << "The num of beforeInputs is empty";
    }
    if (before_outputs.empty()) {
      MS_LOG(INFO) << "The num of beforeOutputs is empty";
    }
    // First sighting of a type/name gets a zeroed (count, total-ms) slot.
    if (op_times_by_type_.find(call_param.node_type_) == op_times_by_type_.end()) {
      op_times_by_type_.insert(std::make_pair(call_param.node_type_, std::make_pair(0, 0.0f)));
    }
    if (op_times_by_name_.find(call_param.node_name_) == op_times_by_name_.end()) {
      op_times_by_name_.insert(std::make_pair(call_param.node_name_, std::make_pair(0, 0.0f)));
    }
    op_call_times_total_++;
    op_begin_ = GetTimeUs();
    return true;
  };
  // after callback
  ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
                            const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
    uint64_t opEnd = GetTimeUs();
    if (after_inputs.empty()) {
      MS_LOG(INFO) << "The num of after inputs is empty";
    }
    if (after_outputs.empty()) {
      MS_LOG(INFO) << "The num of after outputs is empty";
    }
    // Wall-clock cost in milliseconds (GetTimeUs returns microseconds).
    float cost = static_cast<float>(opEnd - op_begin_) / 1000.0f;
    if (flags_->device_ == "GPU") {
      // NOTE(review): assumes the runtime hands a GPUCallBackParam here when
      // the device is GPU — the cast is unchecked; confirm against the GPU executor.
      auto gpu_param = reinterpret_cast<const GPUCallBackParam &>(call_param);
      cost = static_cast<float>(gpu_param.execute_time);
    }
    op_cost_total_ += cost;
    op_times_by_type_[call_param.node_type_].first++;
    op_times_by_type_[call_param.node_type_].second += cost;
    op_times_by_name_[call_param.node_name_].first++;
    op_times_by_name_[call_param.node_name_].second += cost;
    return true;
  };
  return RET_OK;
}
int BenchmarkUnifiedApi::InitPerfProfilingCallbackParameter() {
#ifndef ENABLE_ARM64
  MS_LOG(ERROR) << "Only support perf_profiling on arm64.";
  return RET_ERROR;
#else
  // Open a perf-event group of two hardware counters (selected by --perfEvent)
  // and install callbacks that sample the group around every op.
  struct perf_event_attr pe, pe2;
  memset(&pe, 0, sizeof(struct perf_event_attr));
  memset(&pe2, 0, sizeof(struct perf_event_attr));
  pe.type = PERF_TYPE_HARDWARE;
  pe2.type = PERF_TYPE_HARDWARE;
  pe.size = sizeof(struct perf_event_attr);
  pe2.size = sizeof(struct perf_event_attr);
  pe.disabled = 1;
  pe2.disabled = 1;
  pe.exclude_kernel = 1;   // don't count kernel
  pe2.exclude_kernel = 1;  // don't count kernel
  pe.exclude_hv = 1;       // don't count hypervisor
  pe2.exclude_hv = 1;      // don't count hypervisor
  pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
  pe2.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
  if (flags_->perf_event_ == "CACHE") {
    pe.config = PERF_COUNT_HW_CACHE_REFERENCES;
    pe2.config = PERF_COUNT_HW_CACHE_MISSES;
  } else if (flags_->perf_event_ == "STALL") {
    pe.config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND;
    pe2.config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND;
  } else {
    pe.config = PERF_COUNT_HW_CPU_CYCLES;
    pe2.config = PERF_COUNT_HW_INSTRUCTIONS;
  }
  // fix: perf_event_open takes a POINTER to the attr struct; passing the
  // struct by value through the syscall varargs handed the kernel garbage.
  perf_fd = syscall(__NR_perf_event_open, &pe, 0, -1, -1, 0);
  if (perf_fd == -1) {
    MS_LOG(ERROR) << "Failed to open perf event " << pe.config;
    return RET_ERROR;
  }
  perf_fd2 = syscall(__NR_perf_event_open, &pe2, 0, -1, perf_fd, 0);
  if (perf_fd2 == -1) {
    MS_LOG(ERROR) << "Failed to open perf event " << pe2.config;
    return RET_ERROR;
  }
  struct PerfCount zero;
  zero.value[0] = 0;
  zero.value[1] = 0;
  // before callback
  // fix: `zero` is now captured by VALUE — the stored callback outlives this
  // stack frame, so the previous by-reference capture dangled.
  ms_before_call_back_ = [&, zero](const std::vector<mindspore::MSTensor> &before_inputs,
                                   const std::vector<mindspore::MSTensor> &before_outputs,
                                   const MSCallBackParam &call_param) {
    if (before_inputs.empty()) {
      MS_LOG(INFO) << "The num of beforeInputs is empty";
    }
    if (before_outputs.empty()) {
      MS_LOG(INFO) << "The num of beforeOutputs is empty";
    }
    if (op_perf_by_type_.find(call_param.node_type_) == op_perf_by_type_.end()) {
      op_perf_by_type_.insert(std::make_pair(call_param.node_type_, std::make_pair(0, zero)));
    }
    if (op_perf_by_name_.find(call_param.node_name_) == op_perf_by_name_.end()) {
      op_perf_by_name_.insert(std::make_pair(call_param.node_name_, std::make_pair(0, zero)));
    }
    op_call_times_total_++;
    // Reset and start the counter group just before the op executes.
    ioctl(perf_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
    ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
    return true;
  };
  // after callback
  ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
                            const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
    struct PerfResult res;
    ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
    // fix: the read() result was ignored; on a short or failed read `res`
    // stayed uninitialized and the accumulated numbers were garbage.
    if (read(perf_fd, &res, sizeof(struct PerfResult)) != static_cast<ssize_t>(sizeof(struct PerfResult))) {
      MS_LOG(ERROR) << "Read perf result failed.";
      return false;
    }
    if (after_inputs.empty()) {
      MS_LOG(INFO) << "The num of after inputs is empty";
    }
    if (after_outputs.empty()) {
      MS_LOG(INFO) << "The num of after outputs is empty";
    }
    float cost1 = static_cast<float>(res.values[0].value);
    float cost2 = static_cast<float>(res.values[1].value);
    op_cost_total_ += cost1;
    op_cost2_total_ += cost2;
    op_perf_by_type_[call_param.node_type_].first++;
    op_perf_by_type_[call_param.node_type_].second.value[0] += cost1;
    op_perf_by_type_[call_param.node_type_].second.value[1] += cost2;
    op_perf_by_name_[call_param.node_name_].first++;
    op_perf_by_name_[call_param.node_name_].second.value[0] += cost1;
    op_perf_by_name_[call_param.node_name_].second.value[1] += cost2;
    return true;
  };
#endif
  return RET_OK;
}
namespace {
template <typename T>
std::string DataToString(void *data, size_t data_number) {
  // Render up to the first 40 elements of `data` as a space-prefixed list.
  if (data == nullptr) {
    return "Data of tensor is nullptr";
  }
  std::ostringstream out;
  const T *values = static_cast<T *>(data);
  const size_t limit = std::min<size_t>(data_number, 40);
  for (size_t idx = 0; idx < limit; ++idx) {
    out << " " << values[idx];
  }
  return out.str();
}
std::string DumpMSTensor(mindspore::MSTensor *tensor) {
  // Render a tensor's dtype, shape and leading values (via DataToString, which
  // caps output at 40 elements) for the print-tensor debug callback.
  if (tensor == nullptr) {
    return "Tensor is nullptr";
  }
  std::ostringstream oss;
  oss << " DataType: " << static_cast<int>(tensor->DataType());
  oss << " Shape:";
  for (auto &dim : tensor->Shape()) {
    oss << " " << dim;
  }
  oss << std::endl << " Data:";
  switch (static_cast<int>(tensor->DataType())) {
    case kNumberTypeFloat32: {
      oss << DataToString<float>(tensor->MutableData(), tensor->ElementNum());
    } break;
    case kNumberTypeFloat16: {
      // fp16 payload is printed as raw int16 bit patterns, not decoded floats.
      oss << DataToString<int16_t>(tensor->MutableData(), tensor->ElementNum());
    } break;
    case kNumberTypeInt32: {
      oss << DataToString<int32_t>(tensor->MutableData(), tensor->ElementNum());
    } break;
    case kNumberTypeInt16: {
      oss << DataToString<int16_t>(tensor->MutableData(), tensor->ElementNum());
    } break;
    case kNumberTypeInt8: {
      oss << DataToString<int8_t>(tensor->MutableData(), tensor->ElementNum());
    } break;
    default:
      oss << "Unsupported data type to print";
      break;
  }
  return oss.str();
}
std::string GenerateOutputFileName(mindspore::MSTensor *tensor, const std::string &op_name,
                                   const std::string &file_type, const size_t &idx) {
  // Build "<op>_<file_type>_<idx>_shape_<d0>_<d1>_..._<dtype>.bin", flattening
  // any '/' in the op name to '.' so the result is a single valid file name.
  std::string file_name = op_name;
  std::replace(file_name.begin(), file_name.end(), '/', '.');  // idiom: was a manual find/replace loop
  file_name += "_" + file_type + "_" + std::to_string(idx) + "_shape_";
  for (const auto &dim : tensor->Shape()) {
    file_name += std::to_string(dim) + "_";
  }
  // Single map lookup (was find() followed by at()).
  auto type_it = TYPE_ID_MAP.find(static_cast<int>(tensor->DataType()));
  if (type_it != TYPE_ID_MAP.end()) {
    file_name += type_it->second;
  }
  file_name += ".bin";  // fix: stray unary '+' before the literal
  return file_name;
}
} // namespace
int BenchmarkUnifiedApi::InitPrintTensorDataCallbackParameter() {
  // Install callbacks that pretty-print every op's input and output tensors
  // (dtype, shape, leading values) to stdout after the op runs.
  // before callback
  ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
                             const std::vector<mindspore::MSTensor> &before_outputs,
                             const MSCallBackParam &call_param) { return true; };  // no-op before the op
  // after callback
  ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
                            const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
    std::cout << "================================================================" << std::endl;
    std::cout << call_param.node_name_ << " inputs : " << std::endl;
    for (auto ms_tensor : after_inputs) {
      std::cout << DumpMSTensor(&ms_tensor) << std::endl;
    }
    std::cout << "----------------------------------------------------------------" << std::endl;
    std::cout << call_param.node_name_ << " outputs : " << std::endl;
    for (auto ms_tensor : after_outputs) {
      std::cout << DumpMSTensor(&ms_tensor) << std::endl;
    }
    std::cout << "================================================================" << std::endl;
    return true;
  };
  return RET_OK;
}
int BenchmarkUnifiedApi::InitDumpTensorDataCallbackParameter() {
  // Install callbacks that write op tensors to .bin files under
  // dump_file_output_dir_, driven by the JSON dump config:
  //   kMode == 0        -> dump every kernel, otherwise only those in kKernels;
  //   kInputOutput == 0 -> dump inputs and outputs, 1 -> inputs only, 2 -> outputs only.
  // before callback
  ms_before_call_back_ = [&](const std::vector<mindspore::MSTensor> &before_inputs,
                             const std::vector<mindspore::MSTensor> &before_outputs,
                             const MSCallBackParam &call_param) {
    auto dump_mode = dump_cfg_json_[dump::kSettings][dump::kMode].get<int>();
    auto input_output_mode = dump_cfg_json_[dump::kSettings][dump::kInputOutput].get<int>();
    auto kernels = dump_cfg_json_[dump::kSettings][dump::kKernels].get<std::vector<std::string>>();
    if (dump_mode == 0 || std::find(kernels.begin(), kernels.end(), call_param.node_name_) != kernels.end()) {
      if (input_output_mode == 0 || input_output_mode == 1) {
        for (size_t i = 0; i < before_inputs.size(); i++) {
          auto ms_tensor = before_inputs.at(i);
          auto file_name = GenerateOutputFileName(&ms_tensor, call_param.node_name_, "input", i);
          auto abs_file_path = dump_file_output_dir_ + "/" + file_name;
          if (WriteToBin(abs_file_path, ms_tensor.MutableData(), ms_tensor.DataSize()) != RET_OK) {  // save to file
            MS_LOG(ERROR) << "write tensor data to file failed.";
            return false;
          }
        }
      }
    }
    return true;
  };
  // after callback
  ms_after_call_back_ = [&](const std::vector<mindspore::MSTensor> &after_inputs,
                            const std::vector<mindspore::MSTensor> &after_outputs, const MSCallBackParam &call_param) {
    auto dump_mode = dump_cfg_json_[dump::kSettings][dump::kMode].get<int>();
    auto input_output_mode = dump_cfg_json_[dump::kSettings][dump::kInputOutput].get<int>();
    auto kernels = dump_cfg_json_[dump::kSettings][dump::kKernels].get<std::vector<std::string>>();
    if (dump_mode == 0 || std::find(kernels.begin(), kernels.end(), call_param.node_name_) != kernels.end()) {
      if (input_output_mode == 0 || input_output_mode == 2) {
        for (size_t i = 0; i < after_outputs.size(); i++) {
          auto ms_tensor = after_outputs.at(i);
          auto file_name = GenerateOutputFileName(&ms_tensor, call_param.node_name_, "output", i);
          auto abs_file_path = dump_file_output_dir_ + "/" + file_name;
          if (WriteToBin(abs_file_path, ms_tensor.MutableData(), ms_tensor.DataSize()) != RET_OK) {  // save to file
            MS_LOG(ERROR) << "write tensor data to file failed.";
            return false;
          }
        }
      }
    }
    return true;
  };
  return RET_OK;
}
BenchmarkUnifiedApi::~BenchmarkUnifiedApi() = default;
} // namespace lite
} // namespace mindspore

View File

@ -0,0 +1,103 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_BENCHMARK_BENCHMARK_UNIFIED_API_H_
#define MINDSPORE_BENCHMARK_BENCHMARK_UNIFIED_API_H_
#include <getopt.h>
#include <signal.h>
#include <random>
#include <unordered_map>
#include <fstream>
#include <iostream>
#include <map>
#include <cmath>
#include <string>
#include <vector>
#include <memory>
#include <cfloat>
#include <utility>
#include <nlohmann/json.hpp>
#include "tools/benchmark/benchmark_base.h"
#include "include/model.h"
#include "tools/common/flag_parser.h"
#include "src/common/file_utils.h"
#include "src/common/utils.h"
#include "include/api/types.h"
#include "include/api/model.h"
namespace mindspore::lite {
// Benchmark driver built on the unified C++ API (mindspore::Model /
// mindspore::MSTensor), selected at runtime via ENABLE_NEW_API=true; mirrors
// the legacy lite-API Benchmark class.
class MS_API BenchmarkUnifiedApi : public BenchmarkBase {
 public:
  explicit BenchmarkUnifiedApi(BenchmarkFlags *flags) : BenchmarkBase(flags) {}
  virtual ~BenchmarkUnifiedApi();
  // Entry point: build the model, resize/load inputs, then run the accuracy
  // path (when a calib file is given) or the performance path.
  int RunBenchmark() override;
 protected:
  // Compare one output tensor against its calib record and accumulate its
  // bias into *total_bias, bumping *total_size by one.
  int CompareDataGetTotalBiasAndSize(const std::string &name, mindspore::MSTensor *tensor, float *total_bias,
                                     int *total_size);
  // NOTE(review): no definition appears in benchmark_unified_api.cc
  // (InitMSContext below is the one actually used) — confirm whether this is dead.
  void InitContext(const std::shared_ptr<mindspore::Context> &context);
  // Resolve an output tensor by matching its shape against model outputs.
  mindspore::MSTensor GetMSTensorByNodeShape(const std::vector<size_t> &node_shape);
  // Resolve an output by node name, then tensor name, then shape as fallback.
  mindspore::MSTensor GetMSTensorByNameOrShape(const std::string &node_or_tensor_name, const std::vector<size_t> &dims);
  // call GenerateRandomData to fill inputTensors
  int GenerateInputData() override;
  int ReadInputFile() override;
  int ReadTensorData(std::ifstream &in_file_stream, const std::string &tensor_name,
                     const std::vector<size_t> &dims) override;
  // Populate thread/affinity settings and the CPU/GPU/NPU device list from flags.
  void InitMSContext(const std::shared_ptr<Context> &context);
  int CompareOutput() override;
  int InitTimeProfilingCallbackParameter() override;
  int InitPerfProfilingCallbackParameter() override;
  int InitDumpTensorDataCallbackParameter() override;
  int InitPrintTensorDataCallbackParameter() override;
  int PrintInputData();
  // Widen an integral dims vector to int64_t for the unified API.
  template <typename T>
  std::vector<int64_t> ConverterToInt64Vector(const std::vector<T> &srcDims) {
    std::vector<int64_t> dims;
    for (auto shape : srcDims) {
      dims.push_back(static_cast<int64_t>(shape));
    }
    return dims;
  }
  int MarkPerformance();
  int MarkAccuracy();
 private:
  mindspore::Model ms_model_;
  std::vector<mindspore::MSTensor> ms_inputs_for_api_;  // cached ms_model_.GetInputs()
  MSKernelCallBack ms_before_call_back_ = nullptr;
  MSKernelCallBack ms_after_call_back_ = nullptr;
};
} // namespace mindspore::lite
#endif // MINNIE_BENCHMARK_BENCHMARK_H_

View File

@ -14,7 +14,7 @@
* limitations under the License.
*/
#include "tools/benchmark/benchmark.h"
#include "tools/benchmark/run_benchmark.h"
#include "include/version.h"
int main(int argc, const char **argv) {

View File

@ -0,0 +1,82 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tools/benchmark/run_benchmark.h"
#include <memory>
#include <string>
namespace mindspore {
namespace lite {
int RunBenchmark(int argc, const char **argv) {
BenchmarkFlags flags;
Option<std::string> err = flags.ParseFlags(argc, argv);
#ifdef SUPPORT_NNIE
SvpSysInit();
#endif
if (err.IsSome()) {
std::cerr << err.Get() << std::endl;
std::cerr << flags.Usage() << std::endl;
return RET_ERROR;
}
if (flags.help) {
std::cerr << flags.Usage() << std::endl;
return RET_OK;
}
BenchmarkBase *benchmark = nullptr;
// get dump data output path
auto new_api = std::getenv("ENABLE_NEW_API");
if (new_api == nullptr || std::string(new_api) != "true") {
benchmark = new Benchmark(&flags);
} else {
benchmark = new BenchmarkUnifiedApi(&flags);
}
if (benchmark == nullptr) {
MS_LOG(ERROR) << "new benchmark failed ";
std::cerr << "new benchmark failed" << std::endl;
return RET_ERROR;
}
auto status = benchmark->Init();
if (status != 0) {
MS_LOG(ERROR) << "Benchmark init Error : " << status;
std::cerr << "Benchmark init Error : " << status << std::endl;
delete benchmark;
benchmark = nullptr;
return RET_ERROR;
}
status = benchmark->RunBenchmark();
if (status != 0) {
MS_LOG(ERROR) << "Run Benchmark "
<< flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
<< " Failed : " << status;
std::cerr << "Run Benchmark " << flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
<< " Failed : " << status << std::endl;
delete benchmark;
benchmark = nullptr;
return RET_ERROR;
}
MS_LOG(INFO) << "Run Benchmark " << flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
<< " Success.";
std::cout << "Run Benchmark " << flags.model_file_.substr(flags.model_file_.find_last_of(DELIM_SLASH) + 1).c_str()
<< " Success." << std::endl;
delete benchmark;
benchmark = nullptr;
return RET_OK;
}
} // namespace lite
} // namespace mindspore

View File

@ -0,0 +1,27 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINNIE_BENCHMARK_RUN_BENCHMARK_H_
#define MINNIE_BENCHMARK_RUN_BENCHMARK_H_
#include "tools/benchmark/benchmark.h"
#include "tools/benchmark/benchmark_unified_api.h"
namespace mindspore::lite {
// Entry point of the benchmark tool: parses argv flags, selects and runs a
// benchmark implementation (legacy or unified API), and returns RET_OK on
// success or RET_ERROR on failure.
int MS_API RunBenchmark(int argc, const char **argv);
} // namespace mindspore::lite
#endif // MINNIE_BENCHMARK_RUN_BENCHMARK_H_