forked from mindspore-Ecosystem/mindspore
!13485 [MS][LITE][GPU]GPU support print profiling info
From: @chenzupeng Reviewed-by: @ddwsky,@zhanghaibo5 Signed-off-by: @ddwsky
This commit is contained in:
commit
4b319bbac7
|
@ -100,6 +100,10 @@ struct CallBackParam {
|
|||
std::string node_type; /**< node type argument */
|
||||
};
|
||||
|
||||
/// \brief Callback parameter used by the OpenCL (GPU) executor.
///
/// Extends CallBackParam with the execution time measured for the kernel
/// the callback is being invoked for.
struct GPUCallBackParam : CallBackParam {
  double execute_time{-1.0}; /**< kernel execute time in ms; remains -1 when no profiling time was recorded */
};
|
||||
|
||||
/// \brief KernelCallBack defines the function type used for kernel callbacks.
|
||||
using KernelCallBack = std::function<bool(std::vector<tensor::MSTensor *> inputs,
|
||||
std::vector<tensor::MSTensor *> outputs, const CallBackParam &opInfo)>;
|
||||
|
|
|
@ -32,15 +32,18 @@ int OpenCLExecutor::RunOrTune(const std::vector<Tensor *> &inputs, const std::ve
|
|||
const KernelCallBack &before, const KernelCallBack &after, bool is_tune) {
|
||||
int ret{RET_OK};
|
||||
auto opencl_runtime_ins = ocl_runtime.GetInstance();
|
||||
if (before != nullptr && after != nullptr) {
|
||||
opencl_runtime_ins->SetProfiling(true);
|
||||
}
|
||||
auto profiling_tmp = opencl_runtime_ins->isProfiling();
|
||||
if (is_tune) {
|
||||
opencl_runtime_ins->SetProfiling(true);
|
||||
}
|
||||
for (auto *kernel : kernels) {
|
||||
MS_ASSERT(kernel);
|
||||
CallBackParam callbackParam;
|
||||
GPUCallBackParam callbackParam;
|
||||
callbackParam.node_name = kernel->name();
|
||||
|
||||
callbackParam.node_type = kernel->type_str();
|
||||
if (before != nullptr) {
|
||||
if (!before(TensorVectorCast(kernel->in_tensors()), TensorVectorCast(kernel->out_tensors()), callbackParam)) {
|
||||
MS_LOG(ERROR) << "run kernel before_callback failed, name: " << kernel->name();
|
||||
|
@ -70,9 +73,12 @@ int OpenCLExecutor::RunOrTune(const std::vector<Tensor *> &inputs, const std::ve
|
|||
MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name();
|
||||
return ret;
|
||||
}
|
||||
if (profiling_tmp)
|
||||
if (profiling_tmp) {
|
||||
auto execute_time = op_kernel->GetProfilingTimeMs();
|
||||
MS_LOG(INFO) << "OpenCl kernel " << kernel->name() << "(" << kernel->type_str()
|
||||
<< ") execute time is: " << op_kernel->GetProfilingTimeMs() << "ms";
|
||||
callbackParam.execute_time = execute_time;
|
||||
}
|
||||
}
|
||||
ret = kernel->PostProcess();
|
||||
if (ret != RET_OK) {
|
||||
|
|
|
@ -198,12 +198,29 @@ int WinogradOpenCLKernel::Run() {
|
|||
ocl_runtime_->RunKernel(kernel_4x4to36_, global_4x4to36_, local_4x4to36_, nullptr, &event_);
|
||||
|
||||
MS_LOG(DEBUG) << "winograd kernel1 Running!";
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
|
||||
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &kernel2_event_);
|
||||
|
||||
MS_LOG(DEBUG) << "winograd kernel2 Running!";
|
||||
ocl_runtime_->SetKernelArg(kernel_36to4x4_, 1, out_tensors_.front()->data_c());
|
||||
ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_, nullptr, &event_);
|
||||
ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_, nullptr, &kernel3_event_);
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
double WinogradOpenCLKernel::GetProfilingTimeMs() {
|
||||
if (!ocl_runtime_->isProfiling()) {
|
||||
return MAX_PROFILING_TIME_MILLI_SECOND;
|
||||
}
|
||||
cl_ulong time_start;
|
||||
cl_ulong time_end;
|
||||
event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
|
||||
event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
|
||||
cl_ulong time_ns = time_end - time_start;
|
||||
kernel2_event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
|
||||
kernel2_event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
|
||||
time_ns += time_end - time_start;
|
||||
kernel3_event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
|
||||
kernel3_event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
|
||||
time_ns += time_end - time_start;
|
||||
return static_cast<double>(time_ns) * 1e-6;
|
||||
}
|
||||
} // namespace mindspore::kernel
|
||||
|
|
|
@ -39,6 +39,7 @@ class WinogradOpenCLKernel : public Conv2DOpenCLKernel {
|
|||
|
||||
std::vector<BaseTuningParameter> GenerateTuningParam() override { return {}; }
|
||||
int Tune() override { return RET_OK; }
|
||||
double GetProfilingTimeMs() override;
|
||||
|
||||
private:
|
||||
void BuildKernel() override;
|
||||
|
@ -47,8 +48,10 @@ class WinogradOpenCLKernel : public Conv2DOpenCLKernel {
|
|||
|
||||
cl::Kernel kernel_4x4to36_;
|
||||
cl::Kernel kernel_36to4x4_;
|
||||
cl::Event kernel2_event_;
|
||||
cl::NDRange global_4x4to36_, local_4x4to36_;
|
||||
cl::NDRange global_36to4x4_, local_36to4x4_;
|
||||
cl::Event kernel3_event_;
|
||||
void *winograd_mem0_{nullptr};
|
||||
void *winograd_mem1_{nullptr};
|
||||
};
|
||||
|
|
|
@ -195,7 +195,7 @@ class OpenCLKernel : public LiteKernel {
|
|||
lite::opencl::MemType GetMemType() { return out_mem_type_; }
|
||||
void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; }
|
||||
OpParameter *GetParameter() { return op_parameter_; }
|
||||
double GetProfilingTimeMs();
|
||||
virtual double GetProfilingTimeMs();
|
||||
int DequantWeight();
|
||||
void FreeDequantedWeight();
|
||||
virtual int InferShape();
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include <set>
|
||||
#include <map>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include "src/runtime/gpu/opencl/opencl_executor.h"
|
||||
#include "src/runtime/kernel/opencl/utils.h"
|
||||
#include "src/runtime/kernel/opencl/kernel/to_format.h"
|
||||
|
@ -467,4 +468,33 @@ int OpenCLSubGraph::Run() {
|
|||
}
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int OpenCLSubGraph::Run(const KernelCallBack &before, const KernelCallBack &after) {
|
||||
if (executor_ == nullptr) {
|
||||
MS_LOG(ERROR) << "executor is nullptr";
|
||||
return RET_ERROR;
|
||||
}
|
||||
int ret;
|
||||
for (auto &tensor : in_tensors_) {
|
||||
MS_ASSERT(tensor);
|
||||
if (tensor->data_c() == nullptr) {
|
||||
MS_LOG(ERROR) << "OpenCL subgraph input tensor data is null";
|
||||
return RET_ERROR;
|
||||
}
|
||||
ret = allocator_->UnmapBuffer(tensor->data_c());
|
||||
if (ret != RET_OK) {
|
||||
return ret;
|
||||
}
|
||||
}
|
||||
|
||||
ret = executor_->Run(in_tensors_, out_tensors_, nodes_, allocator_, before, after);
|
||||
if (ret != RET_OK) {
|
||||
MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
|
||||
return ret;
|
||||
}
|
||||
if (!ocl_runtime_->SyncCommandQueue()) {
|
||||
return RET_ERROR;
|
||||
}
|
||||
return RET_OK;
|
||||
}
|
||||
} // namespace mindspore::kernel
|
||||
|
|
|
@ -46,7 +46,7 @@ class OpenCLSubGraph : public SubGraphKernel {
|
|||
int ReSize() override;
|
||||
int ReSize(bool interrupt);
|
||||
int Run() override;
|
||||
int Run(const KernelCallBack &before, const KernelCallBack &after) override { return this->Run(); };
|
||||
int Run(const KernelCallBack &before, const KernelCallBack &after) override;
|
||||
int InsertOpsPass();
|
||||
bool IsSubGraphInferShapeDone();
|
||||
|
||||
|
|
|
@ -568,10 +568,8 @@ int Benchmark::RunBenchmark() {
|
|||
}
|
||||
|
||||
auto &cpu_device_ctx = context->device_list_[0];
|
||||
if (flags_->cpu_bind_mode_ == MID_CPU) {
|
||||
cpu_device_ctx.device_info_.cpu_device_info_.cpu_bind_mode_ = MID_CPU;
|
||||
} else if (flags_->cpu_bind_mode_ == HIGHER_CPU) {
|
||||
cpu_device_ctx.device_info_.cpu_device_info_.cpu_bind_mode_ = HIGHER_CPU;
|
||||
if (flags_->cpu_bind_mode_ == MID_CPU || flags_->cpu_bind_mode_ == HIGHER_CPU) {
|
||||
cpu_device_ctx.device_info_.cpu_device_info_.cpu_bind_mode_ = CpuBindMode(flags_->cpu_bind_mode_);
|
||||
} else {
|
||||
cpu_device_ctx.device_info_.cpu_device_info_.cpu_bind_mode_ = NO_BIND;
|
||||
}
|
||||
|
@ -611,9 +609,8 @@ int Benchmark::RunBenchmark() {
|
|||
return ret;
|
||||
}
|
||||
}
|
||||
if (model != nullptr) {
|
||||
model->Free();
|
||||
}
|
||||
if (model != nullptr) model->Free();
|
||||
|
||||
ms_inputs_ = session_->GetInputs();
|
||||
auto end_prepare_time = GetTimeUs();
|
||||
MS_LOG(INFO) << "PrepareTime = " << (end_prepare_time - start_prepare_time) / 1000 << " ms";
|
||||
|
@ -682,147 +679,161 @@ void BenchmarkFlags::InitResizeDimsList() {
|
|||
}
|
||||
}
|
||||
|
||||
int Benchmark::InitCallbackParameter() {
|
||||
if (flags_->time_profiling_) {
|
||||
// before callback
|
||||
before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
|
||||
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
|
||||
const CallBackParam &callParam) {
|
||||
if (before_inputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of beforeInputs is empty";
|
||||
}
|
||||
if (before_outputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of beforeOutputs is empty";
|
||||
}
|
||||
if (op_times_by_type_.find(callParam.node_type) == op_times_by_type_.end()) {
|
||||
op_times_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, 0.0f)));
|
||||
}
|
||||
if (op_times_by_name_.find(callParam.node_name) == op_times_by_name_.end()) {
|
||||
op_times_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, 0.0f)));
|
||||
}
|
||||
|
||||
op_call_times_total_++;
|
||||
op_begin_ = GetTimeUs();
|
||||
return true;
|
||||
};
|
||||
|
||||
// after callback
|
||||
after_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
|
||||
const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
|
||||
const CallBackParam &call_param) {
|
||||
uint64_t opEnd = GetTimeUs();
|
||||
|
||||
if (after_inputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of after inputs is empty";
|
||||
}
|
||||
if (after_outputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of after outputs is empty";
|
||||
}
|
||||
|
||||
float cost = static_cast<float>(opEnd - op_begin_) / 1000.0f;
|
||||
op_cost_total_ += cost;
|
||||
op_times_by_type_[call_param.node_type].first++;
|
||||
op_times_by_type_[call_param.node_type].second += cost;
|
||||
op_times_by_name_[call_param.node_name].first++;
|
||||
op_times_by_name_[call_param.node_name].second += cost;
|
||||
return true;
|
||||
};
|
||||
} else if (flags_->perf_profiling_) {
|
||||
#ifndef ENABLE_ARM64
|
||||
MS_LOG(ERROR) << "Only support perf_profiling on arm64.";
|
||||
return RET_ERROR;
|
||||
#else
|
||||
struct perf_event_attr pe, pe2;
|
||||
memset(&pe, 0, sizeof(struct perf_event_attr));
|
||||
memset(&pe2, 0, sizeof(struct perf_event_attr));
|
||||
pe.type = PERF_TYPE_HARDWARE;
|
||||
pe2.type = PERF_TYPE_HARDWARE;
|
||||
pe.size = sizeof(struct perf_event_attr);
|
||||
pe2.size = sizeof(struct perf_event_attr);
|
||||
pe.disabled = 1;
|
||||
pe2.disabled = 1;
|
||||
pe.exclude_kernel = 1; // don't count kernel
|
||||
pe2.exclude_kernel = 1; // don't count kernel
|
||||
pe.exclude_hv = 1; // don't count hypervisor
|
||||
pe2.exclude_hv = 1; // don't count hypervisor
|
||||
pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
|
||||
pe2.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
|
||||
if (flags_->perf_event_ == "CACHE") {
|
||||
pe.config = PERF_COUNT_HW_CACHE_REFERENCES;
|
||||
pe2.config = PERF_COUNT_HW_CACHE_MISSES;
|
||||
} else if (flags_->perf_event_ == "STALL") {
|
||||
pe.config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND;
|
||||
pe2.config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND;
|
||||
} else {
|
||||
pe.config = PERF_COUNT_HW_CPU_CYCLES;
|
||||
pe2.config = PERF_COUNT_HW_INSTRUCTIONS;
|
||||
int Benchmark::InitTimeProfilingCallbackParameter() {
|
||||
// before callback
|
||||
before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
|
||||
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
|
||||
const CallBackParam &callParam) {
|
||||
if (before_inputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of beforeInputs is empty";
|
||||
}
|
||||
perf_fd = syscall(__NR_perf_event_open, pe, 0, -1, -1, 0);
|
||||
if (perf_fd == -1) {
|
||||
MS_LOG(ERROR) << "Failed to open perf event " << pe.config;
|
||||
return RET_ERROR;
|
||||
if (before_outputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of beforeOutputs is empty";
|
||||
}
|
||||
perf_fd2 = syscall(__NR_perf_event_open, pe2, 0, -1, perf_fd, 0);
|
||||
if (perf_fd2 == -1) {
|
||||
MS_LOG(ERROR) << "Failed to open perf event " << pe2.config;
|
||||
return RET_ERROR;
|
||||
if (op_times_by_type_.find(callParam.node_type) == op_times_by_type_.end()) {
|
||||
op_times_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, 0.0f)));
|
||||
}
|
||||
if (op_times_by_name_.find(callParam.node_name) == op_times_by_name_.end()) {
|
||||
op_times_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, 0.0f)));
|
||||
}
|
||||
struct PerfCount zero;
|
||||
zero.value[0] = 0;
|
||||
zero.value[1] = 0;
|
||||
// before callback
|
||||
before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
|
||||
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
|
||||
const CallBackParam &callParam) {
|
||||
if (before_inputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of beforeInputs is empty";
|
||||
}
|
||||
if (before_outputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of beforeOutputs is empty";
|
||||
}
|
||||
if (op_perf_by_type_.find(callParam.node_type) == op_perf_by_type_.end()) {
|
||||
op_perf_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, zero)));
|
||||
}
|
||||
if (op_perf_by_name_.find(callParam.node_name) == op_perf_by_name_.end()) {
|
||||
op_perf_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, zero)));
|
||||
}
|
||||
|
||||
op_call_times_total_++;
|
||||
ioctl(perf_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
|
||||
ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
|
||||
return true;
|
||||
};
|
||||
op_call_times_total_++;
|
||||
op_begin_ = GetTimeUs();
|
||||
return true;
|
||||
};
|
||||
|
||||
// after callback
|
||||
after_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
|
||||
const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
|
||||
const CallBackParam &call_param) {
|
||||
struct PerfResult res;
|
||||
ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
|
||||
read(perf_fd, &res, sizeof(struct PerfResult));
|
||||
// after callback
|
||||
after_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
|
||||
const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
|
||||
const CallBackParam &call_param) {
|
||||
uint64_t opEnd = GetTimeUs();
|
||||
|
||||
if (after_inputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of after inputs is empty";
|
||||
}
|
||||
if (after_outputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of after outputs is empty";
|
||||
}
|
||||
float cost1 = static_cast<float>(res.values[0].value);
|
||||
float cost2 = static_cast<float>(res.values[1].value);
|
||||
op_cost_total_ += cost1;
|
||||
op_cost2_total_ += cost2;
|
||||
op_perf_by_type_[call_param.node_type].first++;
|
||||
op_perf_by_type_[call_param.node_type].second.value[0] += cost1;
|
||||
op_perf_by_type_[call_param.node_type].second.value[1] += cost2;
|
||||
op_perf_by_name_[call_param.node_name].first++;
|
||||
op_perf_by_name_[call_param.node_name].second.value[0] += cost1;
|
||||
op_perf_by_name_[call_param.node_name].second.value[1] += cost2;
|
||||
return true;
|
||||
};
|
||||
#endif
|
||||
}
|
||||
if (after_inputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of after inputs is empty";
|
||||
}
|
||||
if (after_outputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of after outputs is empty";
|
||||
}
|
||||
|
||||
float cost = static_cast<float>(opEnd - op_begin_) / 1000.0f;
|
||||
if (flags_->device_ == "GPU") {
|
||||
auto gpu_param = reinterpret_cast<const GPUCallBackParam &>(call_param);
|
||||
cost = static_cast<float>(gpu_param.execute_time);
|
||||
}
|
||||
op_cost_total_ += cost;
|
||||
op_times_by_type_[call_param.node_type].first++;
|
||||
op_times_by_type_[call_param.node_type].second += cost;
|
||||
op_times_by_name_[call_param.node_name].first++;
|
||||
op_times_by_name_[call_param.node_name].second += cost;
|
||||
return true;
|
||||
};
|
||||
return RET_OK;
|
||||
}
|
||||
int Benchmark::InitPerfProfilingCallbackParameter() {
|
||||
#ifndef ENABLE_ARM64
|
||||
MS_LOG(ERROR) << "Only support perf_profiling on arm64.";
|
||||
return RET_ERROR;
|
||||
#else
|
||||
struct perf_event_attr pe, pe2;
|
||||
memset(&pe, 0, sizeof(struct perf_event_attr));
|
||||
memset(&pe2, 0, sizeof(struct perf_event_attr));
|
||||
pe.type = PERF_TYPE_HARDWARE;
|
||||
pe2.type = PERF_TYPE_HARDWARE;
|
||||
pe.size = sizeof(struct perf_event_attr);
|
||||
pe2.size = sizeof(struct perf_event_attr);
|
||||
pe.disabled = 1;
|
||||
pe2.disabled = 1;
|
||||
pe.exclude_kernel = 1; // don't count kernel
|
||||
pe2.exclude_kernel = 1; // don't count kernel
|
||||
pe.exclude_hv = 1; // don't count hypervisor
|
||||
pe2.exclude_hv = 1; // don't count hypervisor
|
||||
pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
|
||||
pe2.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
|
||||
if (flags_->perf_event_ == "CACHE") {
|
||||
pe.config = PERF_COUNT_HW_CACHE_REFERENCES;
|
||||
pe2.config = PERF_COUNT_HW_CACHE_MISSES;
|
||||
} else if (flags_->perf_event_ == "STALL") {
|
||||
pe.config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND;
|
||||
pe2.config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND;
|
||||
} else {
|
||||
pe.config = PERF_COUNT_HW_CPU_CYCLES;
|
||||
pe2.config = PERF_COUNT_HW_INSTRUCTIONS;
|
||||
}
|
||||
perf_fd = syscall(__NR_perf_event_open, pe, 0, -1, -1, 0);
|
||||
if (perf_fd == -1) {
|
||||
MS_LOG(ERROR) << "Failed to open perf event " << pe.config;
|
||||
return RET_ERROR;
|
||||
}
|
||||
perf_fd2 = syscall(__NR_perf_event_open, pe2, 0, -1, perf_fd, 0);
|
||||
if (perf_fd2 == -1) {
|
||||
MS_LOG(ERROR) << "Failed to open perf event " << pe2.config;
|
||||
return RET_ERROR;
|
||||
}
|
||||
struct PerfCount zero;
|
||||
zero.value[0] = 0;
|
||||
zero.value[1] = 0;
|
||||
// before callback
|
||||
before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
|
||||
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
|
||||
const CallBackParam &callParam) {
|
||||
if (before_inputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of beforeInputs is empty";
|
||||
}
|
||||
if (before_outputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of beforeOutputs is empty";
|
||||
}
|
||||
if (op_perf_by_type_.find(callParam.node_type) == op_perf_by_type_.end()) {
|
||||
op_perf_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, zero)));
|
||||
}
|
||||
if (op_perf_by_name_.find(callParam.node_name) == op_perf_by_name_.end()) {
|
||||
op_perf_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, zero)));
|
||||
}
|
||||
|
||||
op_call_times_total_++;
|
||||
ioctl(perf_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
|
||||
ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
|
||||
return true;
|
||||
};
|
||||
|
||||
// after callback
|
||||
after_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
|
||||
const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
|
||||
const CallBackParam &call_param) {
|
||||
struct PerfResult res;
|
||||
ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
|
||||
read(perf_fd, &res, sizeof(struct PerfResult));
|
||||
|
||||
if (after_inputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of after inputs is empty";
|
||||
}
|
||||
if (after_outputs.empty()) {
|
||||
MS_LOG(INFO) << "The num of after outputs is empty";
|
||||
}
|
||||
float cost1 = static_cast<float>(res.values[0].value);
|
||||
float cost2 = static_cast<float>(res.values[1].value);
|
||||
op_cost_total_ += cost1;
|
||||
op_cost2_total_ += cost2;
|
||||
op_perf_by_type_[call_param.node_type].first++;
|
||||
op_perf_by_type_[call_param.node_type].second.value[0] += cost1;
|
||||
op_perf_by_type_[call_param.node_type].second.value[1] += cost2;
|
||||
op_perf_by_name_[call_param.node_name].first++;
|
||||
op_perf_by_name_[call_param.node_name].second.value[0] += cost1;
|
||||
op_perf_by_name_[call_param.node_name].second.value[1] += cost2;
|
||||
return true;
|
||||
};
|
||||
#endif
|
||||
return RET_OK;
|
||||
}
|
||||
|
||||
int Benchmark::InitCallbackParameter() {
|
||||
int ret = RET_OK;
|
||||
if (flags_->time_profiling_) {
|
||||
ret = InitTimeProfilingCallbackParameter();
|
||||
} else if (flags_->perf_profiling_) {
|
||||
ret = InitPerfProfilingCallbackParameter();
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
int Benchmark::Init() {
|
||||
if (this->flags_ == nullptr) {
|
||||
|
@ -859,13 +870,10 @@ int Benchmark::Init() {
|
|||
std::cerr << "numThreads:" << this->flags_->num_threads_ << " must be greater than 0" << std::endl;
|
||||
return RET_ERROR;
|
||||
}
|
||||
|
||||
if (this->flags_->cpu_bind_mode_ == 2) {
|
||||
MS_LOG(INFO) << "cpuBindMode = MID_CPU";
|
||||
std::cout << "cpuBindMode = MID_CPU" << std::endl;
|
||||
} else if (this->flags_->cpu_bind_mode_ == 1) {
|
||||
MS_LOG(INFO) << "cpuBindMode = HIGHER_CPU";
|
||||
std::cout << "cpuBindMode = HIGHER_CPU" << std::endl;
|
||||
static std::vector<std::string> CPU_BIND_MODE_MAP = {"NO_BIND", "HIGHER_CPU", "MID_CPU"};
|
||||
if (this->flags_->cpu_bind_mode_ >= 1) {
|
||||
MS_LOG(INFO) << "cpuBindMode = " << CPU_BIND_MODE_MAP[this->flags_->cpu_bind_mode_];
|
||||
std::cout << "cpuBindMode = " << CPU_BIND_MODE_MAP[this->flags_->cpu_bind_mode_] << std::endl;
|
||||
} else {
|
||||
MS_LOG(INFO) << "cpuBindMode = NO_BIND";
|
||||
std::cout << "cpuBindMode = NO_BIND" << std::endl;
|
||||
|
|
|
@ -163,6 +163,8 @@ class MS_API Benchmark {
|
|||
int *total_size);
|
||||
|
||||
int InitCallbackParameter();
|
||||
int InitTimeProfilingCallbackParameter();
|
||||
int InitPerfProfilingCallbackParameter();
|
||||
|
||||
int PrintResult(const std::vector<std::string> &title, const std::map<std::string, std::pair<int, float>> &result);
|
||||
|
||||
|
|
Loading…
Reference in New Issue