!13485 [MS][LITE][GPU]GPU support print profiling info

From: @chenzupeng
Reviewed-by: @ddwsky,@zhanghaibo5
Signed-off-by: @ddwsky
This commit is contained in:
mindspore-ci-bot 2021-03-18 10:28:47 +08:00 committed by Gitee
commit 4b319bbac7
9 changed files with 224 additions and 154 deletions

View File

@ -100,6 +100,10 @@ struct CallBackParam {
std::string node_type; /**< node type argument */
};
/// \brief GPUCallBackParam extends CallBackParam with GPU profiling data
/// reported by the OpenCL executor.
struct GPUCallBackParam : CallBackParam {
  // Device-side kernel execution time in milliseconds.
  // -1.0 is the sentinel for "not measured" (OpenCL profiling disabled).
  // Use a double literal; the original float literal -1.f was silently
  // converted to double.
  double execute_time{-1.0};
};
/// \brief KernelCallBack defined the function pointer for callBack.
using KernelCallBack = std::function<bool(std::vector<tensor::MSTensor *> inputs,
std::vector<tensor::MSTensor *> outputs, const CallBackParam &opInfo)>;

View File

@ -32,15 +32,18 @@ int OpenCLExecutor::RunOrTune(const std::vector<Tensor *> &inputs, const std::ve
const KernelCallBack &before, const KernelCallBack &after, bool is_tune) {
int ret{RET_OK};
auto opencl_runtime_ins = ocl_runtime.GetInstance();
if (before != nullptr && after != nullptr) {
opencl_runtime_ins->SetProfiling(true);
}
auto profiling_tmp = opencl_runtime_ins->isProfiling();
if (is_tune) {
opencl_runtime_ins->SetProfiling(true);
}
for (auto *kernel : kernels) {
MS_ASSERT(kernel);
CallBackParam callbackParam;
GPUCallBackParam callbackParam;
callbackParam.node_name = kernel->name();
callbackParam.node_type = kernel->type_str();
if (before != nullptr) {
if (!before(TensorVectorCast(kernel->in_tensors()), TensorVectorCast(kernel->out_tensors()), callbackParam)) {
MS_LOG(ERROR) << "run kernel before_callback failed, name: " << kernel->name();
@ -70,9 +73,12 @@ int OpenCLExecutor::RunOrTune(const std::vector<Tensor *> &inputs, const std::ve
MS_LOG(ERROR) << "run kernel failed, name: " << kernel->name();
return ret;
}
if (profiling_tmp)
if (profiling_tmp) {
auto execute_time = op_kernel->GetProfilingTimeMs();
MS_LOG(INFO) << "OpenCl kernel " << kernel->name() << "(" << kernel->type_str()
<< ") execute time is: " << op_kernel->GetProfilingTimeMs() << "ms";
callbackParam.execute_time = execute_time;
}
}
ret = kernel->PostProcess();
if (ret != RET_OK) {

View File

@ -198,12 +198,29 @@ int WinogradOpenCLKernel::Run() {
ocl_runtime_->RunKernel(kernel_4x4to36_, global_4x4to36_, local_4x4to36_, nullptr, &event_);
MS_LOG(DEBUG) << "winograd kernel1 Running!";
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &event_);
ocl_runtime_->RunKernel(kernel_, global_range_, local_range_, nullptr, &kernel2_event_);
MS_LOG(DEBUG) << "winograd kernel2 Running!";
ocl_runtime_->SetKernelArg(kernel_36to4x4_, 1, out_tensors_.front()->data_c());
ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_, nullptr, &event_);
ocl_runtime_->RunKernel(kernel_36to4x4_, global_36to4x4_, local_36to4x4_, nullptr, &kernel3_event_);
return RET_OK;
}
double WinogradOpenCLKernel::GetProfilingTimeMs() {
if (!ocl_runtime_->isProfiling()) {
return MAX_PROFILING_TIME_MILLI_SECOND;
}
cl_ulong time_start;
cl_ulong time_end;
event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
cl_ulong time_ns = time_end - time_start;
kernel2_event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
kernel2_event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
time_ns += time_end - time_start;
kernel3_event_.getProfilingInfo(CL_PROFILING_COMMAND_START, &time_start);
kernel3_event_.getProfilingInfo(CL_PROFILING_COMMAND_END, &time_end);
time_ns += time_end - time_start;
return static_cast<double>(time_ns) * 1e-6;
}
} // namespace mindspore::kernel

View File

@ -39,6 +39,7 @@ class WinogradOpenCLKernel : public Conv2DOpenCLKernel {
std::vector<BaseTuningParameter> GenerateTuningParam() override { return {}; }
int Tune() override { return RET_OK; }
double GetProfilingTimeMs() override;
private:
void BuildKernel() override;
@ -47,8 +48,10 @@ class WinogradOpenCLKernel : public Conv2DOpenCLKernel {
cl::Kernel kernel_4x4to36_;
cl::Kernel kernel_36to4x4_;
cl::Event kernel2_event_;
cl::NDRange global_4x4to36_, local_4x4to36_;
cl::NDRange global_36to4x4_, local_36to4x4_;
cl::Event kernel3_event_;
void *winograd_mem0_{nullptr};
void *winograd_mem1_{nullptr};
};

View File

@ -195,7 +195,7 @@ class OpenCLKernel : public LiteKernel {
lite::opencl::MemType GetMemType() { return out_mem_type_; }
void SetMemType(lite::opencl::MemType mem_type) { out_mem_type_ = mem_type; }
OpParameter *GetParameter() { return op_parameter_; }
double GetProfilingTimeMs();
virtual double GetProfilingTimeMs();
int DequantWeight();
void FreeDequantedWeight();
virtual int InferShape();

View File

@ -18,6 +18,7 @@
#include <set>
#include <map>
#include <string>
#include <utility>
#include "src/runtime/gpu/opencl/opencl_executor.h"
#include "src/runtime/kernel/opencl/utils.h"
#include "src/runtime/kernel/opencl/kernel/to_format.h"
@ -467,4 +468,33 @@ int OpenCLSubGraph::Run() {
}
return RET_OK;
}
int OpenCLSubGraph::Run(const KernelCallBack &before, const KernelCallBack &after) {
  // Run the OpenCL subgraph with per-node before/after callbacks (used by
  // the benchmark tool to collect per-op profiling info).
  if (executor_ == nullptr) {
    MS_LOG(ERROR) << "executor is nullptr";
    return RET_ERROR;
  }
  // Input buffers may still be mapped for host access; unmap them so the
  // GPU kernels can read the data.
  for (auto &tensor : in_tensors_) {
    MS_ASSERT(tensor);
    if (tensor->data_c() == nullptr) {
      MS_LOG(ERROR) << "OpenCL subgraph input tensor data is null";
      return RET_ERROR;
    }
    // Declare at first use instead of one uninitialized `int ret;` shared
    // across the whole function.
    int unmap_ret = allocator_->UnmapBuffer(tensor->data_c());
    if (unmap_ret != RET_OK) {
      return unmap_ret;
    }
  }
  int ret = executor_->Run(in_tensors_, out_tensors_, nodes_, allocator_, before, after);
  if (ret != RET_OK) {
    MS_LOG(ERROR) << "Run opencl executor failed: " << ret;
    return ret;
  }
  // Block until all queued GPU work has completed before returning, so the
  // outputs are valid for the caller.
  if (!ocl_runtime_->SyncCommandQueue()) {
    return RET_ERROR;
  }
  return RET_OK;
}
} // namespace mindspore::kernel

View File

@ -46,7 +46,7 @@ class OpenCLSubGraph : public SubGraphKernel {
int ReSize() override;
int ReSize(bool interrupt);
int Run() override;
int Run(const KernelCallBack &before, const KernelCallBack &after) override { return this->Run(); };
int Run(const KernelCallBack &before, const KernelCallBack &after) override;
int InsertOpsPass();
bool IsSubGraphInferShapeDone();

View File

@ -568,10 +568,8 @@ int Benchmark::RunBenchmark() {
}
auto &cpu_device_ctx = context->device_list_[0];
if (flags_->cpu_bind_mode_ == MID_CPU) {
cpu_device_ctx.device_info_.cpu_device_info_.cpu_bind_mode_ = MID_CPU;
} else if (flags_->cpu_bind_mode_ == HIGHER_CPU) {
cpu_device_ctx.device_info_.cpu_device_info_.cpu_bind_mode_ = HIGHER_CPU;
if (flags_->cpu_bind_mode_ == MID_CPU || flags_->cpu_bind_mode_ == HIGHER_CPU) {
cpu_device_ctx.device_info_.cpu_device_info_.cpu_bind_mode_ = CpuBindMode(flags_->cpu_bind_mode_);
} else {
cpu_device_ctx.device_info_.cpu_device_info_.cpu_bind_mode_ = NO_BIND;
}
@ -611,9 +609,8 @@ int Benchmark::RunBenchmark() {
return ret;
}
}
if (model != nullptr) {
model->Free();
}
if (model != nullptr) model->Free();
ms_inputs_ = session_->GetInputs();
auto end_prepare_time = GetTimeUs();
MS_LOG(INFO) << "PrepareTime = " << (end_prepare_time - start_prepare_time) / 1000 << " ms";
@ -682,147 +679,161 @@ void BenchmarkFlags::InitResizeDimsList() {
}
}
int Benchmark::InitCallbackParameter() {
if (flags_->time_profiling_) {
// before callback
before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
const CallBackParam &callParam) {
if (before_inputs.empty()) {
MS_LOG(INFO) << "The num of beforeInputs is empty";
}
if (before_outputs.empty()) {
MS_LOG(INFO) << "The num of beforeOutputs is empty";
}
if (op_times_by_type_.find(callParam.node_type) == op_times_by_type_.end()) {
op_times_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, 0.0f)));
}
if (op_times_by_name_.find(callParam.node_name) == op_times_by_name_.end()) {
op_times_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, 0.0f)));
}
op_call_times_total_++;
op_begin_ = GetTimeUs();
return true;
};
// after callback
after_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
const CallBackParam &call_param) {
uint64_t opEnd = GetTimeUs();
if (after_inputs.empty()) {
MS_LOG(INFO) << "The num of after inputs is empty";
}
if (after_outputs.empty()) {
MS_LOG(INFO) << "The num of after outputs is empty";
}
float cost = static_cast<float>(opEnd - op_begin_) / 1000.0f;
op_cost_total_ += cost;
op_times_by_type_[call_param.node_type].first++;
op_times_by_type_[call_param.node_type].second += cost;
op_times_by_name_[call_param.node_name].first++;
op_times_by_name_[call_param.node_name].second += cost;
return true;
};
} else if (flags_->perf_profiling_) {
#ifndef ENABLE_ARM64
MS_LOG(ERROR) << "Only support perf_profiling on arm64.";
return RET_ERROR;
#else
struct perf_event_attr pe, pe2;
memset(&pe, 0, sizeof(struct perf_event_attr));
memset(&pe2, 0, sizeof(struct perf_event_attr));
pe.type = PERF_TYPE_HARDWARE;
pe2.type = PERF_TYPE_HARDWARE;
pe.size = sizeof(struct perf_event_attr);
pe2.size = sizeof(struct perf_event_attr);
pe.disabled = 1;
pe2.disabled = 1;
pe.exclude_kernel = 1; // don't count kernel
pe2.exclude_kernel = 1; // don't count kernel
pe.exclude_hv = 1; // don't count hypervisor
pe2.exclude_hv = 1; // don't count hypervisor
pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
pe2.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
if (flags_->perf_event_ == "CACHE") {
pe.config = PERF_COUNT_HW_CACHE_REFERENCES;
pe2.config = PERF_COUNT_HW_CACHE_MISSES;
} else if (flags_->perf_event_ == "STALL") {
pe.config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND;
pe2.config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND;
} else {
pe.config = PERF_COUNT_HW_CPU_CYCLES;
pe2.config = PERF_COUNT_HW_INSTRUCTIONS;
int Benchmark::InitTimeProfilingCallbackParameter() {
// before callback
before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
const CallBackParam &callParam) {
if (before_inputs.empty()) {
MS_LOG(INFO) << "The num of beforeInputs is empty";
}
perf_fd = syscall(__NR_perf_event_open, pe, 0, -1, -1, 0);
if (perf_fd == -1) {
MS_LOG(ERROR) << "Failed to open perf event " << pe.config;
return RET_ERROR;
if (before_outputs.empty()) {
MS_LOG(INFO) << "The num of beforeOutputs is empty";
}
perf_fd2 = syscall(__NR_perf_event_open, pe2, 0, -1, perf_fd, 0);
if (perf_fd2 == -1) {
MS_LOG(ERROR) << "Failed to open perf event " << pe2.config;
return RET_ERROR;
if (op_times_by_type_.find(callParam.node_type) == op_times_by_type_.end()) {
op_times_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, 0.0f)));
}
if (op_times_by_name_.find(callParam.node_name) == op_times_by_name_.end()) {
op_times_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, 0.0f)));
}
struct PerfCount zero;
zero.value[0] = 0;
zero.value[1] = 0;
// before callback
before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
const CallBackParam &callParam) {
if (before_inputs.empty()) {
MS_LOG(INFO) << "The num of beforeInputs is empty";
}
if (before_outputs.empty()) {
MS_LOG(INFO) << "The num of beforeOutputs is empty";
}
if (op_perf_by_type_.find(callParam.node_type) == op_perf_by_type_.end()) {
op_perf_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, zero)));
}
if (op_perf_by_name_.find(callParam.node_name) == op_perf_by_name_.end()) {
op_perf_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, zero)));
}
op_call_times_total_++;
ioctl(perf_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
return true;
};
op_call_times_total_++;
op_begin_ = GetTimeUs();
return true;
};
// after callback
after_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
const CallBackParam &call_param) {
struct PerfResult res;
ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
read(perf_fd, &res, sizeof(struct PerfResult));
// after callback
after_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
const CallBackParam &call_param) {
uint64_t opEnd = GetTimeUs();
if (after_inputs.empty()) {
MS_LOG(INFO) << "The num of after inputs is empty";
}
if (after_outputs.empty()) {
MS_LOG(INFO) << "The num of after outputs is empty";
}
float cost1 = static_cast<float>(res.values[0].value);
float cost2 = static_cast<float>(res.values[1].value);
op_cost_total_ += cost1;
op_cost2_total_ += cost2;
op_perf_by_type_[call_param.node_type].first++;
op_perf_by_type_[call_param.node_type].second.value[0] += cost1;
op_perf_by_type_[call_param.node_type].second.value[1] += cost2;
op_perf_by_name_[call_param.node_name].first++;
op_perf_by_name_[call_param.node_name].second.value[0] += cost1;
op_perf_by_name_[call_param.node_name].second.value[1] += cost2;
return true;
};
#endif
}
if (after_inputs.empty()) {
MS_LOG(INFO) << "The num of after inputs is empty";
}
if (after_outputs.empty()) {
MS_LOG(INFO) << "The num of after outputs is empty";
}
float cost = static_cast<float>(opEnd - op_begin_) / 1000.0f;
if (flags_->device_ == "GPU") {
auto gpu_param = reinterpret_cast<const GPUCallBackParam &>(call_param);
cost = static_cast<float>(gpu_param.execute_time);
}
op_cost_total_ += cost;
op_times_by_type_[call_param.node_type].first++;
op_times_by_type_[call_param.node_type].second += cost;
op_times_by_name_[call_param.node_name].first++;
op_times_by_name_[call_param.node_name].second += cost;
return true;
};
return RET_OK;
}
int Benchmark::InitPerfProfilingCallbackParameter() {
// Install before/after kernel callbacks that read two hardware PMU counters
// per op via Linux perf_event_open. Only supported on arm64 builds.
#ifndef ENABLE_ARM64
MS_LOG(ERROR) << "Only support perf_profiling on arm64.";
return RET_ERROR;
#else
// pe is the perf event group leader; pe2 joins the same group below so both
// counters are reset/enabled/disabled/read together as one unit.
struct perf_event_attr pe, pe2;
memset(&pe, 0, sizeof(struct perf_event_attr));
memset(&pe2, 0, sizeof(struct perf_event_attr));
pe.type = PERF_TYPE_HARDWARE;
pe2.type = PERF_TYPE_HARDWARE;
pe.size = sizeof(struct perf_event_attr);
pe2.size = sizeof(struct perf_event_attr);
// Counters start disabled; counting is switched on per-op in the before
// callback and off again in the after callback.
pe.disabled = 1;
pe2.disabled = 1;
pe.exclude_kernel = 1; // don't count kernel
pe2.exclude_kernel = 1; // don't count kernel
pe.exclude_hv = 1; // don't count hypervisor
pe2.exclude_hv = 1; // don't count hypervisor
// Group read format: one read() returns both counter values plus IDs.
pe.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
pe2.read_format = PERF_FORMAT_GROUP | PERF_FORMAT_ID;
// Select the counter pair from the --perfEvent flag; the default pair is
// CPU cycles + retired instructions.
if (flags_->perf_event_ == "CACHE") {
pe.config = PERF_COUNT_HW_CACHE_REFERENCES;
pe2.config = PERF_COUNT_HW_CACHE_MISSES;
} else if (flags_->perf_event_ == "STALL") {
pe.config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND;
pe2.config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND;
} else {
pe.config = PERF_COUNT_HW_CPU_CYCLES;
pe2.config = PERF_COUNT_HW_INSTRUCTIONS;
}
// NOTE(review): perf_event_open(2) takes a *pointer* to perf_event_attr;
// passing the struct by value to syscall() here looks wrong — confirm this
// works on the target ABI or should be &pe / &pe2.
perf_fd = syscall(__NR_perf_event_open, pe, 0, -1, -1, 0);
if (perf_fd == -1) {
MS_LOG(ERROR) << "Failed to open perf event " << pe.config;
return RET_ERROR;
}
// group_fd = perf_fd makes the second counter a member of pe's group.
perf_fd2 = syscall(__NR_perf_event_open, pe2, 0, -1, perf_fd, 0);
if (perf_fd2 == -1) {
MS_LOG(ERROR) << "Failed to open perf event " << pe2.config;
return RET_ERROR;
}
// Zero-initialized counter pair used when first inserting an op into the
// per-type / per-name accumulation maps.
struct PerfCount zero;
zero.value[0] = 0;
zero.value[1] = 0;
// before callback
before_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &before_inputs,
const std::vector<mindspore::tensor::MSTensor *> &before_outputs,
const CallBackParam &callParam) {
if (before_inputs.empty()) {
MS_LOG(INFO) << "The num of beforeInputs is empty";
}
if (before_outputs.empty()) {
MS_LOG(INFO) << "The num of beforeOutputs is empty";
}
// Ensure accumulation entries exist for this op's type and name.
if (op_perf_by_type_.find(callParam.node_type) == op_perf_by_type_.end()) {
op_perf_by_type_.insert(std::make_pair(callParam.node_type, std::make_pair(0, zero)));
}
if (op_perf_by_name_.find(callParam.node_name) == op_perf_by_name_.end()) {
op_perf_by_name_.insert(std::make_pair(callParam.node_name, std::make_pair(0, zero)));
}
op_call_times_total_++;
// Reset then enable the whole counter group just before the op runs.
ioctl(perf_fd, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
ioctl(perf_fd, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
return true;
};
// after callback
after_call_back_ = [&](const std::vector<mindspore::tensor::MSTensor *> &after_inputs,
const std::vector<mindspore::tensor::MSTensor *> &after_outputs,
const CallBackParam &call_param) {
// Stop counting first so the read reflects only this op's execution.
struct PerfResult res;
ioctl(perf_fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
read(perf_fd, &res, sizeof(struct PerfResult));
if (after_inputs.empty()) {
MS_LOG(INFO) << "The num of after inputs is empty";
}
if (after_outputs.empty()) {
MS_LOG(INFO) << "The num of after outputs is empty";
}
// Accumulate both counter values into totals and per-type / per-name maps.
float cost1 = static_cast<float>(res.values[0].value);
float cost2 = static_cast<float>(res.values[1].value);
op_cost_total_ += cost1;
op_cost2_total_ += cost2;
op_perf_by_type_[call_param.node_type].first++;
op_perf_by_type_[call_param.node_type].second.value[0] += cost1;
op_perf_by_type_[call_param.node_type].second.value[1] += cost2;
op_perf_by_name_[call_param.node_name].first++;
op_perf_by_name_[call_param.node_name].second.value[0] += cost1;
op_perf_by_name_[call_param.node_name].second.value[1] += cost2;
return true;
};
#endif
return RET_OK;
}
int Benchmark::InitCallbackParameter() {
  // Dispatch to the profiling mode selected on the command line; when
  // neither flag is set, no callbacks are installed and this is a no-op.
  if (flags_->time_profiling_) {
    return InitTimeProfilingCallbackParameter();
  }
  if (flags_->perf_profiling_) {
    return InitPerfProfilingCallbackParameter();
  }
  return RET_OK;
}
int Benchmark::Init() {
if (this->flags_ == nullptr) {
@ -859,13 +870,10 @@ int Benchmark::Init() {
std::cerr << "numThreads:" << this->flags_->num_threads_ << " must be greater than 0" << std::endl;
return RET_ERROR;
}
if (this->flags_->cpu_bind_mode_ == 2) {
MS_LOG(INFO) << "cpuBindMode = MID_CPU";
std::cout << "cpuBindMode = MID_CPU" << std::endl;
} else if (this->flags_->cpu_bind_mode_ == 1) {
MS_LOG(INFO) << "cpuBindMode = HIGHER_CPU";
std::cout << "cpuBindMode = HIGHER_CPU" << std::endl;
static std::vector<std::string> CPU_BIND_MODE_MAP = {"NO_BIND", "HIGHER_CPU", "MID_CPU"};
if (this->flags_->cpu_bind_mode_ >= 1) {
MS_LOG(INFO) << "cpuBindMode = " << CPU_BIND_MODE_MAP[this->flags_->cpu_bind_mode_];
std::cout << "cpuBindMode = " << CPU_BIND_MODE_MAP[this->flags_->cpu_bind_mode_] << std::endl;
} else {
MS_LOG(INFO) << "cpuBindMode = NO_BIND";
std::cout << "cpuBindMode = NO_BIND" << std::endl;

View File

@ -163,6 +163,8 @@ class MS_API Benchmark {
int *total_size);
int InitCallbackParameter();
int InitTimeProfilingCallbackParameter();
int InitPerfProfilingCallbackParameter();
int PrintResult(const std::vector<std::string> &title, const std::map<std::string, std::pair<int, float>> &result);