!17827 mindrt parallel ut test

Merge pull request !17827 from ling/pr
commit d27205ad6c
i-robot, 2021-06-09 18:15:31 +08:00, committed by Gitee
10 changed files with 53 additions and 96 deletions

View File

@@ -40,13 +40,6 @@ class InnerKernel : public Kernel {
              const lite::Context *ctx)
       : op_parameter_(parameter), in_tensors_(std::move(in_tensors)), out_tensors_(std::move(out_tensors)) {
     context_ = ctx;
-    if (parameter != nullptr && parameter->thread_num_ == 0) {
-      if (ctx != nullptr) {
-        op_parameter_->thread_num_ = ctx->thread_num_;
-      } else {
-        op_parameter_->thread_num_ = 1;
-      }
-    }
   }
   virtual ~InnerKernel() {
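With this hunk the constructor no longer defaults thread_num_, so whichever code allocates an OpParameter is responsible for filling the field in itself; the NPUPassUtils hunks further down do exactly that with ctx->thread_num_. A self-contained sketch of the caller-side pattern, using toy stand-ins rather than the real MindSpore Lite types:

#include <cstdlib>
#include <cstring>

// Toy stand-ins for OpParameter and the lite context; the real definitions
// live in the MindSpore Lite headers.
struct OpParameter {
  int type_;
  int thread_num_;
};
struct Context {
  int thread_num_;
};

// The allocator of the parameter now owns the thread_num_ decision;
// InnerKernel no longer patches it up after the fact.
OpParameter *MakeParameter(const Context *ctx) {
  auto *param = static_cast<OpParameter *>(malloc(sizeof(OpParameter)));
  if (param == nullptr) {
    return nullptr;
  }
  memset(param, 0, sizeof(OpParameter));
  param->thread_num_ = (ctx != nullptr) ? ctx->thread_num_ : 1;  // defaulting, if any, happens here
  return param;
}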

View File

@@ -28,24 +28,12 @@ namespace mindspore::lite {
 void LiteOpActor::RunOpData(OpData<lite::Tensor> *inputs, OpContext<lite::Tensor> *context) {
   auto op_uuid = context->sequential_num_;
   input_op_datas_[op_uuid].push_back(inputs);
   inputs_data_[inputs->index_] = inputs->data_;
-  /* in-case infershape done in runtime */
-  kernel_->in_tensors()[inputs->index_]->set_shape(inputs->data_->shape());
-  kernel_->in_tensors()[inputs->index_]->set_format(inputs->data_->format());
   if (input_op_datas_[op_uuid].size() < kernel_->in_tensors().size()) {
     return;
   }
-  auto ret = CheckInputData();
-  if (ret != RET_OK) {
-    input_op_datas_.erase(op_uuid);
-    context->SetFailed(ret);
-    return;
-  }
-  ret = SetInputData();
+  auto ret = SetInputData();
   if (ret != RET_OK) {
     input_op_datas_.erase(op_uuid);
     context->SetFailed(ret);
@@ -87,7 +75,7 @@ void LiteOpActor::IsolateInputData(std::vector<std::shared_ptr<LiteOpActor>> *ac
     for (QuantArg quant : old_tensor->quant_params()) {
       new_tensor->AddQuantParam(quant);
     }
-    isolate_input_map_.insert(std::make_pair(old_tensor, new_tensor));
+    isolate_input_map_.insert(std::make_pair(new_tensor, old_tensor));
     int ref_count = 0;
     /* set op input for calculate */
@@ -126,8 +114,8 @@ int LiteOpActor::LiteActorInit(std::vector<std::shared_ptr<LiteOpActor>> *actors
 int LiteOpActor::ResizeGraphInput(const std::vector<mindspore::tensor::MSTensor *> &inputs,
                                   const std::vector<std::vector<int>> &dims) {
   for (auto map : isolate_input_map_) {
-    auto src_tensor = map.first;
-    auto isolate_tensor = map.second;
+    auto isolate_tensor = map.first;
+    auto src_tensor = map.second;
     for (size_t i = 0; i < inputs.size(); i++) {
       if (src_tensor == inputs[i]) {
         isolate_tensor->set_shape(dims[i]);
@@ -225,23 +213,6 @@ int LiteOpActor::CompileArrow() {
   return ret;
 }
-int LiteOpActor::CheckInputData() {
-  if (kernel_->in_tensors().size() != inputs_data_.size()) {
-    MS_LOG(ERROR) << "kernel:" << kernel_->name() << " inputs_data_.size(): " << inputs_data_.size()
-                  << " vs kernel_->in_tensors().size(): " << kernel_->in_tensors().size() << " are not equal.";
-    return RET_PARAM_INVALID;
-  }
-  for (size_t i = 0; i < inputs_data_.size(); ++i) {
-    if (kernel_->in_tensors()[i]->shape() != inputs_data_[i]->shape()) {
-      MS_LOG(ERROR) << "inputs_data_[" << i << "].shape: " << inputs_data_[i]->shape() << " vs kernel_->in_tensors()["
-                    << i << "].shape: " << kernel_->in_tensors()[i]->shape() << " are not equal.";
-      return RET_PARAM_INVALID;
-    }
-  }
-  return RET_OK;
-}
 void LiteOpActor::MoveInputData(Tensor *dst_tensor, Tensor *src_tensor) {
   MS_ASSERT(src_tensor != dst_tensor);
@@ -302,6 +273,11 @@ int LiteOpActor::SetInputData() {
   for (size_t i = 0; i < inputs_data_.size(); ++i) {
     auto dst_tensor = kernel_->in_tensors()[i];
     auto src_tensor = inputs_data_[i];
+    /* infershape done in runtime */
+    dst_tensor->set_shape(src_tensor->shape());
+    dst_tensor->set_format(src_tensor->format());
     if (src_tensor->data_type() != dst_tensor->data_type()) {
       CopyInputData(dst_tensor, src_tensor);
     } else {
@@ -567,10 +543,7 @@ void LiteSwitchOpActor::AsyncFalseBranchOutput(OpContext<Tensor> *context) {
   }
 }
-int MindrtInit(bool enable_parallel) {
-  int thread_count = enable_parallel ? 2 : 1;
-  return mindspore::Initialize("tcp://127.0.0.1:8080", "", "", "", thread_count);
-}
+int MindrtInit() { return mindspore::Initialize("tcp://127.0.0.1:8080", "", "", ""); }
 void MindrtTerminate(const std::vector<std::shared_ptr<LiteOpActor>> &actor_list) {
   for (const auto &actor : actor_list) {
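The two map.first/map.second swaps above, together with the changed insert in IsolateInputData, reverse the orientation of isolate_input_map_: the isolated calculate-tensor is now the key and the original graph input the value, which is what the destructor and ResizeGraphInput assume. A self-contained sketch of that orientation, with a toy Tensor in place of lite::Tensor:

#include <unordered_map>
#include <vector>

// Toy stand-in for lite::Tensor, just enough to show the map orientation.
struct Tensor {
  std::vector<int> shape;
  void set_shape(const std::vector<int> &s) { shape = s; }
};

int main() {
  // <calculate-tensor, src-input-tensor>: the isolated copy is the key.
  std::unordered_map<Tensor *, Tensor *> isolate_input_map;

  Tensor src;                        // original graph input, owned elsewhere
  auto *isolated = new Tensor(src);  // isolated copy used for calculation
  isolate_input_map.insert(std::make_pair(isolated, &src));

  // Resize: find the entry by the original input, resize the isolated copy,
  // mirroring ResizeGraphInput in the hunk above.
  for (auto &entry : isolate_input_map) {
    if (entry.second == &src) {
      entry.first->set_shape({1, 224, 224, 3});
    }
  }

  // Teardown: each key is a uniquely owned isolated tensor, so deleting the
  // keys (as ~LiteOpActor now does with map.first) frees each copy exactly once.
  for (auto &entry : isolate_input_map) {
    delete entry.first;
  }
  return 0;
}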

View File

@@ -42,7 +42,7 @@ class LiteOpActor : public OpActor<lite::Tensor> {
   }
   ~LiteOpActor() override {
     for (auto map : isolate_input_map_) {
-      auto isolate_input_tensor = map.second;
+      auto isolate_input_tensor = map.first;
       isolate_input_tensor->set_data(nullptr);
       delete isolate_input_tensor;
     }
@@ -67,7 +67,6 @@ class LiteOpActor : public OpActor<lite::Tensor> {
   void SetPartialMap(const std::unordered_map<size_t, AID> &partial_map) { subgraph_index_to_actor = partial_map; }
 protected:
-  int CheckInputData();
   int SetInputData();
   void SetOutputData(OpContext<Tensor> *context);
   void AsyncOutput(OpContext<Tensor> *context);
@@ -89,7 +88,7 @@ class LiteOpActor : public OpActor<lite::Tensor> {
 private:
   kernel::LiteKernel *partial_node_ = nullptr;
   kernel::LiteKernel *call_node_ = nullptr;
-  std::unordered_map<Tensor *, Tensor *> isolate_input_map_;
+  std::unordered_map<Tensor *, Tensor *> isolate_input_map_; /* <calculate-tensor, src-input-tensor> */
 };
 class LiteSwitchOpActor : public LiteOpActor {
@@ -104,14 +103,7 @@ class LiteSwitchOpActor : public LiteOpActor {
       return;
     }
-    int ret = CheckInputData();
-    if (ret != RET_OK) {
-      input_op_datas_.erase(op_uuid);
-      context->SetFailed(ret);
-      return;
-    }
-    ret = SetInputData();
+    auto ret = SetInputData();
     if (ret != RET_OK) {
       input_op_datas_.erase(op_uuid);
       context->SetFailed(ret);
@@ -182,7 +174,7 @@ class LiteSwitchOpActor : public LiteOpActor {
   std::vector<OpDataPtr<Tensor>> false_branch_outputs_data_;
 };
-int MindrtInit(bool subgraph_split = false);
+int MindrtInit();
 void MindrtTerminate(const std::vector<std::shared_ptr<LiteOpActor>> &);
 std::vector<std::shared_ptr<LiteOpActor>> CreateOpActor(const std::vector<kernel::LiteKernel *> &kernels,

View File

@@ -77,9 +77,7 @@ int MindrtExecutor::Resize(const std::vector<mindspore::tensor::MSTensor *> &inp
 int MindrtExecutor::Prepare(const std::vector<kernel::LiteKernel *> &kernels, const std::vector<Tensor *> &inputs,
                             const std::vector<Tensor *> &outputs, const lite::InnerContext *ctx) {
-  MS_ASSERT(kernels.size() != 0);
-  auto ret = MindrtInit(ctx->enable_parallel_);
+  auto ret = MindrtInit();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "MindrtInit failed";
     return ret;

View File

@@ -40,6 +40,7 @@ kernel::LiteKernel *NPUPassUtils::CreateNchw2NhwcKernel(const std::vector<Tensor
   }
   memset(transpose_param, 0, sizeof(TransposeParameter));
   transpose_param->op_parameter_.type_ = schema::PrimitiveType_Transpose;
+  transpose_param->op_parameter_.thread_num_ = ctx->thread_num_;
   transpose_param->perm_[0] = 0;
   transpose_param->perm_[1] = 2;
   transpose_param->perm_[2] = 3;
@@ -76,6 +77,7 @@ kernel::LiteKernel *NPUPassUtils::CreateNhwc2NchwKernel(const std::vector<Tensor
   }
   memset(transpose_param, 0, sizeof(TransposeParameter));
   transpose_param->op_parameter_.type_ = schema::PrimitiveType_Transpose;
+  transpose_param->op_parameter_.thread_num_ = ctx->thread_num_;
   transpose_param->perm_[0] = 0;
   transpose_param->perm_[1] = 3;
   transpose_param->perm_[2] = 1;

View File

@@ -88,13 +88,7 @@ int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
   if (context_->enable_parallel_) {
     auto search_sub_graph =
       SearchSubGraph(context_, src_model_, src_tensors_, &op_parameters_, &graph_output_node_indexes_);
-    bool offline_parallel_enable = src_model_->all_nodes_.front()->device_type_ != kDefaultDeviceType;
-    if (offline_parallel_enable) {
-      search_sub_graph.SubGraphSplitByOffLineParallel();
-    } else {
-      search_sub_graph.SubGraphSplitByOutput();
-    }
+    search_sub_graph.SubGraphSplit();
   }
   ret = ScheduleSubGraphToKernels(kMainSubGraphIndex, dst_kernels, nullptr, nullptr);
@@ -111,14 +105,7 @@ int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
     }
   }
   FindAllInoutKernels(*dst_kernels);
-  // origin kernel init
-  for (size_t i = 0; i < dst_kernels->size(); i++) {
-    ret = (*dst_kernels)[i]->Init();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Kernel " << (*dst_kernels)[i]->name() << " Init failed.";
-      return ret;
-    }
-  }
   ret = RunPass(dst_kernels);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Schedule run pass failed.";
@@ -679,13 +666,6 @@ kernel::LiteKernel *Scheduler::SchedulePartialToKernel(const lite::Model::Node *
     MS_LOG(ERROR) << "Schedule partial failed, name: " << src_node->name_;
     return nullptr;
   }
-  for (auto kernel : sub_kernels) {
-    ret = kernel->Init();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Schedule partial kernel init failed, name: " << kernel->name();
-      return nullptr;
-    }
-  }
   FindAllInoutKernels(sub_kernels);
   ret = RunPass(&sub_kernels);
@@ -725,6 +705,7 @@ int Scheduler::ScheduleSubGraphToKernels(size_t subgraph_index, std::vector<kern
   MS_ASSERT(dst_kernels->empty());
   auto subgraph = src_model_->sub_graphs_.at(subgraph_index);
   for (auto node_index : subgraph->node_indices_) {
+    auto ret = RET_OK;
     auto node = src_model_->all_nodes_[node_index];
     MS_ASSERT(node != nullptr);
     auto *primitive = node->primitive_;
@@ -735,8 +716,11 @@ int Scheduler::ScheduleSubGraphToKernels(size_t subgraph_index, std::vector<kern
       kernel = SchedulePartialToKernel(node);
     } else { // kernel
       kernel = ScheduleNodeToKernel(node, prefer_data_type);
+      if (kernel != nullptr) {
+        ret = kernel->Init();
+      }
     }
-    if (kernel == nullptr) {
+    if (kernel == nullptr || ret != RET_OK) {
      MS_LOG(ERROR) << "FindBackendKernel return nullptr, name: " << node->name_
                    << ", type: " << PrimitiveTypeName(prim_type);
      return RET_ERROR;
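Net effect of the scheduler hunks: the two after-the-fact Init() sweeps are gone, and each kernel is initialized once, immediately after ScheduleNodeToKernel creates it (partial subgraphs handle their own nodes inside SchedulePartialToKernel). A condensed, self-contained sketch of the new per-node flow, with toy types standing in for the real scheduler classes:

#include <vector>

// Toy stand-ins; only the control flow matters here.
enum { RET_OK = 0, RET_ERROR = 1 };
struct Node { bool is_partial; };
struct Kernel { int Init() { return RET_OK; } };

Kernel *SchedulePartialToKernel(Node *) { return new Kernel(); }  // inits its own sub-kernels
Kernel *ScheduleNodeToKernel(Node *) { return new Kernel(); }

// New flow: Init happens right after a kernel is created, and a failed Init
// aborts scheduling at the failing node instead of in a later sweep.
int ScheduleOneNode(Node *node, std::vector<Kernel *> *dst_kernels) {
  int ret = RET_OK;
  Kernel *kernel = nullptr;
  if (node->is_partial) {
    kernel = SchedulePartialToKernel(node);
  } else {
    kernel = ScheduleNodeToKernel(node);
    if (kernel != nullptr) {
      ret = kernel->Init();
    }
  }
  if (kernel == nullptr || ret != RET_OK) {
    return RET_ERROR;
  }
  dst_kernels->push_back(kernel);
  return RET_OK;
}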

View File

@@ -559,4 +559,13 @@ void SearchSubGraph::SubGraphSplitByOffLineParallel() {
   MS_LOG(DEBUG) << "end to split offline parallel subgraph";
 }
+void SearchSubGraph::SubGraphSplit() {
+  bool offline_parallel_enable = model_->all_nodes_.front()->device_type_ != kDefaultDeviceType;
+  if (offline_parallel_enable) {
+    SubGraphSplitByOffLineParallel();
+  } else {
+    SubGraphSplitByOutput();
+  }
+  return;
+}
 }  // namespace mindspore::lite
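Design note: moving the device_type_ check into SearchSubGraph::SubGraphSplit keeps the strategy choice next to the split implementations, so Scheduler::Schedule (see the scheduler.cc hunk above) calls a single public entry point and the ByOutput/ByMiddle/ByOffLineParallel variants can drop to private visibility, as the header hunk below shows.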

View File

@@ -73,14 +73,19 @@ class SearchSubGraph {
   ~SearchSubGraph() = default;
 public:
-  void SubGraphSplitByOutput();
-  void SubGraphSplitByMiddle();
-  void SubGraphSplitByOffLineParallel();
+  void SubGraphSplit();
 private:
+  void SubGraphSplitByOutput();
   void InitSearchSubGraphByOutput();
+private:
+  void SubGraphSplitByMiddle();
   void InitSearchSubGraphByMiddle();
+private:
+  void SubGraphSplitByOffLineParallel();
 private:
   void InitSearchTensor();
   void InitSearchParallelSubGraph();

View File

@@ -1,4 +1,5 @@
 # mindrt enable parallel model
 # model run both CPU-CPU & GPU-CPU
 # model_file ### accuracy_limit ### enable_fp16(true or false)
-mtk_model_normalize_object_scene_ps_20200519_f32.tflite 0.5 false
+mtk_model_normalize_object_scene_ps_20200519_f32.tflite;0.5;false
+# end
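The model entry switches from space-separated to semicolon-separated fields, matching the model_file;accuracy_limit;enable_fp16 layout the comment describes. A minimal sketch of splitting one such entry (a hypothetical helper, not the test harness's actual parser):

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Split one config entry of the form model_file;accuracy_limit;enable_fp16.
std::vector<std::string> SplitConfigLine(const std::string &line) {
  std::vector<std::string> fields;
  std::stringstream ss(line);
  std::string field;
  while (std::getline(ss, field, ';')) {
    fields.push_back(field);
  }
  return fields;
}

int main() {
  auto fields = SplitConfigLine("mtk_model_normalize_object_scene_ps_20200519_f32.tflite;0.5;false");
  std::cout << "model=" << fields[0] << " limit=" << fields[1] << " fp16=" << fields[2] << "\n";
  return 0;
}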

View File

@@ -277,17 +277,17 @@ function Run_mindrt_parallel() {
         data_path="/data/local/tmp/input_output/"
         output=${data_path}'output/'${model_name}'.ms.out'
-        input=${model_name}.ms.bin
+        input=${data_path}'input/'${model_name}'.ms.bin'
         model=${model_name}'.ms'
         echo ${model_name} >> "${run_parallel_log_file}"
         echo "run mindrt parallel test : ${model_name}"
-        ########## RUN CPU-GPU parallel
+        ########## RUN CPU-CPU parallel
         echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo './benchmark --enableParallel=true --device=GPU --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> adb_run_cmd.txt
-        echo './benchmark --enableParallel=true --device=GPU --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> "${run_parallel_log_file}"
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test' >> adb_run_cmd.txt
+        echo './benchmark --enableParallel=true --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> adb_run_cmd.txt
+        echo './benchmark --enableParallel=true --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> "${run_parallel_log_file}"
         adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_parallel_log_file}"
         if [ $? = 0 ]; then
@@ -296,12 +296,12 @@ function Run_mindrt_parallel() {
             run_result='mindrt_parallel_CPU_GPU: '${model_name}' failed'; echo ${run_result} >> ${run_parallel_result_file}; return 1
         fi
-        ########## RUN CPU-CPU parallel
+        ########## RUN CPU-GPU parallel
         echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo './benchmark --enableParallel=true --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> adb_run_cmd.txt
-        echo './benchmark --enableParallel=true --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> "${run_parallel_log_file}"
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test' >> adb_run_cmd.txt
+        echo './benchmark --enableParallel=true --device=GPU --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> adb_run_cmd.txt
+        echo './benchmark --enableParallel=true --device=GPU --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> "${run_parallel_log_file}"
         adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_parallel_log_file}"
         if [ $? = 0 ]; then
@@ -486,7 +486,7 @@ if [[ $backend == "all" || $backend == "gpu" ]]; then
     if [[ ${Run_mindrt_parallel_status} != 0 ]];then
         echo "Run_mindrt_parallel failed"
-        cat ${run_gpu_log_file}
+        cat ${run_parallel_log_file}
     fi
     echo "Run_parallel is ended"