diff --git a/mindspore/lite/src/inner_kernel.h b/mindspore/lite/src/inner_kernel.h index d91218f584b..d7be21a01b3 100644 --- a/mindspore/lite/src/inner_kernel.h +++ b/mindspore/lite/src/inner_kernel.h @@ -40,13 +40,6 @@ class InnerKernel : public Kernel { const lite::Context *ctx) : op_parameter_(parameter), in_tensors_(std::move(in_tensors)), out_tensors_(std::move(out_tensors)) { context_ = ctx; - if (parameter != nullptr && parameter->thread_num_ == 0) { - if (ctx != nullptr) { - op_parameter_->thread_num_ = ctx->thread_num_; - } else { - op_parameter_->thread_num_ = 1; - } - } } virtual ~InnerKernel() { diff --git a/mindspore/lite/src/lite_mindrt.cc b/mindspore/lite/src/lite_mindrt.cc index 05db2fbc29d..9eaf08c410f 100644 --- a/mindspore/lite/src/lite_mindrt.cc +++ b/mindspore/lite/src/lite_mindrt.cc @@ -28,24 +28,12 @@ namespace mindspore::lite { void LiteOpActor::RunOpData(OpData *inputs, OpContext *context) { auto op_uuid = context->sequential_num_; input_op_datas_[op_uuid].push_back(inputs); - inputs_data_[inputs->index_] = inputs->data_; - /* in-case infershape done in runtime */ - kernel_->in_tensors()[inputs->index_]->set_shape(inputs->data_->shape()); - kernel_->in_tensors()[inputs->index_]->set_format(inputs->data_->format()); - if (input_op_datas_[op_uuid].size() < kernel_->in_tensors().size()) { return; } - auto ret = CheckInputData(); - if (ret != RET_OK) { - input_op_datas_.erase(op_uuid); - context->SetFailed(ret); - return; - } - - ret = SetInputData(); + auto ret = SetInputData(); if (ret != RET_OK) { input_op_datas_.erase(op_uuid); context->SetFailed(ret); @@ -87,7 +75,7 @@ void LiteOpActor::IsolateInputData(std::vector> *ac for (QuantArg quant : old_tensor->quant_params()) { new_tensor->AddQuantParam(quant); } - isolate_input_map_.insert(std::make_pair(old_tensor, new_tensor)); + isolate_input_map_.insert(std::make_pair(new_tensor, old_tensor)); int ref_count = 0; /* set op input for calculate */ @@ -126,8 +114,8 @@ int LiteOpActor::LiteActorInit(std::vector> *actors int LiteOpActor::ResizeGraphInput(const std::vector &inputs, const std::vector> &dims) { for (auto map : isolate_input_map_) { - auto src_tensor = map.first; - auto isolate_tensor = map.second; + auto isolate_tensor = map.first; + auto src_tensor = map.second; for (size_t i = 0; i < inputs.size(); i++) { if (src_tensor == inputs[i]) { isolate_tensor->set_shape(dims[i]); @@ -225,23 +213,6 @@ int LiteOpActor::CompileArrow() { return ret; } -int LiteOpActor::CheckInputData() { - if (kernel_->in_tensors().size() != inputs_data_.size()) { - MS_LOG(ERROR) << "kernel:" << kernel_->name() << " inputs_data_.size(): " << inputs_data_.size() - << " vs kernel_->in_tensors().size(): " << kernel_->in_tensors().size() << " are not equal."; - return RET_PARAM_INVALID; - } - - for (size_t i = 0; i < inputs_data_.size(); ++i) { - if (kernel_->in_tensors()[i]->shape() != inputs_data_[i]->shape()) { - MS_LOG(ERROR) << "inputs_data_[" << i << "].shape: " << inputs_data_[i]->shape() << " vs kernel_->in_tensors()[" - << i << "].shape: " << kernel_->in_tensors()[i]->shape() << " are not equal."; - return RET_PARAM_INVALID; - } - } - return RET_OK; -} - void LiteOpActor::MoveInputData(Tensor *dst_tensor, Tensor *src_tensor) { MS_ASSERT(src_tensor != dst_tensor); @@ -302,6 +273,11 @@ int LiteOpActor::SetInputData() { for (size_t i = 0; i < inputs_data_.size(); ++i) { auto dst_tensor = kernel_->in_tensors()[i]; auto src_tensor = inputs_data_[i]; + + /* infershape done in runtime */ + dst_tensor->set_shape(src_tensor->shape()); + dst_tensor->set_format(src_tensor->format()); + if (src_tensor->data_type() != dst_tensor->data_type()) { CopyInputData(dst_tensor, src_tensor); } else { @@ -567,10 +543,7 @@ void LiteSwitchOpActor::AsyncFalseBranchOutput(OpContext *context) { } } -int MindrtInit(bool enable_parallel) { - int thread_count = enable_parallel ? 2 : 1; - return mindspore::Initialize("tcp://127.0.0.1:8080", "", "", "", thread_count); -} +int MindrtInit() { return mindspore::Initialize("tcp://127.0.0.1:8080", "", "", ""); } void MindrtTerminate(const std::vector> &actor_list) { for (const auto &actor : actor_list) { diff --git a/mindspore/lite/src/lite_mindrt.h b/mindspore/lite/src/lite_mindrt.h index 9de49171450..e3d70630b11 100644 --- a/mindspore/lite/src/lite_mindrt.h +++ b/mindspore/lite/src/lite_mindrt.h @@ -42,7 +42,7 @@ class LiteOpActor : public OpActor { } ~LiteOpActor() override { for (auto map : isolate_input_map_) { - auto isolate_input_tensor = map.second; + auto isolate_input_tensor = map.first; isolate_input_tensor->set_data(nullptr); delete isolate_input_tensor; } @@ -67,7 +67,6 @@ class LiteOpActor : public OpActor { void SetPartialMap(const std::unordered_map &partial_map) { subgraph_index_to_actor = partial_map; } protected: - int CheckInputData(); int SetInputData(); void SetOutputData(OpContext *context); void AsyncOutput(OpContext *context); @@ -89,7 +88,7 @@ class LiteOpActor : public OpActor { private: kernel::LiteKernel *partial_node_ = nullptr; kernel::LiteKernel *call_node_ = nullptr; - std::unordered_map isolate_input_map_; + std::unordered_map isolate_input_map_; /* */ }; class LiteSwitchOpActor : public LiteOpActor { @@ -104,14 +103,7 @@ class LiteSwitchOpActor : public LiteOpActor { return; } - int ret = CheckInputData(); - if (ret != RET_OK) { - input_op_datas_.erase(op_uuid); - context->SetFailed(ret); - return; - } - - ret = SetInputData(); + auto ret = SetInputData(); if (ret != RET_OK) { input_op_datas_.erase(op_uuid); context->SetFailed(ret); @@ -182,7 +174,7 @@ class LiteSwitchOpActor : public LiteOpActor { std::vector> false_branch_outputs_data_; }; -int MindrtInit(bool subgraph_split = false); +int MindrtInit(); void MindrtTerminate(const std::vector> &); std::vector> CreateOpActor(const std::vector &kernels, diff --git a/mindspore/lite/src/mindrt_executor.cc b/mindspore/lite/src/mindrt_executor.cc index 7efb2d7f75a..1636c42678b 100644 --- a/mindspore/lite/src/mindrt_executor.cc +++ b/mindspore/lite/src/mindrt_executor.cc @@ -77,9 +77,7 @@ int MindrtExecutor::Resize(const std::vector &inp int MindrtExecutor::Prepare(const std::vector &kernels, const std::vector &inputs, const std::vector &outputs, const lite::InnerContext *ctx) { - MS_ASSERT(kernels.size() != 0); - - auto ret = MindrtInit(ctx->enable_parallel_); + auto ret = MindrtInit(); if (ret != RET_OK) { MS_LOG(ERROR) << "MindrtInit failed"; return ret; diff --git a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.cc b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.cc index cad7a82fa36..dfc7274f7c1 100644 --- a/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.cc +++ b/mindspore/lite/src/runtime/agent/npu/optimizer/npu_pass_utils.cc @@ -40,6 +40,7 @@ kernel::LiteKernel *NPUPassUtils::CreateNchw2NhwcKernel(const std::vectorop_parameter_.type_ = schema::PrimitiveType_Transpose; + transpose_param->op_parameter_.thread_num_ = ctx->thread_num_; transpose_param->perm_[0] = 0; transpose_param->perm_[1] = 2; transpose_param->perm_[2] = 3; @@ -76,6 +77,7 @@ kernel::LiteKernel *NPUPassUtils::CreateNhwc2NchwKernel(const std::vectorop_parameter_.type_ = schema::PrimitiveType_Transpose; + transpose_param->op_parameter_.thread_num_ = ctx->thread_num_; transpose_param->perm_[0] = 0; transpose_param->perm_[1] = 3; transpose_param->perm_[2] = 1; diff --git a/mindspore/lite/src/scheduler.cc b/mindspore/lite/src/scheduler.cc index 9d9bc2ad8da..70ec73a1147 100644 --- a/mindspore/lite/src/scheduler.cc +++ b/mindspore/lite/src/scheduler.cc @@ -88,13 +88,7 @@ int Scheduler::Schedule(std::vector *dst_kernels) { if (context_->enable_parallel_) { auto search_sub_graph = SearchSubGraph(context_, src_model_, src_tensors_, &op_parameters_, &graph_output_node_indexes_); - - bool offline_parallel_enable = src_model_->all_nodes_.front()->device_type_ != kDefaultDeviceType; - if (offline_parallel_enable) { - search_sub_graph.SubGraphSplitByOffLineParallel(); - } else { - search_sub_graph.SubGraphSplitByOutput(); - } + search_sub_graph.SubGraphSplit(); } ret = ScheduleSubGraphToKernels(kMainSubGraphIndex, dst_kernels, nullptr, nullptr); @@ -111,14 +105,7 @@ int Scheduler::Schedule(std::vector *dst_kernels) { } } FindAllInoutKernels(*dst_kernels); - // origin kernel init - for (size_t i = 0; i < dst_kernels->size(); i++) { - ret = (*dst_kernels)[i]->Init(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Kernel " << (*dst_kernels)[i]->name() << " Init failed."; - return ret; - } - } + ret = RunPass(dst_kernels); if (ret != RET_OK) { MS_LOG(ERROR) << "Schedule run pass failed."; @@ -679,13 +666,6 @@ kernel::LiteKernel *Scheduler::SchedulePartialToKernel(const lite::Model::Node * MS_LOG(ERROR) << "Schedule partial failed, name: " << src_node->name_; return nullptr; } - for (auto kernel : sub_kernels) { - ret = kernel->Init(); - if (ret != RET_OK) { - MS_LOG(ERROR) << "Schedule partial kernel init failed, name: " << kernel->name(); - return nullptr; - } - } FindAllInoutKernels(sub_kernels); ret = RunPass(&sub_kernels); @@ -725,6 +705,7 @@ int Scheduler::ScheduleSubGraphToKernels(size_t subgraph_index, std::vectorempty()); auto subgraph = src_model_->sub_graphs_.at(subgraph_index); for (auto node_index : subgraph->node_indices_) { + auto ret = RET_OK; auto node = src_model_->all_nodes_[node_index]; MS_ASSERT(node != nullptr); auto *primitive = node->primitive_; @@ -735,8 +716,11 @@ int Scheduler::ScheduleSubGraphToKernels(size_t subgraph_index, std::vectorInit(); + } } - if (kernel == nullptr) { + if (kernel == nullptr || ret != RET_OK) { MS_LOG(ERROR) << "FindBackendKernel return nullptr, name: " << node->name_ << ", type: " << PrimitiveTypeName(prim_type); return RET_ERROR; diff --git a/mindspore/lite/src/sub_graph_split.cc b/mindspore/lite/src/sub_graph_split.cc index 850ffbec3cd..f83e786b20d 100644 --- a/mindspore/lite/src/sub_graph_split.cc +++ b/mindspore/lite/src/sub_graph_split.cc @@ -559,4 +559,13 @@ void SearchSubGraph::SubGraphSplitByOffLineParallel() { MS_LOG(DEBUG) << "end to split offline parallel subgraph"; } +void SearchSubGraph::SubGraphSplit() { + bool offline_parallel_enable = model_->all_nodes_.front()->device_type_ != kDefaultDeviceType; + if (offline_parallel_enable) { + SubGraphSplitByOffLineParallel(); + } else { + SubGraphSplitByOutput(); + } + return; +} } // namespace mindspore::lite diff --git a/mindspore/lite/src/sub_graph_split.h b/mindspore/lite/src/sub_graph_split.h index 3fd45a8374e..3bda86ba4b3 100644 --- a/mindspore/lite/src/sub_graph_split.h +++ b/mindspore/lite/src/sub_graph_split.h @@ -73,14 +73,19 @@ class SearchSubGraph { ~SearchSubGraph() = default; public: - void SubGraphSplitByOutput(); - void SubGraphSplitByMiddle(); - void SubGraphSplitByOffLineParallel(); + void SubGraphSplit(); private: + void SubGraphSplitByOutput(); void InitSearchSubGraphByOutput(); + + private: + void SubGraphSplitByMiddle(); void InitSearchSubGraphByMiddle(); + private: + void SubGraphSplitByOffLineParallel(); + private: void InitSearchTensor(); void InitSearchParallelSubGraph(); diff --git a/mindspore/lite/test/config/models_mindrt_parallel.cfg b/mindspore/lite/test/config/models_mindrt_parallel.cfg index 86ec61c6c6c..7f2c9e7b909 100644 --- a/mindspore/lite/test/config/models_mindrt_parallel.cfg +++ b/mindspore/lite/test/config/models_mindrt_parallel.cfg @@ -1,4 +1,5 @@ # mindrt enable parallel model # model run both CPU-CPU & GPU-CPU # model_file ### accuracy_limit ### enable_fp16(true or false) -mtk_model_normalize_object_scene_ps_20200519_f32.tflite 0.5 false \ No newline at end of file +mtk_model_normalize_object_scene_ps_20200519_f32.tflite;0.5;false +# end \ No newline at end of file diff --git a/mindspore/lite/test/st/scripts/run_benchmark_gpu.sh b/mindspore/lite/test/st/scripts/run_benchmark_gpu.sh index 5011498e2f7..7c57f941e09 100644 --- a/mindspore/lite/test/st/scripts/run_benchmark_gpu.sh +++ b/mindspore/lite/test/st/scripts/run_benchmark_gpu.sh @@ -277,17 +277,17 @@ function Run_mindrt_parallel() { data_path="/data/local/tmp/input_output/" output=${data_path}'output/'${model_name}'.ms.out' - input=${model_name}.ms.bin + input=${data_path}'input/'${model_name}'.ms.bin' model=${model_name}'.ms' echo ${model_name} >> "${run_parallel_log_file}" echo "run mindrt parallel test : ${model_name}" - ########## RUN CPU-GPU parallel + ########## RUN CPU-CPU parallel echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt - echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test' > adb_run_cmd.txt + echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test' >> adb_run_cmd.txt - echo './benchmark --enableParallel=true --device=GPU --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> adb_run_cmd.txt - echo './benchmark --enableParallel=true --device=GPU --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> "${run_parallel_log_file}" + echo './benchmark --enableParallel=true --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> adb_run_cmd.txt + echo './benchmark --enableParallel=true --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> "${run_parallel_log_file}" adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_parallel_log_file}" if [ $? = 0 ]; then @@ -296,12 +296,12 @@ function Run_mindrt_parallel() { run_result='mindrt_parallel_CPU_GPU: '${model_name}' failed'; echo ${run_result} >> ${run_parallel_result_file}; return 1 fi - ########## RUN CPU-CPU parallel + ########## RUN CPU-GPU parallel echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt - echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test' > adb_run_cmd.txt + echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test' >> adb_run_cmd.txt - echo './benchmark --enableParallel=true --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> adb_run_cmd.txt - echo './benchmark --enableParallel=true --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> "${run_parallel_log_file}" + echo './benchmark --enableParallel=true --device=GPU --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> adb_run_cmd.txt + echo './benchmark --enableParallel=true --device=GPU --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> "${run_parallel_log_file}" adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_parallel_log_file}" if [ $? = 0 ]; then @@ -486,7 +486,7 @@ if [[ $backend == "all" || $backend == "gpu" ]]; then if [[ ${Run_mindrt_parallel_status} != 0 ]];then echo "Run_mindrt_parallel failed" - cat ${run_gpu_log_file} + cat ${run_parallel_log_file} fi echo "Run_parallel is ended"