forked from mindspore-Ecosystem/mindspore
commit d27205ad6c
@@ -40,13 +40,6 @@ class InnerKernel : public Kernel {
               const lite::Context *ctx)
       : op_parameter_(parameter), in_tensors_(std::move(in_tensors)), out_tensors_(std::move(out_tensors)) {
     context_ = ctx;
-    if (parameter != nullptr && parameter->thread_num_ == 0) {
-      if (ctx != nullptr) {
-        op_parameter_->thread_num_ = ctx->thread_num_;
-      } else {
-        op_parameter_->thread_num_ = 1;
-      }
-    }
   }
 
   virtual ~InnerKernel() {
@@ -28,24 +28,12 @@ namespace mindspore::lite {
 void LiteOpActor::RunOpData(OpData<lite::Tensor> *inputs, OpContext<lite::Tensor> *context) {
   auto op_uuid = context->sequential_num_;
   input_op_datas_[op_uuid].push_back(inputs);
-
   inputs_data_[inputs->index_] = inputs->data_;
-  /* in-case infershape done in runtime */
-  kernel_->in_tensors()[inputs->index_]->set_shape(inputs->data_->shape());
-  kernel_->in_tensors()[inputs->index_]->set_format(inputs->data_->format());
-
   if (input_op_datas_[op_uuid].size() < kernel_->in_tensors().size()) {
     return;
   }
 
-  auto ret = CheckInputData();
-  if (ret != RET_OK) {
-    input_op_datas_.erase(op_uuid);
-    context->SetFailed(ret);
-    return;
-  }
-
-  ret = SetInputData();
+  auto ret = SetInputData();
   if (ret != RET_OK) {
     input_op_datas_.erase(op_uuid);
     context->SetFailed(ret);

@@ -87,7 +75,7 @@ void LiteOpActor::IsolateInputData(std::vector<std::shared_ptr<LiteOpActor>> *ac
     for (QuantArg quant : old_tensor->quant_params()) {
       new_tensor->AddQuantParam(quant);
     }
-    isolate_input_map_.insert(std::make_pair(old_tensor, new_tensor));
+    isolate_input_map_.insert(std::make_pair(new_tensor, old_tensor));
 
     int ref_count = 0;
     /* set op input for calculate */

@@ -126,8 +114,8 @@ int LiteOpActor::LiteActorInit(std::vector<std::shared_ptr<LiteOpActor>> *actors
 int LiteOpActor::ResizeGraphInput(const std::vector<mindspore::tensor::MSTensor *> &inputs,
                                   const std::vector<std::vector<int>> &dims) {
   for (auto map : isolate_input_map_) {
-    auto src_tensor = map.first;
-    auto isolate_tensor = map.second;
+    auto isolate_tensor = map.first;
+    auto src_tensor = map.second;
     for (size_t i = 0; i < inputs.size(); i++) {
       if (src_tensor == inputs[i]) {
         isolate_tensor->set_shape(dims[i]);

@@ -225,23 +213,6 @@ int LiteOpActor::CompileArrow() {
   return ret;
 }
 
-int LiteOpActor::CheckInputData() {
-  if (kernel_->in_tensors().size() != inputs_data_.size()) {
-    MS_LOG(ERROR) << "kernel:" << kernel_->name() << " inputs_data_.size(): " << inputs_data_.size()
-                  << " vs kernel_->in_tensors().size(): " << kernel_->in_tensors().size() << " are not equal.";
-    return RET_PARAM_INVALID;
-  }
-
-  for (size_t i = 0; i < inputs_data_.size(); ++i) {
-    if (kernel_->in_tensors()[i]->shape() != inputs_data_[i]->shape()) {
-      MS_LOG(ERROR) << "inputs_data_[" << i << "].shape: " << inputs_data_[i]->shape() << " vs kernel_->in_tensors()["
-                    << i << "].shape: " << kernel_->in_tensors()[i]->shape() << " are not equal.";
-      return RET_PARAM_INVALID;
-    }
-  }
-  return RET_OK;
-}
-
 void LiteOpActor::MoveInputData(Tensor *dst_tensor, Tensor *src_tensor) {
   MS_ASSERT(src_tensor != dst_tensor);
 

@@ -302,6 +273,11 @@ int LiteOpActor::SetInputData() {
   for (size_t i = 0; i < inputs_data_.size(); ++i) {
     auto dst_tensor = kernel_->in_tensors()[i];
     auto src_tensor = inputs_data_[i];
+
+    /* infershape done in runtime */
+    dst_tensor->set_shape(src_tensor->shape());
+    dst_tensor->set_format(src_tensor->format());
+
     if (src_tensor->data_type() != dst_tensor->data_type()) {
       CopyInputData(dst_tensor, src_tensor);
     } else {

@@ -567,10 +543,7 @@ void LiteSwitchOpActor::AsyncFalseBranchOutput(OpContext<Tensor> *context) {
   }
 }
 
-int MindrtInit(bool enable_parallel) {
-  int thread_count = enable_parallel ? 2 : 1;
-  return mindspore::Initialize("tcp://127.0.0.1:8080", "", "", "", thread_count);
-}
+int MindrtInit() { return mindspore::Initialize("tcp://127.0.0.1:8080", "", "", ""); }
 
 void MindrtTerminate(const std::vector<std::shared_ptr<LiteOpActor>> &actor_list) {
   for (const auto &actor : actor_list) {
@@ -42,7 +42,7 @@ class LiteOpActor : public OpActor<lite::Tensor> {
   }
   ~LiteOpActor() override {
     for (auto map : isolate_input_map_) {
-      auto isolate_input_tensor = map.second;
+      auto isolate_input_tensor = map.first;
       isolate_input_tensor->set_data(nullptr);
       delete isolate_input_tensor;
     }

@@ -67,7 +67,6 @@ class LiteOpActor : public OpActor<lite::Tensor> {
   void SetPartialMap(const std::unordered_map<size_t, AID> &partial_map) { subgraph_index_to_actor = partial_map; }
 
  protected:
-  int CheckInputData();
   int SetInputData();
   void SetOutputData(OpContext<Tensor> *context);
   void AsyncOutput(OpContext<Tensor> *context);

@@ -89,7 +88,7 @@ class LiteOpActor : public OpActor<lite::Tensor> {
  private:
   kernel::LiteKernel *partial_node_ = nullptr;
   kernel::LiteKernel *call_node_ = nullptr;
-  std::unordered_map<Tensor *, Tensor *> isolate_input_map_;
+  std::unordered_map<Tensor *, Tensor *> isolate_input_map_; /* <calculate-tensor, src-input-tensor> */
 };
 
 class LiteSwitchOpActor : public LiteOpActor {

@@ -104,14 +103,7 @@ class LiteSwitchOpActor : public LiteOpActor {
       return;
     }
 
-    int ret = CheckInputData();
-    if (ret != RET_OK) {
-      input_op_datas_.erase(op_uuid);
-      context->SetFailed(ret);
-      return;
-    }
-
-    ret = SetInputData();
+    auto ret = SetInputData();
     if (ret != RET_OK) {
       input_op_datas_.erase(op_uuid);
       context->SetFailed(ret);

@@ -182,7 +174,7 @@ class LiteSwitchOpActor : public LiteOpActor {
   std::vector<OpDataPtr<Tensor>> false_branch_outputs_data_;
 };
 
-int MindrtInit(bool subgraph_split = false);
+int MindrtInit();
 void MindrtTerminate(const std::vector<std::shared_ptr<LiteOpActor>> &);
 
 std::vector<std::shared_ptr<LiteOpActor>> CreateOpActor(const std::vector<kernel::LiteKernel *> &kernels,
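The isolate_input_map_ hunks above (the insert in IsolateInputData, the swapped lookup in ResizeGraphInput, and the delete in the destructor) all depend on the orientation spelled out in the added comment: the key is the isolated "calculate" tensor the actor owns, the value is the original graph-input tensor. The sketch below is illustrative only, with a stand-in Tensor type instead of lite::Tensor, and shows the convention the new code assumes:

#include <unordered_map>
#include <utility>
#include <vector>

struct Tensor {                       // stand-in for lite::Tensor
  std::vector<int> shape;
  void *data = nullptr;
};

int main() {
  Tensor graph_input;                 // src-input-tensor, owned by the caller
  auto *isolated = new Tensor;        // calculate-tensor, owned by the actor

  // <calculate-tensor, src-input-tensor>, matching the comment added to the LiteOpActor header
  std::unordered_map<Tensor *, Tensor *> isolate_input_map;
  isolate_input_map.insert(std::make_pair(isolated, &graph_input));

  // Resize path: match the value (original input) against a model input, resize the key.
  std::vector<int> new_dims = {1, 224, 224, 3};
  for (auto &kv : isolate_input_map) {
    if (kv.second == &graph_input) {
      kv.first->shape = new_dims;
    }
  }

  // Teardown: the key is what the actor allocated, so it is what gets deleted.
  for (auto &kv : isolate_input_map) {
    kv.first->data = nullptr;
    delete kv.first;
  }
  return 0;
}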
@@ -77,9 +77,7 @@ int MindrtExecutor::Resize(const std::vector<mindspore::tensor::MSTensor *> &inp
 
 int MindrtExecutor::Prepare(const std::vector<kernel::LiteKernel *> &kernels, const std::vector<Tensor *> &inputs,
                             const std::vector<Tensor *> &outputs, const lite::InnerContext *ctx) {
   MS_ASSERT(kernels.size() != 0);
-
-  auto ret = MindrtInit(ctx->enable_parallel_);
+  auto ret = MindrtInit();
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "MindrtInit failed";
     return ret;
@@ -40,6 +40,7 @@ kernel::LiteKernel *NPUPassUtils::CreateNchw2NhwcKernel(const std::vector<Tensor
   }
   memset(transpose_param, 0, sizeof(TransposeParameter));
   transpose_param->op_parameter_.type_ = schema::PrimitiveType_Transpose;
+  transpose_param->op_parameter_.thread_num_ = ctx->thread_num_;
   transpose_param->perm_[0] = 0;
   transpose_param->perm_[1] = 2;
   transpose_param->perm_[2] = 3;

@@ -76,6 +77,7 @@ kernel::LiteKernel *NPUPassUtils::CreateNhwc2NchwKernel(const std::vector<Tensor
   }
   memset(transpose_param, 0, sizeof(TransposeParameter));
   transpose_param->op_parameter_.type_ = schema::PrimitiveType_Transpose;
+  transpose_param->op_parameter_.thread_num_ = ctx->thread_num_;
   transpose_param->perm_[0] = 0;
   transpose_param->perm_[1] = 3;
   transpose_param->perm_[2] = 1;
@@ -88,13 +88,7 @@ int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
   if (context_->enable_parallel_) {
     auto search_sub_graph =
       SearchSubGraph(context_, src_model_, src_tensors_, &op_parameters_, &graph_output_node_indexes_);
-
-    bool offline_parallel_enable = src_model_->all_nodes_.front()->device_type_ != kDefaultDeviceType;
-    if (offline_parallel_enable) {
-      search_sub_graph.SubGraphSplitByOffLineParallel();
-    } else {
-      search_sub_graph.SubGraphSplitByOutput();
-    }
+    search_sub_graph.SubGraphSplit();
   }
 
   ret = ScheduleSubGraphToKernels(kMainSubGraphIndex, dst_kernels, nullptr, nullptr);

@@ -111,14 +105,7 @@ int Scheduler::Schedule(std::vector<kernel::LiteKernel *> *dst_kernels) {
     }
   }
   FindAllInoutKernels(*dst_kernels);
-  // origin kernel init
-  for (size_t i = 0; i < dst_kernels->size(); i++) {
-    ret = (*dst_kernels)[i]->Init();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Kernel " << (*dst_kernels)[i]->name() << " Init failed.";
-      return ret;
-    }
-  }
+
   ret = RunPass(dst_kernels);
   if (ret != RET_OK) {
     MS_LOG(ERROR) << "Schedule run pass failed.";

@@ -679,13 +666,6 @@ kernel::LiteKernel *Scheduler::SchedulePartialToKernel(const lite::Model::Node *
     MS_LOG(ERROR) << "Schedule partial failed, name: " << src_node->name_;
     return nullptr;
   }
-  for (auto kernel : sub_kernels) {
-    ret = kernel->Init();
-    if (ret != RET_OK) {
-      MS_LOG(ERROR) << "Schedule partial kernel init failed, name: " << kernel->name();
-      return nullptr;
-    }
-  }
 
   FindAllInoutKernels(sub_kernels);
   ret = RunPass(&sub_kernels);

@@ -725,6 +705,7 @@ int Scheduler::ScheduleSubGraphToKernels(size_t subgraph_index, std::vector<kern
   MS_ASSERT(dst_kernels->empty());
   auto subgraph = src_model_->sub_graphs_.at(subgraph_index);
   for (auto node_index : subgraph->node_indices_) {
+    auto ret = RET_OK;
     auto node = src_model_->all_nodes_[node_index];
     MS_ASSERT(node != nullptr);
     auto *primitive = node->primitive_;

@@ -735,8 +716,11 @@ int Scheduler::ScheduleSubGraphToKernels(size_t subgraph_index, std::vector<kern
       kernel = SchedulePartialToKernel(node);
     } else {  // kernel
       kernel = ScheduleNodeToKernel(node, prefer_data_type);
+      if (kernel != nullptr) {
+        ret = kernel->Init();
+      }
     }
-    if (kernel == nullptr) {
+    if (kernel == nullptr || ret != RET_OK) {
      MS_LOG(ERROR) << "FindBackendKernel return nullptr, name: " << node->name_
                    << ", type: " << PrimitiveTypeName(prim_type);
      return RET_ERROR;
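Read together, the scheduler hunks move kernel Init() out of the two follow-up loops (in Schedule() and SchedulePartialToKernel()) and into ScheduleSubGraphToKernels(), so a node whose kernel fails to initialize is rejected as soon as it is scheduled. A condensed sketch of that control flow, with stand-in types rather than the real scheduler API:

#include <iostream>
#include <utility>
#include <vector>

struct Kernel {
  bool init_ok;
  int Init() { return init_ok ? 0 : 1; }   // 0 plays the role of RET_OK
};

// Stand-in for ScheduleNodeToKernel(): may fail and return nullptr.
Kernel *ScheduleNode(bool create_ok, bool init_ok) {
  return create_ok ? new Kernel{init_ok} : nullptr;
}

int ScheduleSubGraph(const std::vector<std::pair<bool, bool>> &nodes, std::vector<Kernel *> *out) {
  for (const auto &node : nodes) {
    int ret = 0;                            // auto ret = RET_OK;
    Kernel *kernel = ScheduleNode(node.first, node.second);
    if (kernel != nullptr) {
      ret = kernel->Init();                 // init right after creation, inside the scheduling loop
    }
    if (kernel == nullptr || ret != 0) {    // creation and init failures handled in one place
      delete kernel;
      return 1;                             // plays the role of RET_ERROR
    }
    out->push_back(kernel);
  }
  return 0;
}

int main() {
  std::vector<Kernel *> kernels;
  int ret = ScheduleSubGraph({{true, true}, {true, false}}, &kernels);
  std::cout << "schedule returned " << ret << " with " << kernels.size() << " kernels" << std::endl;
  for (auto *k : kernels) delete k;
  return 0;
}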
@@ -559,4 +559,13 @@ void SearchSubGraph::SubGraphSplitByOffLineParallel() {
   MS_LOG(DEBUG) << "end to split offline parallel subgraph";
 }
 
+void SearchSubGraph::SubGraphSplit() {
+  bool offline_parallel_enable = model_->all_nodes_.front()->device_type_ != kDefaultDeviceType;
+  if (offline_parallel_enable) {
+    SubGraphSplitByOffLineParallel();
+  } else {
+    SubGraphSplitByOutput();
+  }
+  return;
+}
 }  // namespace mindspore::lite
@@ -73,14 +73,19 @@ class SearchSubGraph {
   ~SearchSubGraph() = default;
 
  public:
-  void SubGraphSplitByOutput();
-  void SubGraphSplitByMiddle();
-  void SubGraphSplitByOffLineParallel();
+  void SubGraphSplit();
 
  private:
+  void SubGraphSplitByOutput();
   void InitSearchSubGraphByOutput();
+
+ private:
+  void SubGraphSplitByMiddle();
   void InitSearchSubGraphByMiddle();
 
  private:
+  void SubGraphSplitByOffLineParallel();
+
+ private:
   void InitSearchTensor();
   void InitSearchParallelSubGraph();
@@ -1,4 +1,5 @@
 # mindrt enable parallel model
+# model run both CPU-CPU & GPU-CPU
 # model_file ### accuracy_limit ### enable_fp16(true or false)
-mtk_model_normalize_object_scene_ps_20200519_f32.tflite 0.5 false
+mtk_model_normalize_object_scene_ps_20200519_f32.tflite;0.5;false
 # end
@@ -277,17 +277,17 @@ function Run_mindrt_parallel() {
 
         data_path="/data/local/tmp/input_output/"
         output=${data_path}'output/'${model_name}'.ms.out'
-        input=${model_name}.ms.bin
+        input=${data_path}'input/'${model_name}'.ms.bin'
         model=${model_name}'.ms'
         echo ${model_name} >> "${run_parallel_log_file}"
         echo "run mindrt parallel test : ${model_name}"
 
-        ########## RUN CPU-GPU parallel
+        ########## RUN CPU-CPU parallel
        echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test' > adb_run_cmd.txt
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test' >> adb_run_cmd.txt
 
-        echo './benchmark --enableParallel=true --device=GPU --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> adb_run_cmd.txt
-        echo './benchmark --enableParallel=true --device=GPU --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> "${run_parallel_log_file}"
+        echo './benchmark --enableParallel=true --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> adb_run_cmd.txt
+        echo './benchmark --enableParallel=true --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> "${run_parallel_log_file}"
 
        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_parallel_log_file}"
        if [ $? = 0 ]; then

@@ -296,12 +296,12 @@ function Run_mindrt_parallel() {
            run_result='mindrt_parallel_CPU_GPU: '${model_name}' failed'; echo ${run_result} >> ${run_parallel_result_file}; return 1
        fi
 
-        ########## RUN CPU-CPU parallel
+        ########## RUN CPU-GPU parallel
        echo 'cd /data/local/tmp/benchmark_test' > adb_run_cmd.txt
-        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test' > adb_run_cmd.txt
+        echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/data/local/tmp/benchmark_test' >> adb_run_cmd.txt
 
-        echo './benchmark --enableParallel=true --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> adb_run_cmd.txt
-        echo './benchmark --enableParallel=true --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> "${run_parallel_log_file}"
+        echo './benchmark --enableParallel=true --device=GPU --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> adb_run_cmd.txt
+        echo './benchmark --enableParallel=true --device=GPU --enableFp16='${fp16}' --accuracyThreshold='${limit}' --modelFile='${model}' --inDataFile='${input}' --benchmarkDataFile='${output} >> "${run_parallel_log_file}"
 
        adb -s ${device_id} shell < adb_run_cmd.txt >> "${run_parallel_log_file}"
        if [ $? = 0 ]; then

@@ -486,7 +486,7 @@ if [[ $backend == "all" || $backend == "gpu" ]]; then
 
    if [[ ${Run_mindrt_parallel_status} != 0 ]];then
        echo "Run_mindrt_parallel failed"
-        cat ${run_gpu_log_file}
+        cat ${run_parallel_log_file}
    fi
 
    echo "Run_parallel is ended"