diff --git a/mindspore/lite/CMakeLists.txt b/mindspore/lite/CMakeLists.txt index a861eb8f86..5c89c4a99b 100644 --- a/mindspore/lite/CMakeLists.txt +++ b/mindspore/lite/CMakeLists.txt @@ -166,7 +166,6 @@ if (BUILD_DEVICE) add_compile_definitions(ENABLE_ARM32) endif () if (PLATFORM_ARM64) - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8.2-a+dotprod+fp16") add_compile_definitions(ENABLE_ARM64) if (ENABLE_FP16) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8.2-a+dotprod+fp16") diff --git a/mindspore/lite/include/context.h b/mindspore/lite/include/context.h index 64ffa42fd4..7206c31cfc 100644 --- a/mindspore/lite/include/context.h +++ b/mindspore/lite/include/context.h @@ -20,6 +20,7 @@ #include #include #include "include/ms_tensor.h" +#include "include/thread_pool_config.h" namespace mindspore::lite { /// \brief Allocator defined a memory pool for malloc memory and free memory dynamically. @@ -27,13 +28,6 @@ namespace mindspore::lite { /// \note List public class and interface for reference. class Allocator; -/// \brief CpuBindMode defined for holding bind cpu strategy argument. -enum CpuBindMode { - MID_CPU = -1, /**< bind middle cpu first */ - HIGHER_CPU = 1, /**< bind higher cpu first */ - NO_BIND = 0 /**< no bind */ -}; - /// \brief DeviceType defined for holding user's preferred backend. typedef enum { DT_CPU, /**< CPU device type */ diff --git a/mindspore/lite/include/thread_pool_config.h b/mindspore/lite/include/thread_pool_config.h new file mode 100644 index 0000000000..8a5dead47d --- /dev/null +++ b/mindspore/lite/include/thread_pool_config.h @@ -0,0 +1,35 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef MINDSPORE_LITE_INCLUDE_THREAD_POOL_CONFIG_H_ +#define MINDSPORE_LITE_INCLUDE_THREAD_POOL_CONFIG_H_ + +/// \brief CpuBindMode defined for holding bind cpu strategy argument. +typedef enum Mode { + MID_CPU = -1, /**< bind middle cpu first */ + HIGHER_CPU = 1, /**< bind higher cpu first */ + NO_BIND = 0 /**< no bind */ +} CpuBindMode; + +/// \brief ThreadPoolId defined for specifying which thread pool to use. +typedef enum Id { + THREAD_POOL_DEFAULT = 0, /**< default thread pool id */ + THREAD_POOL_SECOND = 1, /**< the second thread pool id */ + THREAD_POOL_THIRD = 2, /**< the third thread pool id */ + THREAD_POOL_FOURTH = 3 /**< the fourth thread pool id */ +} ThreadPoolId; + +#endif  // MINDSPORE_LITE_INCLUDE_THREAD_POOL_CONFIG_H_ diff --git a/mindspore/lite/src/CMakeLists.txt b/mindspore/lite/src/CMakeLists.txt index fbedb15bcf..1fef039dbf 100644 --- a/mindspore/lite/src/CMakeLists.txt +++ b/mindspore/lite/src/CMakeLists.txt @@ -3,7 +3,7 @@ set(LITE_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/ms_tensor_utils.cc ${CMAKE_CURRENT_SOURCE_DIR}/runtime/allocator.cc ${CMAKE_CURRENT_SOURCE_DIR}/runtime/runtime_api.cc - ${CMAKE_CURRENT_SOURCE_DIR}/runtime/thread_pool.cc + ${CMAKE_CURRENT_SOURCE_DIR}/runtime/thread_pool.c ${CMAKE_CURRENT_SOURCE_DIR}/runtime/workspace_pool.cc ${CMAKE_CURRENT_SOURCE_DIR}/ir/tensor.cc ${CMAKE_CURRENT_SOURCE_DIR}/context.cc diff --git a/mindspore/lite/src/lite_session.cc b/mindspore/lite/src/lite_session.cc index 7c253852ac..d08ecb7ae9 100644 --- a/mindspore/lite/src/lite_session.cc +++ 
b/mindspore/lite/src/lite_session.cc @@ -247,7 +247,6 @@ std::vector LiteSession::GetInputs() const { retu int LiteSession::RunGraph(const session::KernelCallBack &before, const session::KernelCallBack &after) { MS_EXCEPTION_IF_NULL(this->context_); - SetMaxWokerNum(context_->thread_num_); if (before == nullptr && after == nullptr) { return executor->Run(this->inputs_, this->outputs_, this->kernels_, this->context_->allocator.get()); } else { @@ -264,7 +263,7 @@ int LiteSession::Init(Context *context) { } this->context_->float16_priority = context->float16_priority; this->context_->cpu_bind_mode_ = context->cpu_bind_mode_; - ConfigThreadPool(context->cpu_bind_mode_, context->thread_num_); + ConfigThreadPool(THREAD_POOL_DEFAULT, context->thread_num_, context->cpu_bind_mode_); auto ret = KernelRegistry::GetInstance()->Init(); if (ret != RET_OK) { MS_LOG(ERROR) << "KernelRegistry Init Failed."; @@ -283,7 +282,7 @@ int LiteSession::Init(Context *context) { void LiteSession::BindThread(bool if_bind) { if (this->context_->cpu_bind_mode_ != NO_BIND) { - DoAllThreadBind(if_bind, static_cast(this->context_->cpu_bind_mode_)); + BindThreads(THREAD_POOL_DEFAULT, if_bind, this->context_->cpu_bind_mode_); } } diff --git a/mindspore/lite/src/runtime/kernel/arm/base/prior_box.cc b/mindspore/lite/src/runtime/kernel/arm/base/prior_box.cc index 518a7896b9..9cf795c862 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/prior_box.cc +++ b/mindspore/lite/src/runtime/kernel/arm/base/prior_box.cc @@ -153,7 +153,7 @@ int PriorBoxCPUKernel::PriorBoxImpl(int task_id) { return ret; } -int RunPriorBox(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int RunPriorBox(void *cdata, int task_id) { auto prior_box = reinterpret_cast(cdata); auto error_code = prior_box->PriorBoxImpl(task_id); @@ -170,7 +170,7 @@ int PriorBoxCPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail! 
Ret error code[" << prepare_ret << "]"; return prepare_ret; } - int error_code = LiteBackendParallelLaunch(RunPriorBox, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, RunPriorBox, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "PriorBox run error, error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc b/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc index 60287713e2..a1ba123cf5 100644 --- a/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc +++ b/mindspore/lite/src/runtime/kernel/arm/base/quant_dtype_cast.cc @@ -95,7 +95,7 @@ int QuantDTypeCastCPUKernel::QuantDTypeCast(int task_id) { return RET_OK; } -int QuantDTypeCastRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int QuantDTypeCastRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->QuantDTypeCast(task_id); if (ret != RET_OK) { @@ -119,7 +119,7 @@ int QuantDTypeCastCPUKernel::Run() { int8_ptr_ = reinterpret_cast(out_tensors_[0]->Data()); } - auto ret = LiteBackendParallelLaunch(QuantDTypeCastRun, this, thread_n_num_); + auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, QuantDTypeCastRun, this, thread_n_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "Scale error error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.cc index b2fbf6e81d..46dc6dc63e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/activation_fp16.cc @@ -92,7 +92,7 @@ int ActivationFp16CPUKernel::DoActivation(int task_id) { return error_code; } -int ActivationRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ActivationRun(void *cdata, int task_id) { auto activation_kernel = reinterpret_cast(cdata); auto error_code = 
activation_kernel->DoActivation(task_id); if (error_code != RET_OK) { @@ -115,7 +115,7 @@ int ActivationFp16CPUKernel::Run() { return ret; } - int error_code = LiteBackendParallelLaunch(ActivationRun, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ActivationRun, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Activation function error error_code[" << error_code << "]"; FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc index 7cb45b00f0..69521196e4 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/arithmetic_fp16.cc @@ -362,7 +362,7 @@ int ArithmeticFP16CPUKernel::DoArithmetic(int task_id) { return RET_OK; } -static int ArithmeticsRun_Fp16(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int ArithmeticsRun_Fp16(void *cdata, int task_id) { auto arithmetic_kernel = reinterpret_cast(cdata); auto error_code = arithmetic_kernel->DoArithmetic(task_id); if (error_code != RET_OK) { @@ -413,7 +413,7 @@ int ArithmeticFP16CPUKernel::Run() { Float32ToFloat16(reinterpret_cast(in_tensors_[1]->Data()), input1_fp16_, arithmeticParameter_->in_elements_num1_); } - ret = LiteBackendParallelLaunch(ArithmeticsRun_Fp16, this, context_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ArithmeticsRun_Fp16, this, context_->thread_num_); return ret; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.cc index 8faee1f705..8805e384a8 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/batchnorm_fp16.cc @@ -65,7 +65,7 @@ int BatchnormFp16CPUKernel::Run() { input_ = in_tensors_.at(0)->Data(); output_ = out_tensors_.at(0)->Data(); } - ret = LiteBackendParallelLaunch(BatchNormRun, this, 
op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, BatchNormRun, this, op_parameter_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]"; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/cast_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/cast_fp16.cc index d67ab064d8..da776d2cc9 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/cast_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/cast_fp16.cc @@ -30,13 +30,13 @@ using mindspore::schema::PrimitiveType_Cast; namespace mindspore::kernel { namespace { -int CastRun(int thread_id, LiteParallelGroupEnv *penv, void *cdata) { +int CastRun(void *cdata, int task_id) { if (cdata == nullptr) { MS_LOG(ERROR) << "input cdata is nullptr!"; return RET_ERROR; } - return reinterpret_cast(cdata)->DoCast(thread_id); + return reinterpret_cast(cdata)->DoCast(task_id); } } // namespace @@ -91,7 +91,7 @@ int CastFp16CPUKernel::Run() { if (data_num_ == 0) { return RET_OK; } - return LiteBackendParallelLaunch(CastRun, this, op_parameter_->thread_num_); + return ParallelLaunch(THREAD_POOL_DEFAULT, CastRun, this, op_parameter_->thread_num_); } kernel::LiteKernel *CpuCastFp16KernelCreator(const std::vector &inputs, diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc index 2ccc2957fe..1708b11105 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_1x1_fp16.cc @@ -194,7 +194,7 @@ int Convolution1x1FP16CPUKernel::RunImpl(int task_id) { return RET_OK; } -static int Convolution1x1Fp16Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int Convolution1x1Fp16Impl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -222,7 +222,7 @@ int Convolution1x1FP16CPUKernel::Run() 
{ execute_input_ + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_, execute_output_ + batch_index * matmul_param_->row_ * matmul_param_->col_); - int error_code = LiteBackendParallelLaunch(Convolution1x1Fp16Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, Convolution1x1Fp16Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv1x1 fp16 error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc index 8c9343ba3f..5dff52083a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_3x3_fp16.cc @@ -197,7 +197,7 @@ int Convolution3x3FP16CPUKernel::RunImpl(int task_id) { return RET_OK; } -static int Convolution3x3Fp16Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int Convolution3x3Fp16Impl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -251,7 +251,7 @@ int Convolution3x3FP16CPUKernel::Run() { int in_channel = conv_param_->input_channel_; PackNHWCToNHWC8Fp16(reinterpret_cast(execute_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel); - int error_code = LiteBackendParallelLaunch(Convolution3x3Fp16Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, Convolution3x3Fp16Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv3x3 fp16 error error_code[" << error_code << "]"; FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc index 96f2b3d9b5..fadaa906a5 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc +++ 
b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_fp16.cc @@ -98,7 +98,7 @@ int ConvolutionDepthwiseFp16CPUKernel::Execute(int task_id) { return RET_OK; } -static int ConvDwFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int ConvDwFp16Run(void *cdata, int task_id) { auto conv_dw_fp16 = reinterpret_cast(cdata); auto ret = conv_dw_fp16->Execute(task_id); if (ret != RET_OK) { @@ -125,7 +125,7 @@ int ConvolutionDepthwiseFp16CPUKernel::Run() { return ret; } - ret = LiteBackendParallelLaunch(ConvDwFp16Run, this, conv_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwFp16Run, this, conv_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvDwFp16Run error: error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc index a7ebff8c6e..4e8aa956f8 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_depthwise_slidewindow_fp16.cc @@ -129,7 +129,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Execute(int task_id) { return RET_OK; } -static int ConvDwSWFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int ConvDwSWFp16Run(void *cdata, int task_id) { auto conv_dw_fp16 = reinterpret_cast(cdata); auto ret = conv_dw_fp16->Execute(task_id); if (ret != RET_OK) { @@ -171,7 +171,7 @@ int ConvolutionDepthwiseSWFp16CPUKernel::Run() { packed_output_ = execute_output_; } - ret = LiteBackendParallelLaunch(ConvDwSWFp16Run, this, conv_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwSWFp16Run, this, conv_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvDwSWFp16Run error: error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc 
b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc index ee040f8443..7f7ee28625 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_fp16.cc @@ -177,7 +177,7 @@ int ConvolutionFP16CPUKernel::RunImpl(int task_id) { return RET_OK; } -static int ConvolutionFp16Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int ConvolutionFp16Impl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -206,7 +206,7 @@ int ConvolutionFP16CPUKernel::Run() { int in_channel = conv_param_->input_channel_; convert_func_(reinterpret_cast(execute_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel); - int error_code = LiteBackendParallelLaunch(ConvolutionFp16Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ConvolutionFp16Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv fp16 error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc index 7b7d1e17b3..27def2e8fb 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_sw_fp16.cc @@ -186,7 +186,7 @@ int ConvolutionSWFP16CPUKernel::RunImpl(int task_id) { return RET_OK; } -static int ConvolutionSWFp16Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int ConvolutionSWFp16Impl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -219,7 +219,7 @@ int ConvolutionSWFP16CPUKernel::Run() { int in_channel = conv_param_->input_channel_; convert_func_(reinterpret_cast(execute_input_), nhwc4_input_, in_batch, in_h * in_w, in_channel); - int error_code = 
LiteBackendParallelLaunch(ConvolutionSWFp16Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ConvolutionSWFp16Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv fp16 error error_code[" << error_code << "]"; FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc index cb7526f827..4cb862a43f 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/convolution_winograd_fp16.cc @@ -347,7 +347,7 @@ int ConvolutionWinogradFP16CPUKernel::RunImpl(int task_id) { return RET_OK; } -static int ConvolutionWinogradFp16Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int ConvolutionWinogradFp16Impl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -404,7 +404,7 @@ int ConvolutionWinogradFP16CPUKernel::Run() { int in_channel = conv_param_->input_channel_; PackNHWCToNHWC8Fp16(execute_input_, nhwc4_input_, in_batch, in_h * in_w, in_channel); - int error_code = LiteBackendParallelLaunch(ConvolutionWinogradFp16Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ConvolutionWinogradFp16Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv winograd error error_code[" << error_code << "]"; FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc index 25656b1b4f..8018f43f63 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_depthwise_fp16.cc @@ -137,7 +137,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::Execute(int task_id) { return RET_OK; } -static int 
DeconvDwFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int DeconvDwFp16Run(void *cdata, int task_id) { auto deconv_dw_fp16 = reinterpret_cast(cdata); auto ret = deconv_dw_fp16->Execute(task_id); if (ret != RET_OK) { @@ -178,7 +178,7 @@ int DeconvolutionDepthwiseFp16CPUKernel::Run() { if (!need_align_) { packed_output_ = execute_output_; } - ret = LiteBackendParallelLaunch(DeconvDwFp16Run, this, conv_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, DeconvDwFp16Run, this, conv_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "DeconvDwFp16Run error: error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc index 0deb852cf9..817bb91497 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/deconvolution_fp16.cc @@ -137,7 +137,7 @@ void DeConvolutionFp16CPUKernel::FreeRunBuf() { return; } -static int DeConvFp16Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int DeConvFp16Run(void *cdata, int task_id) { auto deconv = reinterpret_cast(cdata); auto error_code = deconv->DoDeconv(task_id); if (error_code != RET_OK) { @@ -188,7 +188,7 @@ int DeConvolutionFp16CPUKernel::Run() { for (int batch_index = 0; batch_index < conv_param_->input_batch_; batch_index++) { RowMajor2Col16MajorFp16(execute_input_, pack_input_, input_plane_, conv_param_->input_channel_); - error_code = LiteBackendParallelLaunch(DeConvFp16Run, this, thread_count_); + error_code = ParallelLaunch(THREAD_POOL_DEFAULT, DeConvFp16Run, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "deconv fp32 run error! 
error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc index dddad9ae9b..7e0ace1d5e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/pooling_fp16.cc @@ -89,7 +89,7 @@ int PoolingFp16CPUKernel::RunImpl(int task_id) { return RET_OK; } -static int PoolingFp16Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int PoolingFp16Impl(void *cdata, int task_id) { auto pooling = reinterpret_cast(cdata); auto error_code = pooling->RunImpl(task_id); if (error_code != RET_OK) { @@ -109,7 +109,7 @@ int PoolingFp16CPUKernel::Run() { auto input_ptr = reinterpret_cast(in_tensors_.at(kInputIndex)->Data()); Float32ToFloat16(input_ptr, fp16_input_, ele_num); - int error_code = LiteBackendParallelLaunch(PoolingFp16Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, PoolingFp16Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "pooling error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc index 3bc9d21ada..5b689b0595 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/reduce_fp16.cc @@ -67,7 +67,7 @@ int ReduceFp16CPUKernel::CallReduceUnit(int task_id) { return ret; } -static int ReduceImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int ReduceImpl(void *cdata, int task_id) { auto reduce = reinterpret_cast(cdata); auto error_code = reduce->CallReduceUnit(task_id); if (error_code != RET_OK) { @@ -112,7 +112,7 @@ int ReduceFp16CPUKernel::Run() { inner_size_ *= tmp_shape_[k]; } axis_size_ = tmp_shape_[axis]; - auto error_code = LiteBackendParallelLaunch(ReduceImpl, this, context_->thread_num_); + auto 
error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ReduceImpl, this, context_->thread_num_); if (error_code != RET_OK) { FreeTmpBuffer(); MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]"; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.cc index 8182d75f28..3a4e9f41cc 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/split_fp16.cc @@ -63,7 +63,7 @@ int SplitFp16CPUKernel::Split(int task_id) { return RET_OK; } -static int SplitRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int SplitRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->Split(task_id); if (ret != RET_OK) { @@ -97,7 +97,7 @@ int SplitFp16CPUKernel::Run() { output_ptr_[i] = reinterpret_cast(out_tensors_.at(i)->Data()); } } - ret = LiteBackendParallelLaunch(SplitRun, this, thread_n_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SplitRun, this, thread_n_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "split error error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc b/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc index 20e15f0fbf..eca0714e18 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp16/transpose_fp16.cc @@ -117,7 +117,7 @@ int TransposeFp16CPUKernel::TransposeParallel(int task_id) { return RET_OK; } -static int TransposeRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +static int TransposeRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->TransposeParallel(task_id); if (ret != RET_OK) { @@ -162,7 +162,7 @@ int TransposeFp16CPUKernel::Run() { in_shape_ = const_cast(in_tensor->shape().data()); out_shape_ = const_cast(out_tensor->shape().data()); - ret = 
LiteBackendParallelLaunch(TransposeRun, this, thread_h_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, TransposeRun, this, thread_h_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]"; FreeFp16Buffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/activation.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/activation.cc index 41b76206a9..3b61a0c7ca 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/activation.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/activation.cc @@ -67,7 +67,7 @@ int ActivationCPUKernel::DoActivation(int task_id) { return RET_OK; } -int ActivationRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ActivationRun(void *cdata, int task_id) { auto activation_kernel = reinterpret_cast(cdata); auto error_code = activation_kernel->DoActivation(task_id); if (error_code != RET_OK) { @@ -83,7 +83,7 @@ int ActivationCPUKernel::Run() { MS_LOG(ERROR) << "Prepare failed."; return ret; } - int error_code = LiteBackendParallelLaunch(ActivationRun, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ActivationRun, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Activation function error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/addn.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/addn.cc index 67e1b24697..c5cb1b6d07 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/addn.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/addn.cc @@ -28,13 +28,13 @@ using mindspore::schema::PrimitiveType_AddN; namespace mindspore::kernel { namespace { -int AddNLaunch(int thread_id, LiteParallelGroupEnv *penv, void *cdata) { +int AddNLaunch(void *cdata, int task_id) { if (cdata == nullptr) { MS_LOG(ERROR) << "Input cdata is nullptr!"; return RET_NULL_PTR; } auto kernel = reinterpret_cast(cdata); - return kernel->AddNParallelRun(thread_id); + return kernel->AddNParallelRun(task_id); } } // 
namespace @@ -74,7 +74,7 @@ int AddNCPUKernel::Run() { in1_addr_ = input0_data; in2_addr_ = input1_data; out_addr_ = output_data; - ret = LiteBackendParallelLaunch(AddNLaunch, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, AddNLaunch, this, op_parameter_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "addn launch fail!ret: " << ret; return RET_ERROR; @@ -82,7 +82,7 @@ int AddNCPUKernel::Run() { for (size_t i = 2; i < in_tensors_.size(); ++i) { in1_addr_ = reinterpret_cast(in_tensors_[i]->Data()); in2_addr_ = output_data; - ret = LiteBackendParallelLaunch(AddNLaunch, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, AddNLaunch, this, op_parameter_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "addn launch fail!ret: " << ret << ", input index: " << i; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic.cc index d3c322744a..6a72842ce5 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic.cc @@ -163,7 +163,7 @@ int ArithmeticCPUKernel::DoArithmetic(int task_id) { return RET_OK; } -int ArithmeticsRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ArithmeticsRun(void *cdata, int task_id) { auto arithmetic_kernel = reinterpret_cast(cdata); auto error_code = arithmetic_kernel->DoArithmetic(task_id); if (error_code != RET_OK) { @@ -193,7 +193,7 @@ int ArithmeticCPUKernel::Run() { ComputeStrides(arithmeticParameter_->out_shape_, arithmeticParameter_->out_strides_, arithmeticParameter_->ndim_); } - int error_code = LiteBackendParallelLaunch(ArithmeticsRun, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ArithmeticsRun, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Arithmetic function error error_code[" << error_code << "]"; diff --git 
a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self.cc index 57fd294072..75d568b609 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/arithmetic_self.cc @@ -41,7 +41,7 @@ int ArithmeticSelfCPUKernel::ReSize() { return RET_OK; } -int ArithmeticSelfRuns(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ArithmeticSelfRuns(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->DoArithmeticSelf(task_id); if (ret != RET_OK) { @@ -80,7 +80,7 @@ int ArithmeticSelfCPUKernel::Run() { auto out_tensor = out_tensors_.at(0); in_ptr_ = reinterpret_cast(input_tensor->Data()); out_ptr_ = reinterpret_cast(out_tensor->Data()); - ret = LiteBackendParallelLaunch(ArithmeticSelfRuns, this, thread_sz_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ArithmeticSelfRuns, this, thread_sz_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "ArithmeticSelfRun error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc index 6bfa90c763..050b868d63 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.cc @@ -75,7 +75,7 @@ int BatchnormCPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail! 
Ret error code: " << ret; return ret; } - ret = LiteBackendParallelLaunch(BatchNormRun, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, BatchNormRun, this, op_parameter_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]"; } @@ -88,7 +88,7 @@ int BatchnormCPUKernel::DoExecute(int task_id) { return mindspore::lite::RET_OK; } -int BatchNormRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int BatchNormRun(void *cdata, int task_id) { auto kernel = reinterpret_cast(cdata); auto ret = kernel->DoExecute(task_id); if (ret != RET_OK) { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h index 3261f4a06f..e759058618 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/batchnorm.h @@ -48,7 +48,7 @@ class BatchnormCPUKernel : public LiteKernel { void *variance_ = nullptr; }; -int BatchNormRun(int task_id, LiteParallelGroupEnv *penv, void *cdata); +int BatchNormRun(void *cdata, int task_id); } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_FP32_BATCHNORM_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/cast.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/cast.cc index 2e984644bf..4d10d0fb81 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/cast.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/cast.cc @@ -30,13 +30,13 @@ using mindspore::schema::PrimitiveType_Cast; namespace mindspore::kernel { namespace { -int CastRun(int thread_id, LiteParallelGroupEnv *penv, void *cdata) { +int CastRun(void *cdata, int task_id) { if (cdata == nullptr) { MS_LOG(ERROR) << "input cdata is nullptr!"; return RET_ERROR; } - return reinterpret_cast(cdata)->DoCast(thread_id); + return reinterpret_cast(cdata)->DoCast(task_id); } } // namespace @@ -111,7 +111,7 @@ int CastCPUKernel::Run() { if (data_num_ == 0) { return RET_OK; } 
- return LiteBackendParallelLaunch(CastRun, this, op_parameter_->thread_num_); + return ParallelLaunch(THREAD_POOL_DEFAULT, CastRun, this, op_parameter_->thread_num_); } kernel::LiteKernel *CpuCastFp32KernelCreator(const std::vector &inputs, diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/constant_of_shape.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/constant_of_shape.cc index 54d49ef017..b3330d9479 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/constant_of_shape.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/constant_of_shape.cc @@ -41,7 +41,7 @@ int ConstantOfShapeCPUKernel::DoExecute(int task_id) { return RET_OK; } -int ConstantOfShapeRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ConstantOfShapeRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->DoExecute(task_id); if (ret != RET_OK) { @@ -62,7 +62,7 @@ int ConstantOfShapeCPUKernel::Run() { param_->unit_ = UP_DIV(param_->element_sz_, thread_num); param_->op_parameter_.thread_num_ = thread_num; out_ptr_ = reinterpret_cast(out_tensors_.front()->Data()); - auto ret = LiteBackendParallelLaunch(ConstantOfShapeRun, this, thread_num); + auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConstantOfShapeRun, this, thread_num); if (ret != RET_OK) { MS_LOG(ERROR) << "ConstantOfShapeRun error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc index 641a8570f3..32d9c6e3fa 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution.cc @@ -172,7 +172,7 @@ int ConvolutionCPUKernel::RunImpl(int task_id) { return RET_OK; } -int ConvolutionImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ConvolutionImpl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -200,7 
+200,7 @@ int ConvolutionCPUKernel::Run() { PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); - int error_code = LiteBackendParallelLaunch(ConvolutionImpl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ConvolutionImpl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv error error_code[" << error_code << "]"; FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc index f4c63ec4b7..56e1cbe492 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_1x1.cc @@ -149,7 +149,7 @@ int Convolution1x1CPUKernel::DoConv1x1(int task_id) { return RET_OK; } -int Convolution1x1Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int Convolution1x1Run(void *cdata, int task_id) { auto conv1x1 = reinterpret_cast(cdata); auto error_code = conv1x1->DoConv1x1(task_id); if (error_code != RET_OK) { @@ -179,7 +179,7 @@ int Convolution1x1CPUKernel::Run() { Pre1x1Trans(src_in + batch_index * conv_param_->input_h_ * conv_param_->input_w_ * conv_param_->input_channel_, src_out + batch_index * matmul_param_->row_ * matmul_param_->col_); - int error_code = LiteBackendParallelLaunch(Convolution1x1Run, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, Convolution1x1Run, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv1x1 strassen error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc index fbc15eb739..9d384c0f1e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_3x3.cc @@ -212,7 
+212,7 @@ int Convolution3x3CPUKernel::RunImpl(int task_id) { return RET_OK; } -int Convolution3x3Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int Convolution3x3Impl(void *cdata, int task_id) { auto conv3x3 = reinterpret_cast(cdata); auto error_code = conv3x3->RunImpl(task_id); if (error_code != RET_OK) { @@ -262,7 +262,7 @@ int Convolution3x3CPUKernel::Run() { PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); - int error_code = LiteBackendParallelLaunch(Convolution3x3Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, Convolution3x3Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv3x3 error error_code[" << error_code << "]"; FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc index fe11e5fc1b..53ea4cf09f 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise.cc @@ -89,7 +89,7 @@ int ConvolutionDepthwiseCPUKernel::Execute(int task_id) { return RET_OK; } -int ConvDwRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ConvDwRun(void *cdata, int task_id) { auto conv_dw = reinterpret_cast(cdata); auto ret = conv_dw->Execute(task_id); if (ret != RET_OK) { @@ -116,7 +116,7 @@ int ConvolutionDepthwiseCPUKernel::Run() { auto output_tensor = out_tensors_.at(kOutputIndex); output_ptr_ = reinterpret_cast(output_tensor->Data()); - ret = LiteBackendParallelLaunch(ConvDwRun, this, conv_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwRun, this, conv_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvDwRun error: error_code[" << ret << "]"; return RET_ERROR; diff --git 
a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow.cc index c21e5c53f2..10ed18bb03 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_depthwise_slidewindow.cc @@ -123,7 +123,7 @@ int ConvolutionDepthwiseSWCPUKernel::Execute(int task_id) { return RET_OK; } -int ConvDwSWRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ConvDwSWRun(void *cdata, int task_id) { auto conv_dw = reinterpret_cast(cdata); auto ret = conv_dw->Execute(task_id); if (ret != RET_OK) { @@ -167,7 +167,7 @@ int ConvolutionDepthwiseSWCPUKernel::Run() { packed_output_ = output_ptr; } - ret = LiteBackendParallelLaunch(ConvDwSWRun, this, conv_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwSWRun, this, conv_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvDwSWRun error: error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc index 11c561133c..8489e8151c 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_slidewindow.cc @@ -159,7 +159,7 @@ int ConvolutionSWCPUKernel::RunImpl(int task_id) { return RET_OK; } -int ConvolutionSWImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ConvolutionSWImpl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -187,7 +187,7 @@ int ConvolutionSWCPUKernel::Run() { PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); - int error_code = LiteBackendParallelLaunch(ConvolutionSWImpl, 
this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ConvolutionSWImpl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv error error_code[" << error_code << "]"; FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc index ad5596d052..d31f1059de 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/convolution_winograd.cc @@ -343,7 +343,7 @@ int ConvolutionWinogradCPUKernel::RunImpl(int task_id) { return RET_OK; } -int ConvolutionWinogradImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ConvolutionWinogradImpl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -395,7 +395,7 @@ int ConvolutionWinogradCPUKernel::Run() { PackNHWCToNHWC4Fp32(ori_input_data, nhwc4_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); - int error_code = LiteBackendParallelLaunch(ConvolutionWinogradImpl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ConvolutionWinogradImpl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv winograd error error_code[" << error_code << "]"; FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/crop.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/crop.cc index b8c4bca55f..711db31678 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/crop.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/crop.cc @@ -30,13 +30,13 @@ using mindspore::schema::PrimitiveType_Crop; namespace mindspore::kernel { namespace { -int CropLaunch(int thread_id, LiteParallelGroupEnv *penv, void *cdata) { +int CropLaunch(void *cdata, int task_id) { if (cdata == nullptr) { MS_LOG(ERROR) << "Input cdata is nullptr!"; return 
RET_NULL_PTR; } auto kernel = reinterpret_cast(cdata); - return kernel->CropParallelRun(thread_id); + return kernel->CropParallelRun(task_id); } } // namespace @@ -68,7 +68,7 @@ int CropCPUKernel::Run() { return RET_OK; } - auto ret = LiteBackendParallelLaunch(CropLaunch, this, param->op_parameter_.thread_num_); + auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, CropLaunch, this, param->op_parameter_.thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "Crop launch fail!ret: " << ret; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc index 82eaca56ef..bc831df25a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution.cc @@ -109,7 +109,7 @@ int DeConvolutionCPUKernel::InitParam() { return RET_OK; } -int DeConvFp32Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int DeConvFp32Run(void *cdata, int task_id) { auto deconv = reinterpret_cast(cdata); auto error_code = deconv->DoDeconv(task_id); if (error_code != RET_OK) { @@ -194,7 +194,7 @@ int DeConvolutionCPUKernel::Run() { RowMajor2Col12Major(input_ptr_, pack_input_, input_plane_, conv_param_->input_channel_); - error_code = LiteBackendParallelLaunch(DeConvFp32Run, this, thread_count_); + error_code = ParallelLaunch(THREAD_POOL_DEFAULT, DeConvFp32Run, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "deconv fp32 run error! 
error_code[" << error_code << "]"; return error_code; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc index 844a239b9f..10a097a047 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/deconvolution_depthwise.cc @@ -134,7 +134,7 @@ int DeconvolutionDepthwiseCPUKernel::Execute(int task_id) { return RET_OK; } -int DeconvDwRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int DeconvDwRun(void *cdata, int task_id) { auto deconv_dw = reinterpret_cast(cdata); auto ret = deconv_dw->Execute(task_id); if (ret != RET_OK) { @@ -178,7 +178,7 @@ int DeconvolutionDepthwiseCPUKernel::Run() { packed_output_ = output_addr; } - ret = LiteBackendParallelLaunch(DeconvDwRun, this, conv_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, DeconvDwRun, this, conv_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "DeconvDwRun error: error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/elu.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/elu.cc index 09636fccfb..bd54b2e2be 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/elu.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/elu.cc @@ -46,7 +46,7 @@ int EluCPUKernel::DoExcute(int task_id) { return RET_OK; } -int EluRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int EluRun(void *cdata, int task_id) { auto EluData = reinterpret_cast(cdata); auto ret = EluData->DoExcute(task_id); if (ret != RET_OK) { @@ -65,7 +65,7 @@ int EluCPUKernel::Run() { input_addr = reinterpret_cast(in_tensors_.front()->Data()); output_addr = reinterpret_cast(out_tensors_.front()->Data()); - auto ret = LiteBackendParallelLaunch(EluRun, this, elu_parameter_->thread_num_); + auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, EluRun, this, elu_parameter_->thread_num_); if (ret != RET_OK) { 
MS_LOG(ERROR) << "Elu error: error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/embedding_lookup.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/embedding_lookup.cc index ee0e316035..ef832f6257 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/embedding_lookup.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/embedding_lookup.cc @@ -61,7 +61,7 @@ int EmbeddingLookupCPUKernel::DoExcute(int task_id) { return RET_OK; } -int EmbeddingLookupRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int EmbeddingLookupRun(void *cdata, int task_id) { auto EmbeddingLookupData = reinterpret_cast(cdata); auto ret = EmbeddingLookupData->DoExcute(task_id); if (ret != RET_OK) { @@ -102,7 +102,7 @@ int EmbeddingLookupCPUKernel::Run() { output_addr_ = reinterpret_cast(out_tensors_.front()->Data()); ids_addr_ = reinterpret_cast(in_tensors_.back()->Data()); - auto ret = LiteBackendParallelLaunch(EmbeddingLookupRun, this, embedding_lookup_parameter_->thread_num); + auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, EmbeddingLookupRun, this, embedding_lookup_parameter_->thread_num); context_->allocator->Free(input_addr_); context_->allocator->Free(embedding_lookup_parameter_->is_regulated_); if (ret != RET_OK) { diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/expandDims.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/expandDims.cc index d196bc0a51..3a49462bb2 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/expandDims.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/expandDims.cc @@ -56,7 +56,7 @@ int ExpandDimsCPUKernel::DoExpandDims(int task_id) { return RET_OK; } -int ExpandDimsRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ExpandDimsRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->DoExpandDims(task_id); if (ret != RET_OK) { @@ -74,7 +74,7 @@ int ExpandDimsCPUKernel::Run() { } in_ptr_ = reinterpret_cast(in_tensors_.at(0)->Data()); out_ptr_ = 
reinterpret_cast(out_tensors_.at(0)->Data()); - auto ret = LiteBackendParallelLaunch(ExpandDimsRun, this, thread_sz_count_); + auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, ExpandDimsRun, this, thread_sz_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "ExpandDimsRun error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/fill.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/fill.cc index 561d92b6ee..3ae36bf99d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/fill.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fill.cc @@ -56,7 +56,7 @@ int FillCPUKernel::DoFill(int task_id) { return RET_OK; } -int FillRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int FillRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->DoFill(task_id); if (ret != RET_OK) { @@ -77,7 +77,7 @@ int FillCPUKernel::Run() { auto fill_data = reinterpret_cast(fillData->Data()); src_data_ = fill_data[0]; out_ptr_ = reinterpret_cast(output->Data()); - auto ret = LiteBackendParallelLaunch(FillRun, this, thread_sz_count_); + auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, FillRun, this, thread_sz_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "FillRun error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc index 2aaea20be8..226f609a98 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/fullconnection.cc @@ -94,7 +94,7 @@ void FullconnectionCPUKernel::InitMatrixB(float *src_ptr, float *dst_ptr) { RowMajor2Col8Major(src_ptr, dst_ptr, fc_param_->col_, fc_param_->deep_); } -int FcFp32MatmulRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int FcFp32MatmulRun(void *cdata, int task_id) { auto fc = reinterpret_cast(cdata); auto error_code = fc->DoMatmul(task_id); if (error_code != RET_OK) { @@ -129,7 
+129,7 @@ int FullconnectionCPUKernel::Run() { if (!fc_param_->a_const_) InitMatrixA(a_ptr, a_c12_ptr_); if (!fc_param_->b_const_) InitMatrixB(b_ptr, b_r8_ptr_); - LiteBackendParallelLaunch(FcFp32MatmulRun, this, thread_count_); + ParallelLaunch(THREAD_POOL_DEFAULT, FcFp32MatmulRun, this, thread_count_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/gather.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/gather.cc index cd44d271c0..3a2a77497f 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/gather.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/gather.cc @@ -89,7 +89,7 @@ int GatherCPUKernel::DoGather(int task_id) { return error_code; } -int GatherRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int GatherRun(void *cdata, int task_id) { auto gather_kernel = reinterpret_cast(cdata); auto error_code = gather_kernel->DoGather(task_id); if (error_code != RET_OK) { @@ -112,7 +112,7 @@ int GatherCPUKernel::Run() { context_->allocator->Free(indices_data_); return RET_ERROR; } - int error_code = LiteBackendParallelLaunch(GatherRun, this, op_parameter_->thread_num_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, GatherRun, this, op_parameter_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Gather function error error_code[" << error_code << "]"; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd.cc index 518d74589e..961178e734 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/gatherNd.cc @@ -105,7 +105,7 @@ int GatherNdCPUKernel::DoGatherNd(int task_id) { return RET_OK; } -int GatherNdRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int GatherNdRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->DoGatherNd(task_id); if (ret != RET_OK) { @@ -123,7 +123,7 @@ int GatherNdCPUKernel::Run() { } in_ptr_ = 
reinterpret_cast(in_tensors_.front()->Data()); out_ptr_ = reinterpret_cast(out_tensors_.front()->Data()); - auto ret = LiteBackendParallelLaunch(GatherNdRun, this, thread_sz_count_); + auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, GatherNdRun, this, thread_sz_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "gatherNd error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/leaky_relu.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/leaky_relu.cc index df623ff0e3..ec2a4cbc26 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/leaky_relu.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/leaky_relu.cc @@ -30,7 +30,7 @@ using mindspore::schema::PrimitiveType_Prelu; namespace mindspore::kernel { namespace { -int LeakyReluRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int LeakyReluRun(void *cdata, int task_id) { auto kernel_relu = reinterpret_cast(cdata); auto ret = kernel_relu->DoExcute(task_id); if (ret != RET_OK) { @@ -66,7 +66,7 @@ int LeakyReluCPUKernel::Run() { input_data = reinterpret_cast(input->Data()); output_data = reinterpret_cast(out_tensors_.at(0)->Data()); - auto ret = LiteBackendParallelLaunch(LeakyReluRun, this, context_->thread_num_); + auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, LeakyReluRun, this, context_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "PReluDwRun error: error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/local_response_norm.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/local_response_norm.cc index 85cc36f414..15de35e18b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/local_response_norm.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/local_response_norm.cc @@ -63,7 +63,7 @@ int LocalResponseNormCPUKernel::DoLocalResponseNorm(int task_id) { return RET_OK; } -int LocalResponseNormRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int LocalResponseNormRun(void *cdata, int task_id) { auto lrn = 
reinterpret_cast(cdata); auto error_code = lrn->DoLocalResponseNorm(task_id); if (error_code != RET_OK) { @@ -79,7 +79,7 @@ int LocalResponseNormCPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; return prepare_ret; } - int error_code = LiteBackendParallelLaunch(LocalResponseNormRun, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, LocalResponseNormRun, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "LocalResponseNorm function error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc index c88a6423bd..61392a80a4 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/matmul.cc @@ -147,7 +147,7 @@ int MatmulCPUKernel::RunImpl(int task_id) { return RET_OK; } -int MatmulFloatRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int MatmulFloatRun(void *cdata, int task_id) { auto op = reinterpret_cast(cdata); auto error_code = op->RunImpl(task_id); if (error_code != RET_OK) { @@ -178,7 +178,7 @@ int MatmulCPUKernel::Run() { a_ptr_ = a_c12_ptr_ + i * params_->row_12_ * params_->deep_; b_ptr_ = b_r8_ptr_ + i * params_->deep_ * params_->col_8_; c_ptr_ = c_src + i * params_->row_ * params_->col_; - LiteBackendParallelLaunch(MatmulFloatRun, this, thread_count_); + ParallelLaunch(THREAD_POOL_DEFAULT, MatmulFloatRun, this, thread_count_); } return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/one_hot.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/one_hot.cc index 757482f02a..c0bb5f87eb 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/one_hot.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/one_hot.cc @@ -81,7 +81,7 @@ int OneHotCPUKernel::ReSize() { return RET_OK; } -int RunOneHot(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int RunOneHot(void *cdata, int task_id) { auto 
onehot_kernel = reinterpret_cast(cdata); if (onehot_kernel == nullptr) { MS_LOG(ERROR) << "cast OneHotCPUKernel failed"; @@ -166,7 +166,7 @@ int OneHotCPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; return prepare_ret; } - int error_code = LiteBackendParallelLaunch(RunOneHot, this, context_->thread_num_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, RunOneHot, this, context_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << "OneHot function error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/pad.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/pad.cc index 012f4ab4f4..51d9e9d1a1 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/pad.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pad.cc @@ -68,7 +68,7 @@ int PadCPUKernel::ReSize() { return RET_OK; } -int PadImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int PadImpl(void *cdata, int task_id) { auto padKernel = reinterpret_cast(cdata); int error_code = padKernel->RunImpl(task_id); if (error_code != NNACL_OK) { @@ -102,7 +102,7 @@ int PadCPUKernel::Run() { auto output_data = reinterpret_cast(output->Data()); memset(output_data, 0, output_size * sizeof(float)); - int error_code = LiteBackendParallelLaunch(PadImpl, this, context_->thread_num_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, PadImpl, this, context_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Pad run error, error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.cc index 61009096d5..1bcfcaff33 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/pooling.cc @@ -78,7 +78,7 @@ int PoolingCPUKernel::RunImpl(int task_id) { return RET_OK; } -int PoolingImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int 
PoolingImpl(void *cdata, int task_id) { auto pooling = reinterpret_cast(cdata); auto error_code = pooling->RunImpl(task_id); if (error_code != RET_OK) { @@ -94,7 +94,7 @@ int PoolingCPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; return prepare_ret; } - int error_code = LiteBackendParallelLaunch(PoolingImpl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, PoolingImpl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "pooling error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/power.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/power.cc index 61212e207d..4b1cef6fdb 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/power.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/power.cc @@ -30,7 +30,7 @@ int PowerCPUKernel::Init() { return RET_OK; } int PowerCPUKernel::ReSize() { return RET_OK; } -int PowerImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int PowerImpl(void *cdata, int task_id) { auto kernel = reinterpret_cast(cdata); auto ret = kernel->RunImpl(task_id); if (ret != RET_OK) { @@ -46,7 +46,7 @@ int PowerCPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; return prepare_ret; } - auto ret = LiteBackendParallelLaunch(PowerImpl, this, thread_count_); + auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, PowerImpl, this, thread_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "PowerCPUKernel error: " << ret; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/prelu.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/prelu.cc index 337f9edb6b..b28248611a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/prelu.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/prelu.cc @@ -28,7 +28,7 @@ using mindspore::schema::PrimitiveType_CaffePReLU; namespace mindspore::kernel { namespace { -int PReluRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int PReluRun(void *cdata, 
int task_id) { auto PRelu = reinterpret_cast(cdata); auto ret = PRelu->DoExcute(task_id); if (ret != RET_OK) { @@ -135,7 +135,7 @@ int PReluCPUKernel::Run() { auto negative_slope_tensor = in_tensors_.at(1); prelu_param_->slope_ = reinterpret_cast(negative_slope_tensor->Data()); - auto ret = LiteBackendParallelLaunch(PReluRun, this, prelu_param_->op_parameter_.thread_num_); + auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, PReluRun, this, prelu_param_->op_parameter_.thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "PRelu Run error: error_code[" << ret << "]"; context_->allocator->Free(input_data_); diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc index aaf0b5a5cd..27125a0d4e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/reduce.cc @@ -95,7 +95,7 @@ int ReduceCPUKernel::CallReduceUnit(int task_id) { return ret; } -int ReduceImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ReduceImpl(void *cdata, int task_id) { auto reduce = reinterpret_cast(cdata); auto error_code = reduce->CallReduceUnit(task_id); if (error_code != RET_OK) { @@ -125,7 +125,7 @@ int ReduceCPUKernel::Run() { inner_size_ *= tmp_shape_[k]; } axis_size_ = tmp_shape_[axis]; - auto error_code = LiteBackendParallelLaunch(ReduceImpl, this, context_->thread_num_); + auto error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ReduceImpl, this, context_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]"; return RET_ERROR; @@ -145,7 +145,7 @@ int ReduceCPUKernel::Run() { } axis_size_ = tmp_shape_[last_reduce_axis]; dst_data_ = reinterpret_cast(out_tensors_.at(0)->Data()); - auto error_code = LiteBackendParallelLaunch(ReduceImpl, this, context_->thread_num_); + auto error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ReduceImpl, this, context_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << 
"Reduce run error, error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/resize.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/resize.cc index b4b3c360a1..598284768a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/resize.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/resize.cc @@ -38,7 +38,7 @@ int ResizeCPUKernel::Init() { return ReSize(); } -int ResizeImpl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ResizeImpl(void *cdata, int task_id) { auto resize = reinterpret_cast(cdata); auto error_code = resize->RunImpl(task_id); if (error_code != RET_OK) { @@ -94,7 +94,7 @@ int ResizeCPUKernel::Run() { MS_LOG(ERROR) << "Prepare failed."; return RET_ERROR; } - int error_code = LiteBackendParallelLaunch(ResizeImpl, this, context_->thread_num_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ResizeImpl, this, context_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Resize run error, error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/reverse.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/reverse.cc index e61ff43cb8..4eb82488cc 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/reverse.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/reverse.cc @@ -100,7 +100,7 @@ int ReverseCPUKernel::Init() { return ret; } -int ReverseRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ReverseRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->DoReverse(task_id); if (ret != RET_OK) { @@ -132,7 +132,7 @@ int ReverseCPUKernel::Run() { } in_ptr_ = reinterpret_cast(in_tensors_[0]->Data()); out_ptr_ = reinterpret_cast(out_tensors_[0]->Data()); - ret = LiteBackendParallelLaunch(ReverseRun, this, thread_sz_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ReverseRun, this, thread_sz_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "Reverse run error error_code[" << ret << 
"]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/roi_pooling.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/roi_pooling.cc index 21718c5553..9256ada127 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/roi_pooling.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/roi_pooling.cc @@ -72,7 +72,7 @@ int ROIPoolingCPUKernel::DoExecute(int task_id) { return RET_OK; } -int ROIPoolingRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ROIPoolingRun(void *cdata, int task_id) { auto Data = reinterpret_cast(cdata); auto ret = Data->DoExecute(task_id); if (ret != RET_OK) { @@ -91,7 +91,7 @@ int ROIPoolingCPUKernel::Run() { in_ptr_ = reinterpret_cast(in_tensors_.front()->Data()); out_ptr_ = reinterpret_cast(out_tensors_.front()->Data()); roi_ptr_ = reinterpret_cast(in_tensors_.at(1)->Data()); - ret = LiteBackendParallelLaunch(ROIPoolingRun, this, param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ROIPoolingRun, this, param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "ROIPooling error: error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc index 7405331d12..aee1ff01ee 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/scale.cc @@ -137,7 +137,7 @@ int ScaleCPUKernel::Scale(int task_id) { return RET_OK; } -int ScaleRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ScaleRun(void *cdata, int task_id) { auto scale = reinterpret_cast(cdata); auto ret = scale->Scale(task_id); if (ret != RET_OK) { @@ -162,7 +162,7 @@ int ScaleCPUKernel::Run() { auto out_tensor = out_tensors_.front(); output_ptr_ = reinterpret_cast(out_tensor->Data()); - ret = LiteBackendParallelLaunch(ScaleRun, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ScaleRun, this, op_parameter_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << 
"Scale error error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/scatter_nd.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/scatter_nd.cc index 28b7a3816d..04917fdcf2 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/scatter_nd.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/scatter_nd.cc @@ -137,7 +137,7 @@ int ScatterNDCPUKernel::ScatterND(int task_id) { return RET_OK; } -int ScatterNDRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ScatterNDRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->ScatterND(task_id); if (ret != RET_OK) { @@ -153,7 +153,7 @@ int ScatterNDCPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << ret; return ret; } - ret = LiteBackendParallelLaunch(ScatterNDRun, this, thread_n_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ScatterNDRun, this, thread_n_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "ScatterND error error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/slice.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/slice.cc index d81023a373..dccbd7a40a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/slice.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/slice.cc @@ -29,13 +29,13 @@ using mindspore::schema::PrimitiveType_Slice; namespace mindspore::kernel { namespace { -int SliceLaunch(int thread_id, LiteParallelGroupEnv *penv, void *cdata) { +int SliceLaunch(void *cdata, int task_id) { if (cdata == nullptr) { MS_LOG(ERROR) << "Input cdata is nullptr!"; return RET_NULL_PTR; } auto kernel = reinterpret_cast(cdata); - return kernel->SliceParallelRun(thread_id); + return kernel->SliceParallelRun(task_id); } } // namespace @@ -97,7 +97,7 @@ int SliceCPUKernel::Run() { DoSliceNoParallel(input_data, output_data, param); return RET_OK; } - ret = LiteBackendParallelLaunch(SliceLaunch, this, param->op_parameter_.thread_num_); + ret = 
ParallelLaunch(THREAD_POOL_DEFAULT, SliceLaunch, this, param->op_parameter_.thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "slice launch fail!ret: " << ret; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_depth.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_depth.cc index 4e927e736b..e08f383894 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_depth.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/space_to_depth.cc @@ -74,7 +74,7 @@ int SpaceToDepthCPUKernel::SpaceToDepth(int task_id) { return RET_OK; } -int SpaceToDepthRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SpaceToDepthRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->SpaceToDepth(task_id); if (ret != RET_OK) { @@ -93,7 +93,7 @@ int SpaceToDepthCPUKernel::Run() { input_ptr_ = reinterpret_cast(in_tensors_[0]->Data()); output_ptr_ = reinterpret_cast(out_tensors_[0]->Data()); if (in_tensors_[0]->GetFormat() == schema::Format_NHWC) { - ret = LiteBackendParallelLaunch(SpaceToDepthRun, this, thread_h_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SpaceToDepthRun, this, thread_h_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "SpaceToDepth error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_to_dense.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_to_dense.cc index 16f89d1819..3e4d0b92ca 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_to_dense.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/sparse_to_dense.cc @@ -39,7 +39,7 @@ int SparseToDenseCPUKernel::DoExcute(int task_id) { return RET_OK; } -int SparseToDenseRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SparseToDenseRun(void *cdata, int task_id) { auto s2ddata = reinterpret_cast(cdata); auto ret = s2ddata->DoExcute(task_id); if (ret != RET_OK) { @@ -70,7 +70,7 @@ int SparseToDenseCPUKernel::Run() { std::vector temp_shape = 
output0->shape(); output_shape_ = reinterpret_cast(temp_shape.data()); - ret = LiteBackendParallelLaunch(SparseToDenseRun, this, s2d_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SparseToDenseRun, this, s2d_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "SparseToDenseRun error: error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/split.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/split.cc index 7fe0f68eae..cb56abae50 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/split.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/split.cc @@ -62,7 +62,7 @@ int SplitCPUKernel::Split(int task_id) { return RET_OK; } -int SplitRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SplitRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->Split(task_id); if (ret != RET_OK) { @@ -83,7 +83,7 @@ int SplitCPUKernel::Run() { for (int i = 0; i < param->num_split_; i++) { output_ptr_[i] = reinterpret_cast(out_tensors_.at(i)->Data()); } - ret = LiteBackendParallelLaunch(SplitRun, this, thread_n_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SplitRun, this, thread_n_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "Scale error error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/transpose.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/transpose.cc index 4985aa6d5a..283906e3d3 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/transpose.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/transpose.cc @@ -72,7 +72,7 @@ int TransposeCPUKernel::TransposeParallel(int task_id) { return RET_OK; } -int TransposeRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int TransposeRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->TransposeParallel(task_id); if (ret != RET_OK) { @@ -101,7 +101,7 @@ int TransposeCPUKernel::Run() { in_shape_ = 
const_cast(in_tensor->shape().data()); out_shape_ = const_cast(out_tensor->shape().data()); - ret = LiteBackendParallelLaunch(TransposeRun, this, thread_h_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, TransposeRun, this, thread_h_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "Tranpose error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/unsqueeze.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/unsqueeze.cc index 57d545e24c..496c8e3f8a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/unsqueeze.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/unsqueeze.cc @@ -55,7 +55,7 @@ int UnsqueezeCPUKernel::DoUnsqueeze(int task_id) { return RET_OK; } -int UnsqueezeRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int UnsqueezeRun(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->DoUnsqueeze(task_id); if (ret != RET_OK) { @@ -73,7 +73,7 @@ int UnsqueezeCPUKernel::Run() { } in_ptr_ = reinterpret_cast(in_tensors_.at(0)->Data()); out_ptr_ = reinterpret_cast(out_tensors_.at(0)->Data()); - ret = LiteBackendParallelLaunch(UnsqueezeRun, this, thread_sz_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, UnsqueezeRun, this, thread_sz_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "UnsqueezeRun error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32/where.cc b/mindspore/lite/src/runtime/kernel/arm/fp32/where.cc index 4a853aa2c0..3a35179a20 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32/where.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32/where.cc @@ -38,7 +38,7 @@ int WhereCPUKernel::DoExcute(int task_id) { return RET_OK; } -int WhereRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int WhereRun(void *cdata, int task_id) { auto wheredata = reinterpret_cast(cdata); auto ret = wheredata->DoExcute(task_id); if (ret != RET_OK) { @@ -79,7 +79,7 @@ int WhereCPUKernel::Run() { MS_LOG(ERROR) << "Error, inputs' 
length are zero !!!"; return RET_ERROR; } - ret = LiteBackendParallelLaunch(WhereRun, this, where_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, WhereRun, this, where_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "WhereDwRun error: error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/fp32_grad/activation_grad.cc b/mindspore/lite/src/runtime/kernel/arm/fp32_grad/activation_grad.cc index a49dcc663b..6cc4999499 100644 --- a/mindspore/lite/src/runtime/kernel/arm/fp32_grad/activation_grad.cc +++ b/mindspore/lite/src/runtime/kernel/arm/fp32_grad/activation_grad.cc @@ -70,7 +70,7 @@ int ActivationGradCPUKernel::DoActivation(int task_id) { return RET_OK; } -int ActivationGradRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ActivationGradRun(void *cdata, int task_id) { auto activationGrad_kernel = reinterpret_cast(cdata); auto error_code = activationGrad_kernel->DoActivation(task_id); if (error_code != RET_OK) { @@ -81,7 +81,7 @@ int ActivationGradRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { } int ActivationGradCPUKernel::Run() { - int error_code = LiteBackendParallelLaunch(ActivationGradRun, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ActivationGradRun, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Activation function error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc index 8b03aaa9e3..f7eea6ccd1 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.cc @@ -102,17 +102,17 @@ int QuantizedAddCPUKernel::Run() { TileDimensionsUint8(static_cast(in_tensors_.at(0)->Data()), static_cast(in_tensors_.at(1)->Data()), reinterpret_cast(input0_data_), reinterpret_cast(input1_data_), &tile_para); - ret = 
LiteBackendParallelLaunch(AddInt8Run, this, thread_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, AddInt8Run, this, thread_count_); ctx_->allocator->Free(input0_data_); ctx_->allocator->Free(input1_data_); return ret; } - ret = LiteBackendParallelLaunch(AddInt8Run, this, thread_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, AddInt8Run, this, thread_count_); return ret; } -int AddInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int AddInt8Run(void *cdata, int task_id) { auto add = reinterpret_cast(cdata); add->DoExecute(task_id); return lite::RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h index be83375b0d..77d76fbc18 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/add_int8.h @@ -46,7 +46,7 @@ class QuantizedAddCPUKernel : public LiteKernel { int8_t *output_data_ = nullptr; }; -int AddInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata); +int AddInt8Run(void *cdata, int task_id); } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_ADD_INT8_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_int8.cc index c05aac9896..02fa869545 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_int8.cc @@ -36,11 +36,11 @@ using mindspore::schema::PrimitiveType_NotEqual; namespace mindspore::kernel { namespace { -int ArithmeticsInt8Launch(int thread_id, LiteParallelGroupEnv *penv, void *cdata) { +int ArithmeticsInt8Launch(void *cdata, int task_id) { auto arithmetic_kernel = reinterpret_cast(cdata); - auto error_code = arithmetic_kernel->DoArithmetic(thread_id); + auto error_code = arithmetic_kernel->DoArithmetic(task_id); if (error_code != RET_OK) { - MS_LOG(ERROR) << "ArithmeticsRun error thread_id[" << 
thread_id << "] error_code[" << error_code << "]"; + MS_LOG(ERROR) << "ArithmeticsRun error thread_id[" << task_id << "] error_code[" << error_code << "]"; return error_code; } return RET_OK; @@ -151,7 +151,7 @@ int ArithmeticInt8CPUKernel::Run() { } TileDimensionsInt8(input_data0, input_data1, tile_data0_, tile_data1_, param); } - ret = LiteBackendParallelLaunch(ArithmeticsInt8Launch, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ArithmeticsInt8Launch, this, op_parameter_->thread_num_); if (param->broadcasting_) { context_->allocator->Free(tile_data0_); context_->allocator->Free(tile_data1_); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_self_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_self_int8.cc index 56b1a0fc2c..43c3a36123 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_self_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/arithmetic_self_int8.cc @@ -65,7 +65,7 @@ int ArithmeticSelfInt8CPUKernel::ReSize() { return RET_OK; } -int ArithmeticSelfInt8Runs(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ArithmeticSelfInt8Runs(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->DoArithmeticSelf(task_id); if (ret != RET_OK) { @@ -104,7 +104,7 @@ int ArithmeticSelfInt8CPUKernel::Run() { auto out_tensor = out_tensors_.at(0); in_ptr_ = reinterpret_cast(input_tensor->Data()); out_ptr_ = reinterpret_cast(out_tensor->Data()); - ret = LiteBackendParallelLaunch(ArithmeticSelfInt8Runs, this, thread_sz_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ArithmeticSelfInt8Runs, this, thread_sz_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "ArithmeticSelfRun error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/batchnorm_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/batchnorm_int8.cc index e702bcc27d..846f6f16b4 100644 --- 
a/mindspore/lite/src/runtime/kernel/arm/int8/batchnorm_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/batchnorm_int8.cc @@ -180,7 +180,7 @@ int BatchnormInt8CPUKernel::DoExecute(int task_id) { return RET_OK; } -int BatchNormInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int BatchNormInt8Run(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->DoExecute(task_id); if (ret != RET_OK) { @@ -199,7 +199,7 @@ int BatchnormInt8CPUKernel::Run() { in_addr_ = reinterpret_cast(in_tensors_.at(0)->Data()); out_addr_ = reinterpret_cast(out_tensors_.at(0)->Data()); - int ret = LiteBackendParallelLaunch(BatchNormInt8Run, this, batchnorm_param_->op_parameter_.thread_num_); + int ret = ParallelLaunch(THREAD_POOL_DEFAULT, BatchNormInt8Run, this, batchnorm_param_->op_parameter_.thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "BatchnormRun error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/concat_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/concat_int8.cc index d60bde5f3e..8aad2d5716 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/concat_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/concat_int8.cc @@ -104,12 +104,12 @@ int ConcatInt8CPUKernel::Run() { } output_data_ = reinterpret_cast(out_tensors_.at(0)->Data()); - ret = LiteBackendParallelLaunch(ConcatInt8Run, this, thread_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConcatInt8Run, this, thread_count_); return ret; } -int ConcatInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ConcatInt8Run(void *cdata, int task_id) { auto concat = reinterpret_cast(cdata); concat->DoExecute(task_id); return lite::RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/concat_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/concat_int8.h index 7a677034db..0f8780fd2f 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/concat_int8.h +++ 
b/mindspore/lite/src/runtime/kernel/arm/int8/concat_int8.h @@ -56,7 +56,7 @@ class ConcatInt8CPUKernel : public ConcatBaseCPUKernel { int8_t *output_data_ = nullptr; }; -int ConcatInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata); +int ConcatInt8Run(void *cdata, int task_id); } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_CONCAT_INT8_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc index 9e8c7968be..a64d94d1f8 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_1x1_int8.cc @@ -217,7 +217,7 @@ int Convolution1x1Int8CPUKernel::RunImpl(int task_id) { return RET_OK; } -int Convolution1x1Int8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int Convolution1x1Int8Impl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -253,7 +253,7 @@ int Convolution1x1Int8CPUKernel::Run() { PackInputSum16x4Int8(packed_input_, input_sum_, matmul_param_->deep_, matmul_param_->col_, matmul_param_->row_, conv_param_); - int error_code = LiteBackendParallelLaunch(Convolution1x1Int8Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, Convolution1x1Int8Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv1x1 fp16 error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc index 5dbddbaba2..56e7557c4b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_3x3_int8.cc @@ -213,7 +213,7 @@ int Convolution3x3Int8CPUKernel::RunImpl(int task_id) { return RET_OK; } -int 
Convolution3x3Int8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int Convolution3x3Int8Impl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -238,7 +238,7 @@ int Convolution3x3Int8CPUKernel::Run() { auto input_addr = reinterpret_cast(in_tensors_.at(kInputIndex)->Data()); PackInputToC8Int8(input_addr, input_data_, conv_param_); - int error_code = LiteBackendParallelLaunch(Convolution3x3Int8Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, Convolution3x3Int8Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv3x3 int8 error error_code[" << error_code << "]"; FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc index 4cf2b00ec8..3b8bfa935a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_depthwise_int8.cc @@ -128,7 +128,7 @@ int ConvolutionDepthwiseInt8CPUKernel::Execute(int task_id) { return RET_OK; } -int ConvDwInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ConvDwInt8Run(void *cdata, int task_id) { auto conv_dw_int8 = reinterpret_cast(cdata); auto ret = conv_dw_int8->Execute(task_id); if (ret != RET_OK) { @@ -164,7 +164,7 @@ int ConvolutionDepthwiseInt8CPUKernel::Run() { packed_output_ = output_addr; } - ret = LiteBackendParallelLaunch(ConvDwInt8Run, this, conv_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ConvDwInt8Run, this, conv_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "ConvDwInt8Run error: error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc index 28dfc57340..e3f6703741 100644 --- 
a/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/convolution_int8.cc @@ -338,7 +338,7 @@ int ConvolutionInt8CPUKernel::RunImpl(int task_id) { return RET_OK; } -int ConvolutionInt8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ConvolutionInt8Impl(void *cdata, int task_id) { auto conv = reinterpret_cast(cdata); auto error_code = conv->RunImpl(task_id); if (error_code != RET_OK) { @@ -374,7 +374,7 @@ int ConvolutionInt8CPUKernel::Run() { convert_func_(ori_input_data, nhwc4_input_, conv_param_->input_batch_, conv_param_->input_h_ * conv_param_->input_w_, conv_param_->input_channel_); - int error_code = LiteBackendParallelLaunch(ConvolutionInt8Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ConvolutionInt8Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "conv int8 error error_code[" << error_code << "]"; FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.cc index 0feb6e6614..afc1c6545d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.cc @@ -67,7 +67,7 @@ int CropInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << ret; return ret; } - ret = LiteBackendParallelLaunch(CropInt8Run, this, thread_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, CropInt8Run, this, thread_count_); return ret; } @@ -91,7 +91,7 @@ void PadOffset(int input_dim, CropParameter *crop_para) { } } -int CropInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int CropInt8Run(void *cdata, int task_id) { auto crop = reinterpret_cast(cdata); crop->DoExecute(task_id); return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.h index 46aabf4354..3cbcaba8eb 100644 --- 
a/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/crop_int8.h @@ -46,7 +46,7 @@ class CropInt8CPUKernel : public CropBaseCPUKernel { CropParameter *crop_para_; }; -int CropInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata); +int CropInt8Run(void *cdata, int task_id); void PadOffset(int input_dim, CropParameter *crop_para); } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc index 74916a7586..ba4dca80fc 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_depthwise_int8.cc @@ -164,7 +164,7 @@ int DeconvolutionDepthwiseInt8CPUKernel::Execute(int task_id) { return RET_OK; } -int DeconvDwInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int DeconvDwInt8Run(void *cdata, int task_id) { auto deconv_dw_int8 = reinterpret_cast(cdata); auto ret = deconv_dw_int8->Execute(task_id); if (ret != RET_OK) { @@ -196,7 +196,7 @@ int DeconvolutionDepthwiseInt8CPUKernel::Run() { packed_output_ = output_addr; } - ret = LiteBackendParallelLaunch(DeconvDwInt8Run, this, conv_param_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, DeconvDwInt8Run, this, conv_param_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "DeconvDwInt8Run error: error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc index 55e8f60d50..8f4b06d55d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/deconvolution_int8.cc @@ -216,7 +216,7 @@ void DeConvInt8CPUKernel::FreeRunBuf() { return; } -int DeConvInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int 
DeConvInt8Run(void *cdata, int task_id) { auto deconv = reinterpret_cast(cdata); auto error_code = deconv->DoDeconv(task_id); if (error_code != RET_OK) { @@ -272,7 +272,7 @@ int DeConvInt8CPUKernel::Run() { DeConvPackInputSum(input_ptr_, input_sum_, conv_param_->conv_quant_arg_.filter_quant_args_[0].zp_, UP_ROUND(matmul_param_->row_, C4NUM), UP_ROUND(matmul_param_->deep_, C16NUM), support_optimize_); - error_code = LiteBackendParallelLaunch(DeConvInt8Run, this, thread_count_); + error_code = ParallelLaunch(THREAD_POOL_DEFAULT, DeConvInt8Run, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "deconv int8 run error! error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/div_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/div_int8.cc index 3550bace2f..0249cccdf4 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/div_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/div_int8.cc @@ -87,7 +87,7 @@ int DivInt8CPUKernel::DoExecute(int task_id) { return ret; } -int DivInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int DivInt8Run(void *cdata, int task_id) { auto div_kernel = reinterpret_cast(cdata); auto ret = div_kernel->DoExecute(task_id); if (ret != RET_OK) { @@ -123,7 +123,7 @@ int DivInt8CPUKernel::Run() { static_cast(in_tensors_.at(1)->Data()), reinterpret_cast(tile0_data_), reinterpret_cast(tile1_data_), &tile_para); } - ret = LiteBackendParallelLaunch(DivInt8Run, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, DivInt8Run, this, op_parameter_->thread_num_); if (broadcast_) { context_->allocator->Free(tile0_data_); context_->allocator->Free(tile1_data_); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc index 54284d72d4..48e4ffec66 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc +++ 
b/mindspore/lite/src/runtime/kernel/arm/int8/fullconnection_int8.cc @@ -118,7 +118,7 @@ int FullconnectionInt8CPUKernel::RunImpl(int task_id) { return RET_OK; } -int FcInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int FcInt8Run(void *cdata, int task_id) { auto fc = reinterpret_cast(cdata); auto ret = fc->RunImpl(task_id); if (ret != RET_OK) { @@ -137,7 +137,7 @@ int FullconnectionInt8CPUKernel::Run() { auto input_ptr = reinterpret_cast(in_tensors_[0]->Data()); RowMajor2Row4x16Major(input_ptr, fc_param_->row_, fc_param_->deep_, a_r4x16_ptr_, d16_); CalcInputSums(input_ptr, fc_param_->row_, fc_param_->deep_, quant_params_.weight.zp_, input_sums_); - LiteBackendParallelLaunch(FcInt8Run, this, thread_count_); + ParallelLaunch(THREAD_POOL_DEFAULT, FcInt8Run, this, thread_count_); return RET_OK; } diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/gatherNd_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/gatherNd_int8.cc index 4a90d375d3..f5539b9195 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/gatherNd_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/gatherNd_int8.cc @@ -114,7 +114,7 @@ int GatherNdInt8CPUKernel::DoGatherNd(int task_id) { return RET_OK; } -int GatherNdInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int GatherNdInt8Run(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->DoGatherNd(task_id); if (ret != RET_OK) { @@ -132,7 +132,7 @@ int GatherNdInt8CPUKernel::Run() { } in_ptr_ = reinterpret_cast(in_tensors_.front()->Data()); out_ptr_ = reinterpret_cast(out_tensors_.front()->Data()); - auto ret = LiteBackendParallelLaunch(GatherNdInt8Run, this, thread_sz_count_); + auto ret = ParallelLaunch(THREAD_POOL_DEFAULT, GatherNdInt8Run, this, thread_sz_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "gatherNd error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/gather_int8.cc 
b/mindspore/lite/src/runtime/kernel/arm/int8/gather_int8.cc index da2e3e6ac5..749123770a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/gather_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/gather_int8.cc @@ -111,7 +111,7 @@ int GatherInt8CPUKernel::DoGather(int task_id) { return RET_OK; } -int GatherInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int GatherInt8Run(void *cdata, int task_id) { auto gather_kernel = reinterpret_cast(cdata); auto error_code = gather_kernel->DoGather(task_id); if (error_code != RET_OK) { @@ -127,7 +127,7 @@ int GatherInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << prepare_ret; return prepare_ret; } - int error_code = LiteBackendParallelLaunch(GatherInt8Run, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, GatherInt8Run, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Gather function error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/hswish_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/hswish_int8.cc index 686e14cf61..8ece51bc81 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/hswish_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/hswish_int8.cc @@ -78,7 +78,7 @@ int HswishInt8CPUKernel::DoActivation(int task_id) { return RET_OK; } -int HswishInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int HswishInt8Run(void *cdata, int task_id) { auto activation_kernel = reinterpret_cast(cdata); auto error_code = activation_kernel->DoActivation(task_id); if (error_code != RET_OK) { @@ -94,7 +94,7 @@ int HswishInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare failed."; return RET_ERROR; } - int error_code = LiteBackendParallelLaunch(HswishInt8Run, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, HswishInt8Run, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "HswishInt8Run function error error_code[" << 
error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/leaky_relu_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/leaky_relu_int8.cc index 9bc770b1cb..2593794921 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/leaky_relu_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/leaky_relu_int8.cc @@ -82,13 +82,13 @@ int LeakyReluInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << ret; return ret; } - ret = LiteBackendParallelLaunch(PreluInt8Run, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, PreluInt8Run, this, op_parameter_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "RunPreluParam failed. errorcode: "; } return RET_OK; } -int PreluInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int PreluInt8Run(void *cdata, int task_id) { auto prelu = reinterpret_cast(cdata); prelu->DoExecute(task_id); return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/leaky_relu_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/leaky_relu_int8.h index 9df765079a..c995f7313d 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/leaky_relu_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/leaky_relu_int8.h @@ -41,7 +41,7 @@ class LeakyReluInt8CPUKernel : public LeakyReluBaseCPUKernel { private: LeakyReluQuantArg quant_prelu_parm_; }; -int PreluInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata); +int PreluInt8Run(void *cdata, int task_id); } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_PRELU_INT8_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc index 935f74d454..aa93c9d4c4 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/matmul_int8.cc @@ -111,7 +111,7 @@ int MatmulInt8CPUKernel::RunImpl(int task_id) { return RET_OK; } -int MatmulInt8Run(int 
task_id, LiteParallelGroupEnv *penv, void *cdata) { +int MatmulInt8Run(void *cdata, int task_id) { auto op = reinterpret_cast(cdata); auto ret = op->RunImpl(task_id); if (ret != RET_OK) { @@ -152,7 +152,7 @@ int MatmulInt8CPUKernel::Run() { auto &q = quant_params_; CalcInputSums(cur_a_ptr, params_->row_, params_->deep_, q.weight.zp_, input_sums_); CalcWeightBiasSums(cur_b_ptr, params_->deep_, params_->col_, q.input.zp_, q.weight.zp_, NULL, weight_bias_sums_); - ret = LiteBackendParallelLaunch(MatmulInt8Run, this, thread_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, MatmulInt8Run, this, thread_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "MatmulInt8Run error: [" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.cc index 42ddfe3d6f..d4cad12b42 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.cc @@ -86,17 +86,17 @@ int MulInt8CPUKernel::Run() { } TileDimensionsInt8(static_cast(in_tensors_.at(0)->Data()), static_cast(in_tensors_.at(1)->Data()), input0_data_, input1_data_, &tile_para); - ret = LiteBackendParallelLaunch(MulInt8Run, this, thread_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, MulInt8Run, this, thread_count_); ctx_->allocator->Free(input0_data_); ctx_->allocator->Free(input1_data_); return ret; } - ret = LiteBackendParallelLaunch(MulInt8Run, this, thread_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, MulInt8Run, this, thread_count_); return ret; } -int MulInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int MulInt8Run(void *cdata, int task_id) { auto mul = reinterpret_cast(cdata); mul->DoExecute(task_id); return lite::RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.h index 36d9984cac..9f00e2e8e1 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.h +++ 
b/mindspore/lite/src/runtime/kernel/arm/int8/mul_int8.h @@ -46,7 +46,7 @@ class MulInt8CPUKernel : public LiteKernel { int8_t *output_data_ = nullptr; }; -int MulInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata); +int MulInt8Run(void *cdata, int task_id); } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_MUL_INT8_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/pad_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/pad_int8.cc index 0f67fa9d9d..f836cfa22a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/pad_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/pad_int8.cc @@ -112,7 +112,7 @@ int PadInt8CPUKernel::RunImpl(int task_id) { return PadConstant4D(in_data_, out_data_, in_dims_, out_dims_, pad_param_->paddings_, task_id, context_->thread_num_); } -int PadInt8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int PadInt8Impl(void *cdata, int task_id) { auto resize = reinterpret_cast(cdata); auto error_code = resize->RunImpl(task_id); if (error_code != RET_OK) { @@ -132,7 +132,7 @@ int PadInt8CPUKernel::Run() { out_data_ = reinterpret_cast(out_tensors_[0]->Data()); memset(out_data_, pad_param_->pad_quant_arg_.constant_value_[0], out_tensors_[0]->ElementsNum() * sizeof(int8_t)); - int error_code = LiteBackendParallelLaunch(PadInt8Impl, this, context_->thread_num_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, PadInt8Impl, this, context_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Resize run error, error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/pooling_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/pooling_int8.cc index f9e990bab4..72749cddb3 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/pooling_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/pooling_int8.cc @@ -69,7 +69,7 @@ int PoolingInt8CPUKernel::RunImpl(int task_id) { return RET_OK; } -int PoolingInt8Impl(int 
task_id, LiteParallelGroupEnv *penv, void *cdata) { +int PoolingInt8Impl(void *cdata, int task_id) { auto pooling = reinterpret_cast(cdata); auto error_code = pooling->RunImpl(task_id); if (error_code != RET_OK) { @@ -85,7 +85,7 @@ int PoolingInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare failed."; return RET_ERROR; } - int error_code = LiteBackendParallelLaunch(PoolingInt8Impl, this, thread_count_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, PoolingInt8Impl, this, thread_count_); if (error_code != RET_OK) { MS_LOG(ERROR) << "poolingInt8 error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/power_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/power_int8.cc index 9e7aa10d62..5abe82c9b9 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/power_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/power_int8.cc @@ -88,7 +88,7 @@ int PowerInt8CPUKernel::DoPower(int task_id) { return ret; } -int PowerInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int PowerInt8Run(void *cdata, int task_id) { auto power_kernel = reinterpret_cast(cdata); auto ret = power_kernel->DoPower(task_id); if (ret != RET_OK) { @@ -103,7 +103,7 @@ int PowerInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare failed."; return ret; } - ret = LiteBackendParallelLaunch(PowerInt8Run, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, PowerInt8Run, this, op_parameter_->thread_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "PowerInt8Run error, error_code[" << ret << "]"; } diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc index 90a2e5aad8..2e498bbcc8 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/reduce_int8.cc @@ -235,7 +235,7 @@ int ReduceInt8CPUKernel::ReSize() { return ret; } -int ReduceInt8Impl(int task_id, 
LiteParallelGroupEnv *penv, void *cdata) { +int ReduceInt8Impl(void *cdata, int task_id) { auto reduce = reinterpret_cast(cdata); auto error_code = reduce->CallReduceUnit(task_id); if (error_code != RET_OK) { @@ -284,7 +284,7 @@ int ReduceInt8CPUKernel::Run() { inner_size_ *= tmp_shape_[k]; } axis_size_ = tmp_shape_[axis]; - auto error_code = LiteBackendParallelLaunch(ReduceInt8Impl, this, context_->thread_num_); + auto error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ReduceInt8Impl, this, context_->thread_num_); if (error_code != RET_OK) { FreeTmpBuffer(); MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]"; @@ -321,7 +321,7 @@ int ReduceInt8CPUKernel::Run() { axis_size_ = tmp_shape_[last_reduce_axis]; last_dst_data_ = reinterpret_cast(out_tensors_.at(0)->Data()); is_last_axis_ = true; - auto error_code = LiteBackendParallelLaunch(ReduceInt8Impl, this, context_->thread_num_); + auto error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ReduceInt8Impl, this, context_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Reduce run error, error_code[" << error_code << "]"; FreeTmpBuffer(); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/relux_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/relux_int8.cc index 8292653610..8ec6f39d58 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/relux_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/relux_int8.cc @@ -58,7 +58,7 @@ int ReluXInt8CPUKernel::DoActivation(int task_id) { return RET_OK; } -int ReluXInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ReluXInt8Run(void *cdata, int task_id) { auto activation_kernel = reinterpret_cast(cdata); auto error_code = activation_kernel->DoActivation(task_id); if (error_code != RET_OK) { @@ -74,7 +74,7 @@ int ReluXInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << ret; return ret; } - int error_code = LiteBackendParallelLaunch(ReluXInt8Run, this, op_parameter_->thread_num_); + int error_code = 
ParallelLaunch(THREAD_POOL_DEFAULT, ReluXInt8Run, this, op_parameter_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << "ReluXInt8Run function error error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/reshape_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/reshape_int8.cc index 02149c3847..a730a61c48 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/reshape_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/reshape_int8.cc @@ -60,11 +60,11 @@ int ReshapeInt8CPUKernel::Run() { elements_num_ = in_tensors_.at(kInputIndex)->ElementsNum(); count_unit_ = op_parameter_->thread_num_ > 1 ? UP_DIV(elements_num_, op_parameter_->thread_num_) : elements_num_; - ret = LiteBackendParallelLaunch(ReshapeInt8Run, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, ReshapeInt8Run, this, op_parameter_->thread_num_); return ret; } -int ReshapeInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ReshapeInt8Run(void *cdata, int task_id) { auto reshape = reinterpret_cast(cdata); reshape->DoExecute(task_id); return lite::RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/reshape_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/reshape_int8.h index 13f2450342..61115acdd8 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/reshape_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/reshape_int8.h @@ -46,7 +46,7 @@ class ReshapeInt8CPUKernel : public ReshapeBaseCPUKernel { int8_t *output_data_ = nullptr; }; -int ReshapeInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata); +int ReshapeInt8Run(void *cdata, int task_id); } // namespace mindspore::kernel #endif // MINDSPORE_LITE_SRC_RUNTIME_KERNEL_ARM_INT8_RESHAPE_INT8_H_ diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/resize_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/resize_int8.cc index 7af0c3f853..aab798265a 100644 --- 
a/mindspore/lite/src/runtime/kernel/arm/int8/resize_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/resize_int8.cc @@ -56,7 +56,7 @@ int ResizeInt8CPUKernel::Init() { return ReSize(); } -int ResizeInt8Impl(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int ResizeInt8Impl(void *cdata, int task_id) { auto resize = reinterpret_cast(cdata); auto error_code = resize->RunImpl(task_id); if (error_code != RET_OK) { @@ -124,7 +124,7 @@ int ResizeInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare failed."; return RET_ERROR; } - int error_code = LiteBackendParallelLaunch(ResizeInt8Impl, this, context_->thread_num_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, ResizeInt8Impl, this, context_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << "Resize run error, error_code[" << error_code << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/sigmoid_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/sigmoid_int8.cc index 750fdcea9f..2add85dd2e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/sigmoid_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/sigmoid_int8.cc @@ -79,7 +79,7 @@ int SigmoidInt8CPUKernel::DoActivation(int task_id) { return RET_OK; } -int SigmoidInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SigmoidInt8Run(void *cdata, int task_id) { auto activation_kernel = reinterpret_cast(cdata); auto error_code = activation_kernel->DoActivation(task_id); if (error_code != RET_OK) { @@ -95,7 +95,7 @@ int SigmoidInt8CPUKernel::Run() { MS_LOG(ERROR) << "Prepare fail!ret: " << ret; return ret; } - int error_code = LiteBackendParallelLaunch(SigmoidInt8Run, this, op_parameter_->thread_num_); + int error_code = ParallelLaunch(THREAD_POOL_DEFAULT, SigmoidInt8Run, this, op_parameter_->thread_num_); if (error_code != RET_OK) { MS_LOG(ERROR) << "SigmoidInt8Run function error error_code[" << error_code << "]"; return RET_ERROR; diff --git 
a/mindspore/lite/src/runtime/kernel/arm/int8/slice_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/slice_int8.cc index 1ba2df8f25..631273ec16 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/slice_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/slice_int8.cc @@ -68,7 +68,7 @@ int SliceInt8CPUKernel::DoSlice(int task_id) { return ret; } -int SliceInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SliceInt8Run(void *cdata, int task_id) { auto slice_kernel = reinterpret_cast(cdata); auto ret = slice_kernel->DoSlice(task_id); if (ret != RET_OK) { @@ -90,7 +90,7 @@ int SliceInt8CPUKernel::Run() { if (param_->size_[1] < param_->op_parameter_.thread_num_) { ret = SliceInt8NoParallel(input_data, output_data, param_); } else { - ret = LiteBackendParallelLaunch(SliceInt8Run, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SliceInt8Run, this, op_parameter_->thread_num_); } if (ret != RET_OK) { diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/softmax_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/softmax_int8.cc index 20f52429e7..1b16a00336 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/softmax_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/softmax_int8.cc @@ -94,7 +94,7 @@ int SoftmaxInt8CPUKernel::DoSoftmax(int task_id) { return RET_OK; } -int SoftmaxRun(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SoftmaxRun(void *cdata, int task_id) { auto softmax_kernel = reinterpret_cast(cdata); auto error_code = softmax_kernel->DoSoftmax(task_id); if (error_code != RET_OK) { @@ -122,7 +122,7 @@ int SoftmaxInt8CPUKernel::Run() { context_->allocator->Free(sum_data_); return RET_ERROR; } - ret = LiteBackendParallelLaunch(SoftmaxRun, this, thread_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SoftmaxRun, this, thread_count_); context_->allocator->Free(exp_data_); context_->allocator->Free(sum_data_); if (ret != RET_OK) { diff --git 
a/mindspore/lite/src/runtime/kernel/arm/int8/split_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/split_int8.cc index b69891f45d..676da57151 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/split_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/split_int8.cc @@ -71,7 +71,7 @@ int SplitInt8CPUKernel::Split(int task_id) { return RET_OK; } -int SplitInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SplitInt8Run(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->Split(task_id); if (ret != RET_OK) { @@ -94,7 +94,7 @@ int SplitInt8CPUKernel::Run() { output_ptr_.push_back(reinterpret_cast(out_tensors_.at(i)->Data())); } - ret = LiteBackendParallelLaunch(SplitInt8Run, this, thread_n_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SplitInt8Run, this, thread_n_num_); if (ret != RET_OK) { MS_LOG(ERROR) << "Scale error error_code[" << ret << "]"; return RET_ERROR; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/squeeze_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/squeeze_int8.cc index 311a3e41f4..c1e4196b6e 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/squeeze_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/squeeze_int8.cc @@ -157,7 +157,7 @@ int SqueezeInt8CPUKernel::Run() { free(*(inputs_array + i)); } - ret = LiteBackendParallelLaunch(SqueezeInt8Run, this, thread_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SqueezeInt8Run, this, thread_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "RunSqueezeParam failed. 
errorcode: "; } @@ -165,7 +165,7 @@ int SqueezeInt8CPUKernel::Run() { return ret; } -int SqueezeInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SqueezeInt8Run(void *cdata, int task_id) { auto Squeeze = reinterpret_cast(cdata); Squeeze->DoExecute(task_id); return RET_OK; diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/squeeze_int8.h b/mindspore/lite/src/runtime/kernel/arm/int8/squeeze_int8.h index 128e32425e..6d205ce62b 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/squeeze_int8.h +++ b/mindspore/lite/src/runtime/kernel/arm/int8/squeeze_int8.h @@ -44,7 +44,7 @@ class SqueezeInt8CPUKernel : public SqueezeBaseCPUKernel { SqueezeQuantArg *quant_Squeeze_parm_; }; -int SqueezeInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata); +int SqueezeInt8Run(void *cdata, int task_id); } // namespace mindspore::kernel diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.cc b/mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.cc index 9ffca499ac..9a4f705072 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/sub_int8.cc @@ -110,7 +110,7 @@ int SubInt8CPUKernel::DoExecute(int task_id) { return RET_OK; } -int SubInt8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int SubInt8Run(void *cdata, int task_id) { auto sub_kernel = reinterpret_cast(cdata); auto ret = sub_kernel->DoExecute(task_id); if (ret != RET_OK) { @@ -147,7 +147,7 @@ int SubInt8CPUKernel::Run() { static_cast(in_tensors_.at(1)->Data()), reinterpret_cast(tile0_data_), reinterpret_cast(tile1_data_), &tile_para); } - ret = LiteBackendParallelLaunch(SubInt8Run, this, op_parameter_->thread_num_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, SubInt8Run, this, op_parameter_->thread_num_); if (broadcast_) { context_->allocator->Free(tile0_data_); context_->allocator->Free(tile1_data_); diff --git a/mindspore/lite/src/runtime/kernel/arm/int8/unsqueeze_int8.cc 
b/mindspore/lite/src/runtime/kernel/arm/int8/unsqueeze_int8.cc index 35c7200b56..98f3d9067a 100644 --- a/mindspore/lite/src/runtime/kernel/arm/int8/unsqueeze_int8.cc +++ b/mindspore/lite/src/runtime/kernel/arm/int8/unsqueeze_int8.cc @@ -70,7 +70,7 @@ int Unsqueezeint8CPUKernel::DoUnsqueeze(int task_id) { return RET_OK; } -int UnsqueezeIn8Run(int task_id, LiteParallelGroupEnv *penv, void *cdata) { +int UnsqueezeIn8Run(void *cdata, int task_id) { auto g_kernel = reinterpret_cast(cdata); auto ret = g_kernel->DoUnsqueeze(task_id); if (ret != RET_OK) { @@ -88,7 +88,7 @@ int Unsqueezeint8CPUKernel::Run() { } in_ptr_ = reinterpret_cast(in_tensors_.at(0)->Data()); out_ptr_ = reinterpret_cast(out_tensors_.at(0)->Data()); - ret = LiteBackendParallelLaunch(UnsqueezeIn8Run, this, thread_sz_count_); + ret = ParallelLaunch(THREAD_POOL_DEFAULT, UnsqueezeIn8Run, this, thread_sz_count_); if (ret != RET_OK) { MS_LOG(ERROR) << "UnsqueezeRun error error_code[" << ret << "]"; return ret; diff --git a/mindspore/lite/src/runtime/parallel_executor.cc b/mindspore/lite/src/runtime/parallel_executor.cc index 5d4c983cea..0d14f7b41d 100644 --- a/mindspore/lite/src/runtime/parallel_executor.cc +++ b/mindspore/lite/src/runtime/parallel_executor.cc @@ -16,26 +16,22 @@ #include #include "src/runtime/parallel_executor.h" -using mindspore::predict::ThreadPool; -using mindspore::predict::TvmEnv; +#include "include/thread_pool_config.h" +#include "src/runtime/runtime_api.h" + #define MAX_THREAD_NUM 8 namespace mindspore::lite { -ParallelExecutor::~ParallelExecutor() { - delete pool; - pool = nullptr; -} +ParallelExecutor::~ParallelExecutor() {} int ParallelExecutor::Prepare(std::vector &kernels) { - pool = new ThreadPool(); - if (pool == nullptr) { + int status = ConfigThreadPool(THREAD_POOL_DEFAULT, MAX_THREAD_NUM, NO_BIND); + if (status != 0) { MS_LOG(ERROR) << "Memory error: fail to new ThreadPool"; return RET_ERROR; } - pool->ConfigMaxThreadNum(MAX_THREAD_NUM); - pool->ConfigThreadPool(NO_BIND, 
MAX_THREAD_NUM); return RET_OK; } -static int RunKernel(int index, TvmEnv *env, void *data) { +static int RunKernel(void *data, int index) { ParallelExecutor *executor = reinterpret_cast(data); auto kernel = executor->GetReadyKernel(index); auto ret = kernel->Run(); @@ -84,7 +80,7 @@ int ParallelExecutor::Run(std::vector &in_tensors, std::vector std::vector newReadyKernels; while (readyKernels.size() > 0) { results.resize(readyKernels.size(), RET_OK); - pool->LaunchWork(RunKernel, this, readyKernels.size()); + ParallelLaunch(THREAD_POOL_DEFAULT, RunKernel, this, readyKernels.size()); if (std::find_if(results.begin(), results.end(), [](const int &ret) { return (ret != 0); }) != results.end()) { return RET_ERROR; diff --git a/mindspore/lite/src/runtime/parallel_executor.h b/mindspore/lite/src/runtime/parallel_executor.h index 492d599110..95dfbbd58f 100644 --- a/mindspore/lite/src/runtime/parallel_executor.h +++ b/mindspore/lite/src/runtime/parallel_executor.h @@ -23,7 +23,6 @@ #include "src/lite_kernel.h" #include "include/lite_session.h" #include "src/executor.h" -#include "src/runtime/thread_pool.h" namespace mindspore::lite { class ParallelExecutor : public Executor { @@ -40,7 +39,6 @@ class ParallelExecutor : public Executor { inline void SetResult(const int index, const int result) { results.at(index) = result; } private: - predict::ThreadPool *pool; std::unordered_map refCount; std::vector readyKernels; std::vector results; diff --git a/mindspore/lite/src/runtime/runtime_api.cc b/mindspore/lite/src/runtime/runtime_api.cc index fa7170404f..374796cdaf 100644 --- a/mindspore/lite/src/runtime/runtime_api.cc +++ b/mindspore/lite/src/runtime/runtime_api.cc @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2019 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,26 +14,19 @@ * limitations under the License. 
*/ +#include "src/runtime/runtime_api.h" #include #include -#include "src/runtime/runtime_api.h" #include "src/runtime/workspace_pool.h" -#include "src/runtime/thread_pool.h" #include "utils/log_adapter.h" static std::mutex gWorkspaceMutex; #ifdef __cplusplus extern "C" { #endif -void LiteAPISetLastError(const char *msg) { - MS_LOG(ERROR) << "The lite api set last error is " << msg; -} +void LiteAPISetLastError(const char *msg) { MS_LOG(ERROR) << "The lite api set last error is " << msg; } -void *LiteBackendAllocWorkspace(int deviceType, - int deviceId, - uint64_t size, - int dtypeCode, - int dtypeBits) { +void *LiteBackendAllocWorkspace(int deviceType, int deviceId, uint64_t size, int dtypeCode, int dtypeBits) { std::lock_guard lock(gWorkspaceMutex); auto p = mindspore::predict::WorkspacePool::GetInstance(); if (p == nullptr) { @@ -52,54 +45,6 @@ int LiteBackendFreeWorkspace(int deviceType, int deviceId, void *ptr) { p->FreeWorkSpaceMem(ptr); return 0; } - -void SetMaxWokerNum(int num) { - auto p = mindspore::predict::GlobalThreadPool(); - if (p == nullptr) { - MS_LOG(ERROR) << "Get thread pool instance failed"; - return; - } - if (num < 0) { - LiteAPISetLastError("The number of work thread is less than 0"); - return; - } - p->ConfigMaxThreadNum(num); -} - -void ConfigThreadPool(int mode, int nthreads) { - auto p = mindspore::predict::GlobalThreadPool(); - if (p == nullptr) { - MS_LOG(ERROR) << "Get thread pool instance failed"; - return; - } - p->ConfigThreadPool(mode, nthreads); -} - -int LiteBackendParallelLaunch(FTVMParallelLambda flambda, void *cdata, int num_task) { - auto p = mindspore::predict::GlobalThreadPool(); - if (p == nullptr) { - MS_LOG(ERROR) << "Get thread pool instance failed"; - return -1; - } - if (!p->LaunchWork(flambda, cdata, num_task)) { - MS_LOG(ERROR) << "launch thread pool work failed"; - return -1; - } - return 0; -} - -void DoAllThreadBind(bool ifBind, int mode) { - auto p = mindspore::predict::GlobalThreadPool(); - if (p == nullptr) 
{ - MS_LOG(ERROR) << "Get thread pool instance failed"; - return; - } - if (!p->BindAllThreads(ifBind, mode)) { - MS_LOG(ERROR) << "do thread cpu bind failed"; - } -} - #ifdef __cplusplus } #endif - diff --git a/mindspore/lite/src/runtime/runtime_api.h b/mindspore/lite/src/runtime/runtime_api.h index cd3942d79e..bd6d23380d 100644 --- a/mindspore/lite/src/runtime/runtime_api.h +++ b/mindspore/lite/src/runtime/runtime_api.h @@ -1,5 +1,5 @@ /** - * Copyright 2020 Huawei Technologies Co., Ltd + * Copyright 2019 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,10 +13,10 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - -#ifndef MINDSPORE_LITE_SRC_RUNTIME_RUNTIME_API_H_ -#define MINDSPORE_LITE_SRC_RUNTIME_RUNTIME_API_H_ +#ifndef PREDICT_SRC_RUNTIME_RUNTIME_API_H_ +#define PREDICT_SRC_RUNTIME_RUNTIME_API_H_ #include +#include "include/thread_pool_config.h" #ifndef INTERNAL_API_DLL #ifdef _WIN32 @@ -32,26 +32,16 @@ #ifdef __cplusplus extern "C" { +#include "src/runtime/thread_pool.h" + #endif -typedef struct { - void *sync_handle; - int32_t num_task; -} LiteParallelGroupEnv; -typedef int (*FTVMParallelLambda)(int task_id, LiteParallelGroupEnv *penv, void *cdata); INTERNAL_API_DLL void LiteAPISetLastError(const char *msg); INTERNAL_API_DLL void *LiteBackendAllocWorkspace(int deviceType, int deviceId, uint64_t size, int dtypeCode, int dtypeBits); INTERNAL_API_DLL int LiteBackendFreeWorkspace(int deviceType, int deviceId, void *ptr); -INTERNAL_API_DLL void SetMaxWokerNum(int num); -INTERNAL_API_DLL void ConfigThreadPool(int mode, int nthreads); -INTERNAL_API_DLL inline void CfgThreadPool(int nthread) { ConfigThreadPool(-1, nthread); } -INTERNAL_API_DLL int LiteBackendParallelLaunch(FTVMParallelLambda flambda, void *cdata, int num_task); INTERNAL_API_DLL int 
LiteBackendRegisterSystemLibSymbol(const char *name, void *ptr); -INTERNAL_API_DLL void DoAllThreadBind(bool ifBind, int mode); - #ifdef __cplusplus } #endif -#endif // MINDSPORE_LITE_SRC_RUNTIME_RUNTIME_API_H_ - +#endif // PREDICT_SRC_RUNTIME_RUNTIME_API_H_ diff --git a/mindspore/lite/src/runtime/thread_pool.c b/mindspore/lite/src/runtime/thread_pool.c new file mode 100644 index 0000000000..42a85cef0b --- /dev/null +++ b/mindspore/lite/src/runtime/thread_pool.c @@ -0,0 +1,797 @@ +/** + * Copyright 2020 Huawei Technologies Co., Ltd + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "src/runtime/thread_pool.h" +#include "include/thread_pool_config.h" +#define _GNU_SOURCE +#include +#include +#include +#include +#include + +#ifdef __ANDROID__ +#define BIND_CORE +#include +#include +#endif + +#ifdef THREAD_POOL_DEBUG +#include +#define LOG_INFO(content, args...) \ + { printf("[INFO] %s|%d|%s: " #content "\r\n", __FILE__, __LINE__, __func__, ##args); } +#else +#define LOG_INFO(content, args...) 
+#endif + +#define RET_TP_OK (0) +#define RET_TP_ERROR (1) +#define RET_TP_SYSTEM_ERROR (-1) + +#define MAX_TASK_NUM (2) +#define MAX_THREAD_NUM (8) +#define MAX_THREAD_POOL_NUM (4) +#define DEFAULT_SPIN_COUNT (30000) + +typedef struct { + int (*func)(void *arg, int); + void *content; +} Task; + +typedef struct Thread { + int thread_pool_id; + int thread_id; + struct Thread *next; + pthread_t pthread; + Task *task_list[MAX_TASK_NUM]; + atomic_int task_size; + atomic_int head; + atomic_int tail; + atomic_bool activate; + atomic_bool is_running; + sem_t sem; +} Thread; + +typedef struct { + Thread *head; + Thread *tail; + pthread_mutex_t lock; + int size; +} ThreadList; + +typedef struct ThreadPool { + ThreadList *thread_list; + int thread_num; + CpuBindMode mode; + atomic_bool is_alive; +} ThreadPool; + +static ThreadPool thread_pool_list[MAX_THREAD_POOL_NUM]; +static atomic_int thread_pool_refcount[MAX_THREAD_POOL_NUM] = {ATOMIC_VAR_INIT(0)}; +static atomic_bool thread_pool_is_created[MAX_THREAD_POOL_NUM] = {ATOMIC_VAR_INIT(false)}; + +ThreadPool *GetInstance(int thread_pool_id) { + if (thread_pool_id < 0 || thread_pool_id >= MAX_THREAD_POOL_NUM) { + LOG_INFO("invaid context id: %d", thread_pool_id); + // DestroyThreadPool(thread_pool_id); + return NULL; + } + return &thread_pool_list[thread_pool_id]; +} + +Thread *GetThread(int thread_pool_id, int thread_id) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed, thread_pool_id: %d, thread_id: %d", thread_pool_id, thread_id); + return NULL; + } + ThreadList *thread_list = thread_pool->thread_list; + if (thread_list == NULL) { + LOG_INFO("thead list is null"); + return NULL; + } + if (thread_id >= thread_list->size) { + LOG_INFO("invalid thread id: %d, thread_pool_id: %d, thread size: %d", thread_id, thread_pool_id, + thread_list->size); + return NULL; + } + if (thread_id == 0) { + return thread_list->head; + } + Thread *thread = 
thread_list->head; + while (thread != NULL) { + if (thread->thread_id == thread_id) { + break; + } + thread = thread->next; + } + return thread; +} + +void FreeThread(ThreadList *thread_list, Thread *thread) { + if (thread_list == NULL) { + LOG_INFO("thead list is null"); + return; + } + if (thread == NULL) { + LOG_INFO("thread is nullptr"); + return; + } + // only support sequential release + thread_list->head = thread->next; + sem_post(&thread->sem); + while (thread != NULL && !thread->is_running) { + sem_destroy(&thread->sem); + free(thread); + thread = NULL; + } +} + +#ifdef BIND_CORE +#define MAX_CORE_NUM (16) +static int gCoreNum = 8; +static int gHigNum = 0; +static int gMidNum = 0; +static int cpu_cores[MAX_CORE_NUM]; +static bool run_once = true; + +#define MAX_CPU_ID (9) +#define MAX_PATH_SIZE (256) +typedef struct { + int core_id; + int max_freq; +} CpuInfo; + +int GetCpuCoreNum() { return (int)sysconf(_SC_NPROCESSORS_CONF); } + +static int ConcatCPUPath(int cpuID, const char *str1, const char *str2, char *str3) { + if (cpuID > MAX_CPU_ID || str1 == NULL || str2 == NULL) { + return RET_TP_ERROR; + } + memset(str3, 0, strlen(str3)); + char *tmp = str3; + char id = cpuID + '0'; + memcpy(tmp, str1, strlen(str1)); + tmp += strlen(str1); + memcpy(tmp, &id, 1); + tmp += 1; + memcpy(tmp, str2, strlen(str2)); + return RET_TP_OK; +} + +int GetMaxFrequence(int core_id) { + char path[MAX_PATH_SIZE] = ""; + int ret = ConcatCPUPath(core_id, "/sys/devices/system/cpu/cpufreq/stats/cpu", "/time_in_state", path); + if (ret != RET_TP_OK) { + LOG_INFO("parse cpuid from /sys/devices/system/cpu/cpufreq/stats/cpu/time_in_state failed!"); + return RET_TP_ERROR; + } + FILE *fp = fopen(path, "rb"); + if (fp == NULL) { + ret = ConcatCPUPath(core_id, "/sys/devices/system/cpu/cpufreq/stats/cpu", "/cpufreq/stats/time_in_state", path); + if (ret != RET_TP_OK) { + LOG_INFO("parse cpuid from /sys/devices/system/cpu/cpufreq/stats/cpu/cpufreq/stats/time_instate failed!"); + return 
RET_TP_ERROR; + } + fp = fopen(path, "rb"); + if (fp == NULL) { + ret = ConcatCPUPath(core_id, "/sys/devices/system/cpu/cpu", "/cpufreq/cpuinfo_max_freq", path); + if (ret != RET_TP_OK) { + LOG_INFO("parse cpuid from /sys/devices/system/cpu/cpufreq/cpuinfo_max_freq failed!"); + return RET_TP_ERROR; + } + fp = fopen(path, "rb"); + if (fp == NULL) { + LOG_INFO("GetCPUMaxFreq failed, cannot find cpuinfo_max_freq."); + return RET_TP_ERROR; + } + int maxFreq = -1; + int result __attribute__((unused)); + result = fscanf(fp, "%d", &maxFreq); + fclose(fp); + return maxFreq; + } + } + int maxFreq = -1; + while (feof(fp) == 0) { + int freq = 0; + int tmp = fscanf(fp, "%d", &freq); + if (tmp != 1) { + break; + } + if (freq > maxFreq) { + maxFreq = freq; + } + } + fclose(fp); + return maxFreq; +} + +int SortCpuProcessor() { + gCoreNum = GetCpuCoreNum(); + if (gCoreNum <= 0) { + LOG_INFO("invalid cpu count"); + return RET_TP_ERROR; + } + CpuInfo freq_set[gCoreNum]; + for (int i = 0; i < gCoreNum; ++i) { + int max_freq = GetMaxFrequence(i); + freq_set[i].core_id = i; + freq_set[i].max_freq = max_freq; + } + // sort core id by frequency + for (int i = 0; i < gCoreNum; ++i) { + for (int j = i + 1; j < gCoreNum; ++j) { + if (freq_set[i].max_freq <= freq_set[j].max_freq) { + CpuInfo temp = freq_set[i]; + freq_set[i] = freq_set[j]; + freq_set[j] = temp; + } + } + } + for (int i = 0; i < gCoreNum; ++i) { + cpu_cores[i] = freq_set[i].core_id; + LOG_INFO("sorted_order: %d, frequency: %d", freq_set[i].core_id, freq_set[i].max_freq); + } + gHigNum = 0; + gMidNum = 0; + int max_freq = freq_set[0].max_freq; + int min_freq = freq_set[gCoreNum - 1].max_freq; + int little = 0; + for (int i = 0; i < gCoreNum; ++i) { + if (freq_set[i].max_freq == max_freq) { + gHigNum++; + } + if (freq_set[i].max_freq == min_freq) { + little++; + } + } + gMidNum = gCoreNum - gHigNum - little; + if (gHigNum == gCoreNum || max_freq == min_freq) { + // fix MTK800 + gHigNum = 2; + gMidNum = 2; + LOG_INFO("core 
frequency may be wrong."); + } + LOG_INFO("gCoreNum: %d, gHigNum: %d, gMidNum: %d, gLitNum: %d", gCoreNum, gHigNum, gMidNum, little); + return RET_TP_OK; +} + +#ifndef CPU_SET +#define CPU_SETSIZE 1024 +#define __NCPUBITS (8 * sizeof(unsigned long)) +typedef struct { + unsigned long __bits[CPU_SETSIZE / __NCPUBITS]; +} cpu_set_t; +#define CPU_SET(cpu, cpusetp) ((cpusetp)->__bits[(cpu) / __NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS))) +#define CPU_ZERO(cpusetp) memset((cpusetp), 0, sizeof(cpu_set_t)) +#endif // CPU_SET + +int SetAffinity(pthread_t thread_id, cpu_set_t *cpuSet) { +#ifdef __ANDROID__ +#if __ANDROID_API__ >= 21 + LOG_INFO("thread: %d, mask: %lu", pthread_gettid_np(thread_id), cpuSet->__bits[0]); + int ret = sched_setaffinity(pthread_gettid_np(thread_id), sizeof(cpu_set_t), cpuSet); + if (ret != RET_TP_OK) { + LOG_INFO("bind thread %d to cpu failed. ERROR %d", pthread_gettid_np(thread_id), ret); + return RET_TP_OK; + } +#endif +#else +#ifdef __APPLE__ + LOG_INFO("not bind thread to apple's cpu."); + return RET_TP_ERROR; +#else + int ret = pthread_setaffinity_np(thread_id, sizeof(cpu_set_t), cpuSet); + if (ret != RET_TP_OK) { + LOG_INFO("set thread: %lu to cpu failed", thread_id); + return RET_TP_SYSTEM_ERROR; + } +#endif // __APPLE__ +#endif + return RET_TP_OK; +} + +int BindMasterThread(int thread_pool_id, bool is_bind) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return RET_TP_ERROR; + } + cpu_set_t mask; + CPU_ZERO(&mask); + if (is_bind) { + unsigned int attach_id; + if (thread_pool->mode == MID_CPU) { + attach_id = cpu_cores[gHigNum + gMidNum - 1]; + } else { + attach_id = cpu_cores[0]; + } + LOG_INFO("mode: %d, attach id: %u", thread_pool->mode, attach_id); + CPU_SET(attach_id, &mask); + } else { + for (int i = 0; i < gHigNum + gMidNum; ++i) { + CPU_SET(cpu_cores[i], &mask); + } + } + int ret = SetAffinity(pthread_self(), &mask); + if (ret != RET_TP_OK) { + 
LOG_INFO("set master thread affinity failed"); + return RET_TP_ERROR; + } + LOG_INFO("BindMasterThread success."); + return RET_TP_OK; +} + +int BindSalverThreads(int thread_pool_id, bool is_bind) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return RET_TP_ERROR; + } + cpu_set_t mask; + if (is_bind && thread_pool->mode != NO_BIND) { + unsigned int attach_id; + for (int i = 0; i < thread_pool->thread_num - 1; ++i) { + if (thread_pool->mode == MID_CPU) { + int core_id = gHigNum + gMidNum - i - 2; + if (core_id >= 0) { + attach_id = cpu_cores[core_id]; + } else { + attach_id = cpu_cores[0]; + } + } else { + attach_id = cpu_cores[i + 1]; + } + LOG_INFO("mode: %d, attach id: %u", thread_pool->mode, attach_id); + CPU_ZERO(&mask); + CPU_SET(attach_id, &mask); + Thread *thread = GetThread(thread_pool_id, i); + if (thread == NULL) { + LOG_INFO("get thread failed, thread_pool_id: %d, thread_id: %d", thread_pool_id, i); + return false; + } + int ret = SetAffinity(thread->pthread, &mask); + if (ret != RET_TP_OK) { + LOG_INFO("set thread affinity failed"); + return RET_TP_ERROR; + } + } + } else { + CPU_ZERO(&mask); + for (int i = 0; i < gHigNum + gMidNum; ++i) { + CPU_SET(cpu_cores[i], &mask); + } + for (int i = 0; i < thread_pool->thread_num - 1; ++i) { + Thread *thread = GetThread(thread_pool_id, i); + if (thread == NULL) { + LOG_INFO("get thread failed, thread_pool_id: %d, thread_id: %d", thread_pool_id, i); + return false; + } + int ret = SetAffinity(thread->pthread, &mask); + if (ret != RET_TP_OK) { + LOG_INFO("set thread affinity failed"); + return RET_TP_ERROR; + } + } + } + LOG_INFO("BindSalverThreads success"); + return RET_TP_OK; +} +#endif + +int BindThreads(int thread_pool_id, bool is_bind, CpuBindMode mode) { +#ifdef BIND_CORE + if (mode == NO_BIND) { + return RET_TP_OK; + } + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get 
thread pool instane failed"); + return RET_TP_ERROR; + } + thread_pool->mode = mode; + int ret = BindMasterThread(thread_pool_id, is_bind); + if (ret != RET_TP_OK) { + LOG_INFO("bind master thread failed."); + } + ret = BindSalverThreads(thread_pool_id, is_bind); + if (ret != RET_TP_OK) { + LOG_INFO("bind salver thread failed."); + } + return ret; +#else + return RET_TP_OK; +#endif +} + +bool PushTaskToQueue(int thread_pool_id, int thread_id, Task *task) { + Thread *thread = GetThread(thread_pool_id, thread_id); + if (thread == NULL) { + LOG_INFO("get thread failed, thread_pool_id: %d, thread_id: %d", thread_pool_id, thread_id); + return false; + } + const int tail_index = atomic_load_explicit(&thread->tail, memory_order_relaxed); + int next = (tail_index + 1) % MAX_TASK_NUM; + if (next == atomic_load_explicit(&thread->head, memory_order_acquire)) { + return false; + } + thread->task_list[tail_index] = task; + atomic_store_explicit(&thread->tail, next, memory_order_release); + atomic_fetch_add_explicit(&thread->task_size, 1, memory_order_relaxed); + // atomic_store_explicit(&thread->task_size, thread->task_size + 1, memory_order_relaxed); + sem_post(&thread->sem); + return true; +} + +bool PopTaskFromQueue(Thread *thread, Task **task) { + if (thread == NULL) { + LOG_INFO("thread is nullptr"); + return false; + } + if (thread->task_size == 0) { + return false; + } + const int head_index = atomic_load_explicit(&thread->head, memory_order_relaxed); + if (head_index == atomic_load_explicit(&thread->tail, memory_order_acquire)) { + return false; + } + *task = thread->task_list[head_index]; + atomic_store_explicit(&thread->head, (head_index + 1) % MAX_TASK_NUM, memory_order_release); + return true; +} + +void WaitAllThread(int thread_pool_id) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return; + } + bool k_success_flag = false; + while (!k_success_flag) { + k_success_flag = true; 
+ for (int i = 0; i < thread_pool->thread_num - 1; ++i) { + Thread *thread = GetThread(thread_pool_id, i); + if (thread == NULL) { + LOG_INFO("get thread failed, thread_pool_id: %d, thread_id: %d", thread_pool_id, i); + return; + } + if (thread->task_size != 0) { + k_success_flag = false; + break; + } + } + } +} + +int DistributeTask(int thread_pool_id, Task *task, int task_num) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return RET_TP_ERROR; + } + if (task_num > thread_pool->thread_num || task_num <= 1) { + LOG_INFO("invalid task num: %d, thread num: %d", task_num, thread_pool->thread_num); + return RET_TP_ERROR; + } + bool k_success_flag = false; + int size = thread_pool->thread_num < task_num ? thread_pool->thread_num : task_num; + for (int i = 0; i < size - 1; ++i) { + do { + k_success_flag = true; + if (!PushTaskToQueue(thread_pool_id, i, task)) { + k_success_flag = false; + } + } while (!k_success_flag); + } + // master thread + task->func(task->content, size - 1); + // wait + WaitAllThread(thread_pool_id); + return RET_TP_OK; +} + +int AddTask(int thread_pool_id, int func(void *, int), void *content, int task_num) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return RET_TP_ERROR; + } + // if single thread, run master thread + if (thread_pool->thread_num <= 1 || task_num <= 1) { + for (int i = 0; i < task_num; ++i) { + func(content, i); + } + return RET_TP_OK; + } + Task task; + task.func = func; + task.content = content; + return DistributeTask(thread_pool_id, &task, task_num); +} + +int ParallelLaunch(int thread_pool_id, int (*func)(void *, int), void *content, int task_num) { + return AddTask(thread_pool_id, func, content, task_num); +} + +void ThreadRun(Thread *thread) { + ThreadPool *thread_pool = GetInstance(thread->thread_pool_id); + if (thread_pool == NULL) { + 
LOG_INFO("get thread pool instane failed"); + return; + } + Task *task = NULL; + int thread_id = thread->thread_id; + int spin_count = 0; + thread->is_running = true; + while (thread_pool->is_alive) { + while (thread->activate) { + if (PopTaskFromQueue(thread, &task)) { + task->func(task->content, thread_id); + atomic_fetch_sub_explicit(&thread->task_size, 1, memory_order_relaxed); + // atomic_store_explicit(&thread->task_size, thread->task_size - 1, memory_order_relaxed); + spin_count = 0; + sem_trywait(&thread->sem); + } else { + sched_yield(); + spin_count++; + } + if (spin_count == DEFAULT_SPIN_COUNT) { + break; + } + } + sem_wait(&thread->sem); + } + thread->is_running = false; +} + +void PushThreadToList(int thread_pool_id, Thread *thread) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return; + } + ThreadList *thread_list = thread_pool->thread_list; + if (thread_list == NULL) { + LOG_INFO("thread list is null"); + DestroyThreadPool(thread_pool_id); + return; + } + pthread_mutex_lock(&thread_list->lock); + if (thread_list->size == 0) { + thread_list->head = thread; + thread_list->tail = thread; + } else { + thread_list->tail->next = thread; + thread_list->tail = thread; + } + thread_list->size++; + pthread_mutex_unlock(&thread_list->lock); +} + +int CreateNewThread(int thread_pool_id, int thread_id) { + LOG_INFO("thread_pool_id: %d, create thread: %d", thread_pool_id, thread_id); + Thread *thread = (Thread *)malloc(sizeof(Thread)); + if (thread == NULL) { + LOG_INFO("create thread failed"); + DestroyThreadPool(thread_pool_id); + return RET_TP_ERROR; + } + thread->thread_pool_id = thread_pool_id; + thread->thread_id = thread_id; + thread->head = ATOMIC_VAR_INIT(0); + thread->tail = ATOMIC_VAR_INIT(0); + thread->task_size = ATOMIC_VAR_INIT(0); + thread->activate = ATOMIC_VAR_INIT(true); + thread->is_running = ATOMIC_VAR_INIT(false); + thread->next = NULL; + 
sem_init(&thread->sem, 0, 0); + PushThreadToList(thread_pool_id, thread); + pthread_create(&thread->pthread, NULL, (void *)ThreadRun, thread); + pthread_detach(thread->pthread); + return RET_TP_OK; +} + +int ReConfigThreadPool(int thread_pool_id, int thread_num, CpuBindMode mode) { + LOG_INFO("reconfig thread pool, thread_pool_id: %d, thread_num: %d, mode: %d", thread_pool_id, thread_num, mode); + if (thread_num <= 0 || thread_num > MAX_THREAD_NUM) { + LOG_INFO("invalid thread num: %d", thread_num); + return RET_TP_ERROR; + } + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return RET_TP_ERROR; + } + if (thread_num <= thread_pool->thread_num) { + LOG_INFO("no need to add thread"); + return RET_TP_OK; + } + int curr_thread_num = thread_pool->thread_num; + thread_pool->thread_num = thread_num > MAX_THREAD_NUM ? MAX_THREAD_NUM : thread_num; + thread_pool->mode = mode; + if (thread_pool->thread_list == NULL) { + thread_pool->thread_list = (ThreadList *)malloc(sizeof(ThreadList)); + if (thread_pool->thread_list == NULL) { + LOG_INFO("create thread list failed"); + DestroyThreadPool(thread_pool_id); + return RET_TP_ERROR; + } + thread_pool->thread_list->head = NULL; + thread_pool->thread_list->tail = NULL; + thread_pool->thread_list->size = 0; + pthread_mutex_init(&thread_pool->thread_list->lock, NULL); + } + int add_thread_num = thread_pool->thread_num - curr_thread_num; + for (int i = curr_thread_num - 1, j = 0; j < add_thread_num; ++i, ++j) { + int ret = CreateNewThread(thread_pool_id, i); + if (ret != RET_TP_OK) { + LOG_INFO("create new thread failed"); + return RET_TP_ERROR; + } + } + return BindThreads(thread_pool_id, true, mode); +} + +int CreateThreadPool(int thread_pool_id, int thread_num, CpuBindMode mode) { + LOG_INFO("create thread pool, thread_pool_id: %d, thread_num: %d, mode: %d", thread_pool_id, thread_num, mode); + if (thread_num <= 0 || thread_num > MAX_THREAD_NUM) { + 
LOG_INFO("invalid thread num: %d", thread_num); + return RET_TP_ERROR; + } +#ifdef BIND_CORE + if (run_once) { + SortCpuProcessor(); + run_once = false; + } +#endif + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return RET_TP_ERROR; + } + thread_pool->thread_num = thread_num > MAX_THREAD_NUM ? MAX_THREAD_NUM : thread_num; + thread_pool->is_alive = ATOMIC_VAR_INIT(true); + thread_pool->mode = mode; + thread_pool->thread_list = NULL; + if (thread_num > 1) { + thread_pool->thread_list = (ThreadList *)malloc(sizeof(ThreadList)); + if (thread_pool->thread_list == NULL) { + LOG_INFO("create thread list failed"); + DestroyThreadPool(thread_pool_id); + return RET_TP_ERROR; + } + thread_pool->thread_list->head = NULL; + thread_pool->thread_list->tail = NULL; + thread_pool->thread_list->size = 0; + pthread_mutex_init(&thread_pool->thread_list->lock, NULL); + } + for (int i = 0; i < thread_pool->thread_num - 1; ++i) { + int ret = CreateNewThread(thread_pool_id, i); + if (ret != RET_TP_OK) { + LOG_INFO("create thread %d failed", i); + DestroyThreadPool(thread_pool_id); + return RET_TP_ERROR; + } + } + return RET_TP_OK; +} + +int ConfigThreadPool(int thread_pool_id, int thread_num, CpuBindMode mode) { + LOG_INFO("config: thread_pool_id: %d, thread_num: %d, mode: %d, is_created: %d, refcount: %d", thread_pool_id, + thread_num, mode, thread_pool_is_created[thread_pool_id], thread_pool_refcount[thread_pool_id]); + if (thread_pool_id >= MAX_THREAD_POOL_NUM) { + LOG_INFO("invalid context id: %d", thread_pool_id); + return RET_TP_ERROR; + } + if (thread_num <= 0 || thread_num > MAX_THREAD_NUM) { + LOG_INFO("invalid thread num: %d", thread_num); + return RET_TP_ERROR; + } + thread_pool_refcount[thread_pool_id] += 1; + int ret; + if (thread_pool_is_created[thread_pool_id]) { + ret = ReConfigThreadPool(thread_pool_id, thread_num, mode); + if (ret != RET_TP_OK) { + LOG_INFO("reconfig thread pool 
failed, thread_pool_id: %d, thread_num: %d, mode: %d", thread_pool_id, thread_num, + mode); + } + } else { + thread_pool_is_created[thread_pool_id] = true; + ret = CreateThreadPool(thread_pool_id, thread_num, mode); + if (ret != RET_TP_OK) { + LOG_INFO("create thread pool failed, thread_pool_id: %d, thread_num: %d, mode: %d", thread_pool_id, thread_num, + mode); + } + } + return ret; +} + +void ActivateThreadPool(int thread_pool_id) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return; + } + ThreadList *thread_list = thread_pool->thread_list; + if (thread_list == NULL) { + LOG_INFO("thread pool: %d list is null", thread_pool_id); + return; + } + Thread *thread = thread_list->head; + while (thread != NULL) { + sem_post(&thread->sem); + thread->activate = true; + thread = thread->next; + } +} + +void DeactivateThreadPool(int thread_pool_id) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return; + } + ThreadList *thread_list = thread_pool->thread_list; + if (thread_list == NULL) { + LOG_INFO("thread pool: %d list is null", thread_pool_id); + return; + } + Thread *thread = thread_list->head; + while (thread != NULL) { + thread->activate = false; + thread = thread->next; + } +} + +void DestroyThreadPool(int thread_pool_id) { + thread_pool_refcount[thread_pool_id]--; + if (thread_pool_refcount[thread_pool_id] > 0) { + LOG_INFO("no need to free, thread_pool_id: %d, refcount: %d", thread_pool_id, thread_pool_refcount[thread_pool_id]); + return; + } + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return; + } + if (thread_pool->thread_list == NULL) { + LOG_INFO("thread pool: %d list is null", thread_pool_id); + return; + } + DeactivateThreadPool(thread_pool_id); + thread_pool_is_created[thread_pool_id] = false; + 
thread_pool->is_alive = false; + for (int i = 0; i < thread_pool->thread_num - 1; ++i) { + Thread *thread = GetThread(thread_pool_id, i); + if (thread != NULL) { + FreeThread(thread_pool->thread_list, thread); + } + } + free(thread_pool->thread_list); + thread_pool->thread_list = NULL; + LOG_INFO("destroy thread pool success, thread_pool_id: %d, refcount: %d", thread_pool_id, + thread_pool_refcount[thread_pool_id]); +} + +int GetCurrentThreadNum(int thread_pool_id) { + ThreadPool *thread_pool = GetInstance(thread_pool_id); + if (thread_pool == NULL) { + LOG_INFO("get thread pool instane failed"); + return 0; + } + return thread_pool->thread_num; +} diff --git a/mindspore/lite/src/runtime/thread_pool.cc b/mindspore/lite/src/runtime/thread_pool.cc deleted file mode 100644 index ecbad2772f..0000000000 --- a/mindspore/lite/src/runtime/thread_pool.cc +++ /dev/null @@ -1,464 +0,0 @@ -/** - * Copyright 2020 Huawei Technologies Co., Ltd - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#include "src/runtime/thread_pool.h" -#include -#include "utils/log_adapter.h" -#ifdef MS_COMPILE_IOS -#include -#include -#include -#endif // MS_COMPILE_IOS - -namespace mindspore { -namespace predict { -constexpr int kDefaultBigCount = 2; -constexpr int kDefaultMidCount = 2; -constexpr uint32_t kDefaultSpinCount = 300000; -constexpr int kSmallCpuNum = 4; -constexpr int kBigMidCpuNum = 4; -constexpr int kDefaultThreadNum = 1; -static unsigned int kDefaultMaxThreadNums = 8; -static unsigned int localMaxThreadNums = 1; -static ThreadPool globalThreadPool; - -ThreadPool *GlobalThreadPool() { return &globalThreadPool; } - -bool LiteQueue::Enqueue(ThreadPoolTask *task) { - const int tailIndex = tail.load(std::memory_order_relaxed); - // queue full - auto next = (tailIndex + 1) % kSingleThreadMaxTask; - if (next == head.load(std::memory_order_acquire)) { - return false; - } - buffer[tailIndex] = task; - tail.store(next, std::memory_order_release); - ++taskSize; - return true; -} - -bool LiteQueue::Dequeue(ThreadPoolTask **out) { - if (taskSize == 0) { - return false; - } - // queue empty - const int headIndex = head.load(std::memory_order_relaxed); - if (headIndex == tail.load(std::memory_order_acquire)) { - return false; - } - *out = buffer[headIndex]; - head.store((headIndex + 1) % kSingleThreadMaxTask, std::memory_order_release); - return true; -} - -bool LiteThreadBind::Bind(bool ifBind, int numThreads, bool master) { - if (master) { - if (!BindMasterThread(ifBind, bindModel)) { - MS_LOG(ERROR) << "bind msater thread failed"; - return false; - } - MS_LOG(DEBUG) << "bind master thread successful"; - } - if (numThreads > static_cast(sortedCpuIds.size())) { - MS_LOG(ERROR) << "thread num " << numThreads << " is larger than cores " << static_cast(sortedCpuIds.size()) - << " in the system"; - return true; - } - - if (!BindThreads(ifBind)) { - MS_LOG(ERROR) << "action " << ifBind << " thread failed"; - return false; - } - MS_LOG(DEBUG) << "action " << ifBind << " 
thread successful"; - return true; -} - -void LiteThreadBind::InitSortedCpuId() { - // mate10(970)|p20(970): 4big, 4small - // mate20(980)|p30(980)|mate30(990): 2big, 2mid, 4small - // note: p30's core 7 not allowed to be bind - int numCores = 0; -#ifdef MS_COMPILE_IOS - size_t len = sizeof(numCores); - sysctlbyname("hw.ncpu", &numCores, &len, NULL, 0); - numCores = numCores > 1 ? numCores : 1; -#else - numCores = static_cast(std::thread::hardware_concurrency()); -#endif // MS_COMPILE_IOS - if (numCores < 0) { - MS_LOG(ERROR) << "get numCores return invalid value: " << numCores; - sortedCpuIds.clear(); - return; - } - if (numCores < kBigMidCpuNum) { - bigCore = 0; - midCore = numCores; - } else { - bigCore = kDefaultBigCount; - midCore = kDefaultMidCount; - } - sortedCpuIds.clear(); - for (int i = numCores - 1; i >= 0; --i) { - sortedCpuIds.emplace_back(i); - } - if (sortedCpuIds.size() > kSmallCpuNum) { - sortedCpuIds.resize(bigCore + midCore); - } -} - -bool LiteThreadBind::BindMasterThread(bool bindFlag, int mode) { - std::vector cpu; - if (bindFlag) { - size_t cpuIndex; - if (mode == MID_CORE) { - cpuIndex = sortedCpuIds.size() - 1; - } else { - cpuIndex = 0; - } - cpu.emplace_back(sortedCpuIds[cpuIndex]); - } else { - // unbind master - cpu.assign(sortedCpuIds.begin(), sortedCpuIds.end()); - } - cpu_set_t cpuSet; -#ifndef CPU_SET - (void)memset(&cpuSet, 0, sizeof(cpu_set_t)); -#else - CPU_ZERO(&cpuSet); -#endif - for (auto coreId : cpu) { -#ifndef CPU_SET - CPU_SET_LOCAL(coreId, &cpuSet); -#else - CPU_SET(coreId, &cpuSet); -#endif - } - if (!SetCPUBind(pthread_self(), &cpuSet)) { - MS_LOG(ERROR) << "do master bind failed. 
mode: " << mode; - return false; - } - return true; -} - -bool LiteThreadBind::BindThreads(bool bindFlag) { - if (bindFlag && bindModel != NO_BIND) { - size_t bindNums = std::min(sortedCpuIds.size(), threadIdList.size()); - cpu_set_t cpuSet; - size_t coreIndex; - for (size_t i = 0; i < bindNums; ++i) { -#ifndef CPU_SET - (void)memset(&cpuSet, 0, sizeof(cpu_set_t)); -#else - CPU_ZERO(&cpuSet); -#endif - if (bindModel == MID_CORE) { - coreIndex = sortedCpuIds.size() - 2 - i; - } else { - coreIndex = i + 1; - } -#ifndef CPU_SET - CPU_SET_LOCAL(sortedCpuIds[coreIndex], &cpuSet); -#else - CPU_SET(sortedCpuIds[coreIndex], &cpuSet); -#endif - if (!SetCPUBind(threadIdList[i], &cpuSet)) { - MS_LOG(ERROR) << "do SetCPUBind failed"; - return false; - } - } - } else { - // unbind - size_t bindNums = std::min(sortedCpuIds.size(), threadIdList.size()); - cpu_set_t cpuSet; -#ifndef CPU_SET - (void)memset(&cpuSet, 0, sizeof(cpu_set_t)); -#else - CPU_ZERO(&cpuSet); -#endif - for (auto coreId : sortedCpuIds) { -#ifndef CPU_SET - CPU_SET_LOCAL(coreId, &cpuSet); -#else - CPU_SET(coreId, &cpuSet); -#endif - } - for (size_t i = 0; i < bindNums; ++i) { - if (!SetCPUBind(threadIdList[i], &cpuSet)) { - MS_LOG(ERROR) << "do SetCPUBind failed"; - return false; - } - } - } - return true; -} - -bool LiteThreadBind::SetCPUBind(pthread_t threadId, cpu_set_t *cpuSet) { -#if defined(__ANDROID__) -#if __ANDROID_API__ >= 21 - int ret = sched_setaffinity(pthread_gettid_np(threadId), sizeof(cpu_set_t), cpuSet); - if (ret != 0) { - MS_LOG(ERROR) << "bind thread " << threadId << "to cpu failed.ERROR " << ret; - } -#endif -#else -#ifdef __APPLE__ - MS_LOG(ERROR) << "not bind thread to apple's cpu."; - return false; -#else -#ifndef _WIN32 - int ret = pthread_setaffinity_np(threadId, sizeof(cpuSet), cpuSet); - if (ret != 0) { - MS_LOG(ERROR) << "bind thread " << threadId << " to cpu failed.ERROR " << ret; - return false; - } -#endif -#endif // __APPLE__ -#endif - return true; -} - -bool 
ThreadPool::SetThreadPool() { - std::lock_guard Lock(poolMutex); - if (configThreadNums <= 0) { - MS_LOG(WARNING) << "numThreads " << configThreadNums << ", must be greater than 0"; - configThreadNums = curThreadRunNums; - } - if (localMaxThreadNums == 0) { - localMaxThreadNums = 1; - } else if (localMaxThreadNums > kDefaultMaxThreadNums) { - localMaxThreadNums = kDefaultMaxThreadNums; - } - if (configThreadNums > static_cast(kDefaultMaxThreadNums)) { - configThreadNums = kDefaultMaxThreadNums; - } - int addNum = 0; - if (configThreadNums > static_cast(kDefaultMaxThreadNums)) { - addNum = configThreadNums - curThreadRunNums; - } else if (static_cast(localMaxThreadNums) > curThreadNums) { - addNum = localMaxThreadNums - curThreadNums; - } - AddNewThread(addNum); - if (curThreadRunNums > static_cast(localMaxThreadNums)) { - SubRunThread(localMaxThreadNums); - } else { - AddRunThread(localMaxThreadNums); - } - return true; -} - -void ThreadPool::AddNewThread(int newNums) { - for (int i = curThreadNums - 1, j = 0; j < newNums; ++i, ++j) { - auto active = new std::atomic_bool{true}; - auto queue = std::make_shared(); - threadList.emplace_back([this, i, active, queue]() { - ThreadPoolTask *task = nullptr; - uint32_t spin_count = 0; - while (!exitRun) { - while (*active) { - if (queue->Dequeue(&task)) { - auto ret = task->first(i + 1, task->second.tvmParam, task->second.cdata); - if (ret != 0) { - errorInfo.emplace_back(std::make_pair(i + 1, std::make_pair(false, ret))); - } - queue->taskSize--; - spin_count = 0; - } else { - ++spin_count; - } - if (spin_count == kDefaultSpinCount) { - *(activateList[i]) = false; - --curThreadRunNums; - spin_count = 0; - break; - } - std::this_thread::yield(); - } - std::unique_lock queueLock(tMutex); - queueReady.wait(queueLock, [active, this] { return exitRun || *active; }); - } - }); - activateList.emplace_back(active); - queueList.emplace_back(queue); - } - curThreadNums += newNums; - curThreadRunNums += newNums; -} - -bool 
ThreadPool::SetThreadCpuBind(bool ifBind, int mode, bool master) { - if (curThreadRunNums <= 0) { - MS_LOG(ERROR) << "no threads need to be bind, totalThreadNum : " << curThreadRunNums; - return false; - } - if (threadBind == nullptr) { - threadBind = std::unique_ptr(new LiteThreadBind()); - if (threadBind == nullptr) { - MS_LOG(ERROR) << "create threadBind failed"; - return false; - } - threadBind->threadIdList.resize(kDefaultMaxThreadNums); - threadBind->InitSortedCpuId(); - } - threadBind->threadIdList.clear(); - for (auto &it : threadList) { - threadBind->threadIdList.emplace_back(it.native_handle()); - } - threadBind->bindModel = static_cast(mode); - if (!threadBind->Bind(ifBind, curThreadRunNums, master)) { - MS_LOG(ERROR) << "bind failed"; - return false; - } - return true; -} - -bool ThreadPool::AddTask(WorkFun &&worker, void *cdata, int numTask) { - if (numTask <= 0) { - numTask = curThreadRunNums; - } - TvmEnv env{}; - env.num_task = numTask; - errorInfo.clear(); - // single task, run master thread - if (curThreadRunNums <= 1) { - for (int i = 0; i < numTask; ++i) { - int ret = worker(i, &env, cdata); - if (ret != 0) { - errorInfo.emplace_back(std::make_pair(0, std::make_pair(false, ret))); - } - } - return CheckResult(); - } - ThreadPoolTask task; - task.first = std::move(worker); - task.second.cdata = cdata; - task.second.tvmParam = &env; - return DistributeTask(&task, numTask); -} - -bool ThreadPool::DistributeTask(ThreadPoolTask *task, int numTask) { - auto taskOri = *task; - if (numTask > curThreadRunNums) { - task->first = [taskOri, numTask, this](int task_id, TvmEnv *penv, void *cdata) -> int { - for (int i = task_id; i < numTask; i += curThreadRunNums) { - int ret = taskOri.first(i, penv, cdata); - if (ret != 0) { - errorInfo.emplace_back(std::make_pair(i + 1, std::make_pair(false, ret))); - } - } - return 0; - }; - } - bool kSuccFlag; - auto size = std::min(curThreadRunNums, numTask); - for (int i = 0; i < size - 1; ++i) { - do { - kSuccFlag = 
true; - if (!queueList[i]->Enqueue(task)) { - std::this_thread::yield(); - kSuccFlag = false; - } - } while (!kSuccFlag); - } - // master thread - int ret = task->first(0, task->second.tvmParam, task->second.cdata); - if (ret != 0) { - errorInfo.emplace_back(std::make_pair(0, std::make_pair(false, ret))); - } - kSuccFlag = false; - while (!kSuccFlag) { - std::this_thread::yield(); - kSuccFlag = true; - for (int i = 0; i < curThreadRunNums - 1; ++i) { - if (queueList[i]->taskSize != 0) { - kSuccFlag = false; - break; - } - } - } - return CheckResult(); -} - -void ThreadPool::AddRunThread(int num) { - int activeNums = num - curThreadRunNums; - if (activeNums <= 0 || static_cast(activateList.size()) < activeNums) { - return; - } - for (int i = curThreadRunNums - 1, j = 0; j < activeNums; ++i, ++j) { - *activateList[i] = true; - } - std::lock_guard queueLock(tMutex); - queueReady.notify_all(); - curThreadRunNums = num; -} - -void ThreadPool::SubRunThread(int num) { - int deactiveNums = curThreadRunNums - num; - if (deactiveNums <= 0) { - return; - } - for (int i = num - 1, j = 0; j < deactiveNums; ++i, ++j) { - *activateList[i] = false; - } - curThreadRunNums = num; -} - -bool ThreadPool::CheckResult() { - bool kSuccFlag = true; - for (auto result : errorInfo) { - if (result.second.first) { - MS_LOG(ERROR) << "task " << result.first << " failed, error code is " << result.second.second; - kSuccFlag = false; - } - } - return kSuccFlag; -} - -bool ThreadPool::LaunchWork(WorkFun worker, void *cdata, int numTask) { - if (!SetThreadPool()) { - return false; - } - return AddTask(std::move(worker), cdata, numTask); -} - -bool ThreadPool::BindAllThreads(bool ifBind, int mode, bool master) { - if (!SetThreadPool()) { - return false; - } - return SetThreadCpuBind(ifBind, mode, master); -} - -void ThreadPool::ConfigThreadPool(int mode, int numThreads) { - configBindMode = mode; - configThreadNums = numThreads; -} - -void ThreadPool::ConfigMaxThreadNum(unsigned int num) { 
localMaxThreadNums = num; } - -ThreadPool::~ThreadPool() { - curThreadRunNums = static_cast(threadList.size() + 1); - exitRun = true; - SubRunThread(kDefaultThreadNum); - queueReady.notify_all(); - for (auto &it : threadList) { - if (it.joinable()) { - it.join(); - } - } - for (const auto &it : activateList) { - delete it; - } -} -} // namespace predict -} // namespace mindspore diff --git a/mindspore/lite/src/runtime/thread_pool.h b/mindspore/lite/src/runtime/thread_pool.h index 6670f7a932..29f15bf325 100644 --- a/mindspore/lite/src/runtime/thread_pool.h +++ b/mindspore/lite/src/runtime/thread_pool.h @@ -17,111 +17,53 @@ #ifndef MINDSPORE_LITE_SRC_RUNTIME_THREAD_POOL_H_ #define MINDSPORE_LITE_SRC_RUNTIME_THREAD_POOL_H_ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "src/runtime/runtime_api.h" +#include +#include "include/thread_pool_config.h" -namespace mindspore { -namespace predict { -#ifndef CPU_SET -const int CPU_SETSIZE = 1024; -#define __NCPUBITS (8 * sizeof(uint64_t)) -typedef struct { - uint64_t __bits[CPU_SETSIZE / __NCPUBITS]; -} cpu_set_t; +/** + * create thread pool and init + * @param thread_num + * @param mode + */ +int ConfigThreadPool(int context_id, int thread_num, CpuBindMode mode); -#define CPU_SET_LOCAL(cpu, cpusetp) ((cpusetp)->__bits[(cpu) / __NCPUBITS] |= (1UL << ((cpu) % __NCPUBITS))) -#endif +/** + * + * @param session_index, support multi session + * @param job + * @param content + * @param task_num + */ +int ParallelLaunch(int context_id, int (*job)(void *, int), void *content, int task_num); -constexpr int kSingleThreadMaxTask = 2; -using TvmEnv = LiteParallelGroupEnv; -using WorkFun = std::function; -using TaskParam = struct Param { - void *cdata; - TvmEnv *tvmParam; -}; -using ThreadPoolTask = std::pair; -enum AffinityMode : int { BIG_CORE = 1, MID_CORE = -1, NO_BIND = 0 }; +/** + * bind each thread to specified cpu core + * @param is_bind + * @param mode + */ +int 
BindThreads(int context_id, bool is_bind, CpuBindMode mode); -class LiteQueue { - public: - LiteQueue() = default; - ~LiteQueue() = default; - bool Enqueue(ThreadPoolTask *task); - bool Dequeue(ThreadPoolTask **out); - std::atomic_int taskSize = {0}; +/** + * activate the thread pool + * @param context_id + */ +void ActivateThreadPool(int context_id); - private: - std::atomic_int head = {0}; - std::atomic_int tail = {0}; - ThreadPoolTask *buffer[kSingleThreadMaxTask]{}; -}; +/** + * deactivate the thread pool + * @param context_id + */ +void DeactivateThreadPool(int context_id); -class LiteThreadBind { - public: - LiteThreadBind() = default; - ~LiteThreadBind() = default; - void InitSortedCpuId(); - bool Bind(bool ifBind, int numThreads, bool master); - AffinityMode bindModel = MID_CORE; - std::vector threadIdList; +/** + * + * @return current thread num + */ +int GetCurrentThreadNum(int context_id); - private: - bool BindMasterThread(bool bindFlag, int mode); - bool BindThreads(bool bindFlag); - bool SetCPUBind(pthread_t threadId, cpu_set_t *cpuSet); - int bigCore = 0; - int midCore = 0; - std::vector sortedCpuIds{}; -}; - -class ThreadPool { - public: - ThreadPool() = default; - ~ThreadPool(); - bool LaunchWork(WorkFun worker, void *cdata, int numTask); - void ConfigThreadPool(int mode, int numThreads); - void ConfigMaxThreadNum(unsigned int num); - bool BindAllThreads(bool ifBind, int mode, bool master = true); - ThreadPool(const ThreadPool &) = delete; - ThreadPool &operator=(const ThreadPool &) = delete; - - private: - bool SetThreadPool(); - void AddNewThread(int newNums); - bool SetThreadCpuBind(bool ifBind, int mode, bool master); - bool AddTask(WorkFun &&worker, void *cdata, int numTask); - bool DistributeTask(ThreadPoolTask *task, int numTask); - void AddRunThread(int num); - void SubRunThread(int num); - bool CheckResult(); - - std::mutex poolMutex; - std::mutex tMutex; - std::condition_variable queueReady; - std::atomic_bool exitRun = {false}; - 
std::vector activateList{}; - int curThreadNums = 1; - int curThreadRunNums = 1; - int configThreadNums = 1; - int configBindMode = -1; - std::vector threadList{}; - std::vector> queueList{}; - std::unique_ptr threadBind{nullptr}; - std::vector>> errorInfo{}; -}; - -ThreadPool* GlobalThreadPool(); -} // namespace predict -} // namespace mindspore +/** + * destroy thread pool, and release resource + */ +void DestroyThreadPool(int context_id); #endif // MINDSPORE_LITE_SRC_RUNTIME_THREAD_POOL_H_ - diff --git a/mindspore/lite/test/CMakeLists.txt b/mindspore/lite/test/CMakeLists.txt index 1a324cb850..c078f9c7b0 100644 --- a/mindspore/lite/test/CMakeLists.txt +++ b/mindspore/lite/test/CMakeLists.txt @@ -179,7 +179,7 @@ set(TEST_LITE_SRC ${KERNEL_OP_SRC} ${LITE_DIR}/src/runtime/allocator.cc ${LITE_DIR}/src/runtime/runtime_api.cc - ${LITE_DIR}/src/runtime/thread_pool.cc + ${LITE_DIR}/src/runtime/thread_pool.c ${LITE_DIR}/src/runtime/workspace_pool.cc ${LITE_DIR}/src/runtime/parallel_executor.cc ${LITE_DIR}/src/ir/tensor.cc diff --git a/mindspore/lite/test/ut/src/infer_test.cc b/mindspore/lite/test/ut/src/infer_test.cc index 9de1a2dde1..2cda4426d1 100644 --- a/mindspore/lite/test/ut/src/infer_test.cc +++ b/mindspore/lite/test/ut/src/infer_test.cc @@ -106,7 +106,7 @@ TEST_F(InferTest, TestConvNode) { meta_graph.reset(); content = nullptr; auto context = new lite::Context; - context->cpu_bind_mode_ = lite::NO_BIND; + context->cpu_bind_mode_ = NO_BIND; context->device_ctx_.type = lite::DT_CPU; context->thread_num_ = 4; auto session = session::LiteSession::CreateSession(context); @@ -205,7 +205,7 @@ TEST_F(InferTest, TestAddNode) { meta_graph.reset(); content = nullptr; auto context = new lite::Context; - context->cpu_bind_mode_ = lite::NO_BIND; + context->cpu_bind_mode_ = NO_BIND; context->device_ctx_.type = lite::DT_CPU; context->thread_num_ = 4; auto session = session::LiteSession::CreateSession(context); @@ -307,7 +307,7 @@ TEST_F(InferTest, TestParallelExecutor) { 
meta_graph.reset(); content = nullptr; auto context = new lite::Context; - context->cpu_bind_mode_ = lite::NO_BIND; + context->cpu_bind_mode_ = NO_BIND; context->device_ctx_.type = lite::DT_CPU; context->thread_num_ = 4; auto session = new SessionWithParallelExecutor(); @@ -348,7 +348,7 @@ TEST_F(InferTest, TestModel) { ASSERT_NE(nullptr, model); delete[] buf[0]; auto context = new lite::Context; - context->cpu_bind_mode_ = lite::NO_BIND; + context->cpu_bind_mode_ = NO_BIND; context->device_ctx_.type = lite::DT_CPU; context->thread_num_ = 4; auto session = session::LiteSession::CreateSession(context); diff --git a/mindspore/lite/tools/converter/CMakeLists.txt b/mindspore/lite/tools/converter/CMakeLists.txt index fe7b9e0836..da853e8781 100644 --- a/mindspore/lite/tools/converter/CMakeLists.txt +++ b/mindspore/lite/tools/converter/CMakeLists.txt @@ -68,7 +68,7 @@ if (WIN32) ${CMAKE_CURRENT_SOURCE_DIR}/../../src/kernel_registry.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../src/common/graph_util.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../src/runtime/runtime_api.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../../src/runtime/thread_pool.cc + ${CMAKE_CURRENT_SOURCE_DIR}/../../src/runtime/thread_pool.c ${CMAKE_CURRENT_SOURCE_DIR}/../../src/runtime/workspace_pool.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../src/runtime/allocator.cc ${CMAKE_CURRENT_SOURCE_DIR}/../../src/executor.cc @@ -122,7 +122,7 @@ set(LITE_SRC ${SRC_DIR}/common/ms_tensor_utils.cc ${SRC_DIR}/runtime/allocator.cc ${SRC_DIR}/runtime/runtime_api.cc - ${SRC_DIR}/runtime/thread_pool.cc + ${SRC_DIR}/runtime/thread_pool.c ${SRC_DIR}/runtime/workspace_pool.cc ${SRC_DIR}/ir/tensor.cc ${SRC_DIR}/context.cc