forked from mindspore-Ecosystem/mindspore

mindspore serving support gpu backend

This commit is contained in:
parent 3708624a25
commit a911b9ef9e
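For orientation before the per-file hunks: this commit wires a GPU inference path through three layers — a GpuInferenceSession registered with the session factory, a GPUGraphImpl behind the cxx_api GraphCell, and the kDeviceTypeGpu constant exported from the public types header. A hypothetical client-side sketch of what that enables follows; kDeviceTypeGpu, Buffer, Context::Instance(), and Model::Build appear in this diff, while Serialization::LoadModel, GraphCell, SetDeviceTarget, and Model::Predict are assumed from the surrounding 1.x cxx_api and may differ by version.

// Hypothetical usage sketch (not part of this commit); assumed API names are marked.
#include "include/api/context.h"
#include "include/api/model.h"
#include "include/api/serialization.h"  // assumed location of Serialization::LoadModel

int main() {
  using namespace mindspore::api;
  // Route execution to the GPU backend added by this commit.
  Context::Instance().SetDeviceTarget(kDeviceTypeGpu);  // SetDeviceTarget is assumed; GetDeviceID() is shown in this diff
  auto graph = Serialization::LoadModel("model.mindir", ModelType::kMindIR);  // assumed loader
  Model model(GraphCell(graph));                                              // assumed constructor shape
  if (model.Build({}) != SUCCESS) {
    return 1;
  }
  std::vector<Buffer> inputs;   // one Buffer per graph input; sizes must match the model
  std::vector<Buffer> outputs;  // filled on success
  return model.Predict(inputs, &outputs) == SUCCESS ? 0 : 1;  // Predict is assumed
}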
@@ -104,6 +104,7 @@ class MS_API Buffer {
extern MS_API const char *kDeviceTypeAscend310;
extern MS_API const char *kDeviceTypeAscend910;
+extern MS_API const char *kDeviceTypeGpu;

constexpr auto kModelOptionDumpCfgPath = "mindspore.option.dump_config_file_path";
constexpr auto kModelOptionInsertOpCfgPath = "mindspore.option.insert_op_config_file_path";  // aipp config file
@@ -13,7 +13,7 @@ if(CMAKE_SYSTEM_NAME MATCHES "Darwin")
endif()

if(ENABLE_GPU)
-    file(GLOB_RECURSE _GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu_session.cc")
+    file(GLOB_RECURSE _GPU_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "gpu_session.cc" "gpu_inference_session.cc")
    list(APPEND _SESSION_SRC_LIST ${_GPU_SRC_LIST})
endif()
@@ -0,0 +1,217 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <algorithm>
#include "backend/session/gpu_inference_session.h"
#include "ir/tensor.h"
#include "ir/anf.h"
#include "ir/param_info.h"
#include "runtime/device/kernel_runtime.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "utils/ms_utils.h"
#include "common/trans.h"
#include "utils/config_manager.h"

namespace mindspore {
namespace session {
void GpuInferenceSession::LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
                                        const std::vector<tensor::TensorPtr> &inputs_const) const {
  MS_EXCEPTION_IF_NULL(kernel_graph);
  std::vector<tensor::TensorPtr> inputs(inputs_const);
  auto input_nodes = kernel_graph->inputs();

  size_t no_weight_input = 0;
  for (size_t i = 0; i < input_nodes.size(); ++i) {
    tensor::TensorPtr tensor = nullptr;
    if (!input_nodes[i]->isa<Parameter>()) {
      MS_LOG(ERROR) << "Kernel graph inputs have an anfnode which is not a Parameter";
      continue;
    }
    auto pk_node = input_nodes[i]->cast<ParameterPtr>();
    MS_EXCEPTION_IF_NULL(pk_node);
    auto device_address = AnfAlgo::GetMutableOutputAddr(pk_node, 0);
    MS_EXCEPTION_IF_NULL(device_address);
    if (!AnfAlgo::IsParameterWeight(pk_node)) {
      tensor = inputs[no_weight_input++];
      if (!device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(pk_node, 0),
                                            LongToSize(tensor->data().nbytes()), tensor->data_type(),
                                            tensor->data_c())) {
        MS_LOG(EXCEPTION) << "SyncHostToDevice failed.";
      }
    }
  }
}

GraphId GpuInferenceSession::CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) {
  auto graph_id = GPUSession::CompileGraphImpl(func_graph);
  auto kernel_graph = GetGraph(graph_id);
  MS_EXCEPTION_IF_NULL(kernel_graph);
  // load weight data to device
  auto input_nodes = kernel_graph->inputs();
  for (size_t i = 0; i < input_nodes.size(); ++i) {
    if (!input_nodes[i]->isa<Parameter>()) {
      MS_LOG(ERROR) << "Kernel graph inputs have an anfnode which is not a Parameter";
      continue;
    }
    auto pk_node = input_nodes[i]->cast<ParameterPtr>();
    MS_EXCEPTION_IF_NULL(pk_node);
    auto device_address = AnfAlgo::GetMutableOutputAddr(pk_node, 0);
    MS_EXCEPTION_IF_NULL(device_address);
    if (AnfAlgo::IsParameterWeight(pk_node)) {
      const auto &param_value = pk_node->default_param();
      MS_EXCEPTION_IF_NULL(param_value);
      auto tensor = std::dynamic_pointer_cast<tensor::Tensor>(param_value);
      MS_EXCEPTION_IF_NULL(tensor);
      if (!device_address->SyncHostToDevice(trans::GetRuntimePaddingShape(pk_node, 0),
                                            LongToSize(tensor->data().nbytes()), tensor->data_type(),
                                            tensor->data_c())) {
        MS_LOG(EXCEPTION) << "SyncHostToDevice failed.";
      }
    }
  }
  return graph_id;
}

bool GpuInferenceSession::CheckModelInputs(uint32_t graph_id, const std::vector<tensor::TensorPtr> &inputs,
                                           std::string *error_msg) const {
  MS_LOG(INFO) << "Start check client inputs, graph id : " << graph_id;
  auto kernel_graph = GetGraph(graph_id);
  MS_EXCEPTION_IF_NULL(kernel_graph);
  auto kernel_graph_inputs = kernel_graph->inputs();
  size_t no_weight_input = 0;
  vector<ParameterPtr> paras;
  // find parameters of graph inputs
  for (size_t i = 0; i < kernel_graph_inputs.size(); ++i) {
    if (!kernel_graph_inputs[i]->isa<Parameter>()) {
      MS_LOG(ERROR) << "Kernel graph inputs have an anfnode which is not a Parameter.";
      continue;
    }
    auto parameter = kernel_graph_inputs[i]->cast<ParameterPtr>();
    if (!AnfAlgo::IsParameterWeight(parameter)) {
      paras.push_back(parameter);
    }
  }

  // check inputs
  for (size_t i = 0; i < paras.size(); ++i) {
    // compare input number
    if (paras.size() != inputs.size()) {
      MS_LOG(ERROR) << "Input number is inconsistent. The actual input number is [" << inputs.size()
                    << "] but the graph input number is [" << paras.size() << "]";
      MS_LOG(ERROR) << "InputsInfo --" << InputsInfo(paras, inputs);
      if (error_msg != nullptr) {
        std::stringstream str_stream;
        str_stream << "Input number is inconsistent. The given input number is [" << inputs.size()
                   << "] but the graph input number is [" << paras.size() << "]\n";
        str_stream << "InputsInfo --" << InputsInfo(paras, inputs);
        *error_msg = str_stream.str();
      }
      return false;
    }
    auto input = inputs[no_weight_input++];
    if (!CompareInput(input, paras[i])) {
      MS_LOG(ERROR) << "Please check the input information.";
      MS_LOG(ERROR) << "InputsInfo --" << InputsInfo(paras, inputs);
      if (error_msg != nullptr) {
        std::stringstream str_stream;
        str_stream << "Please check the input information.\n";
        str_stream << "InputsInfo --" << InputsInfo(paras, inputs);
        *error_msg = str_stream.str();
      }
      return false;
    }
  }
  return true;
}

bool GpuInferenceSession::CompareInput(const tensor::TensorPtr &input, const ParameterPtr &parameter) const {
  MS_EXCEPTION_IF_NULL(input);
  MS_EXCEPTION_IF_NULL(parameter);
  // compare dims
  auto parameter_shape = AnfAlgo::GetOutputDeviceShape(parameter, 0);

  // compare shape
  auto input_shape = input->shape();
  vector<size_t> trans_input;
  (void)std::transform(input_shape.begin(), input_shape.end(), std::back_inserter(trans_input),
                       [](const int64_t dim) { return static_cast<size_t>(dim); });
  auto is_scalar_shape = [](const vector<size_t> &shape) {
    return shape.empty() || (shape.size() == 1 && shape[0] == 1);
  };
  if ((!is_scalar_shape(trans_input) || !is_scalar_shape(parameter_shape)) && (trans_input != parameter_shape)) {
    MS_LOG(ERROR) << "Input shape is inconsistent. The actual shape is " << PrintInputShape(trans_input)
                  << ", but the parameter shape is " << PrintInputShape(parameter_shape)
                  << ". parameter : " << parameter->DebugString();
    return false;
  }

  // compare data type
  auto kernel_build_info = AnfAlgo::GetSelectKernelBuildInfo(parameter);
  if (input->data_type() != kernel_build_info->GetOutputDeviceType(0)) {
    MS_LOG(ERROR) << "Input data type is inconsistent. The actual data type is " << input->data_type()
                  << ", but the parameter data type is " << kernel_build_info->GetOutputDeviceType(0)
                  << ". parameter : " << parameter->DebugString();
    return false;
  }
  return true;
}

template <typename T>
std::string GpuInferenceSession::PrintInputShape(std::vector<T> shape) const {
  string res = "[";
  for (auto dim : shape) {
    res += " " + std::to_string(dim);
  }
  return res + " ]";
}

std::string GpuInferenceSession::InputsInfo(const std::vector<ParameterPtr> &paras,
                                            const std::vector<tensor::TensorPtr> &inputs) const {
  const std::map<TypeId, std::string> dtype_name_map{
    {TypeId::kNumberTypeBegin, "Unknown"},   {TypeId::kNumberTypeBool, "Bool"},
    {TypeId::kNumberTypeFloat64, "Float64"}, {TypeId::kNumberTypeInt8, "Int8"},
    {TypeId::kNumberTypeUInt8, "Uint8"},     {TypeId::kNumberTypeInt16, "Int16"},
    {TypeId::kNumberTypeUInt16, "Uint16"},   {TypeId::kNumberTypeInt32, "Int32"},
    {TypeId::kNumberTypeUInt32, "Uint32"},   {TypeId::kNumberTypeInt64, "Int64"},
    {TypeId::kNumberTypeUInt64, "Uint64"},   {TypeId::kNumberTypeFloat16, "Float16"},
    {TypeId::kNumberTypeFloat32, "Float32"},
  };
  auto data_type_to_string = [&dtype_name_map](TypeId type_id) {
    auto it = dtype_name_map.find(type_id);
    if (it == dtype_name_map.end()) {
      return std::string("Unknown");
    }
    return it->second;
  };

  std::string graph = "graph inputs:{ ";
  for (size_t i = 0; i < paras.size(); ++i) {
    auto &para = paras[i];
    graph += std::to_string(i) + ": dims " + std::to_string(AnfAlgo::GetOutputDeviceShape(para, 0).size()) +
             ", shape " + PrintInputShape(AnfAlgo::GetOutputDeviceShape(para, 0)) + ", data type " +
             data_type_to_string(AnfAlgo::GetSelectKernelBuildInfo(para)->GetOutputDeviceType(0)) + " }";
  }

  std::string actual = "given inputs:{ ";
  for (size_t i = 0; i < inputs.size(); ++i) {
    actual += std::to_string(i) + ": dims " + std::to_string(inputs[i]->shape().size()) + ", shape " +
              PrintInputShape(inputs[i]->shape()) + ", data type " + data_type_to_string(inputs[i]->data_type()) + " }";
  }
  return graph + " " + actual;
}

}  // namespace session
}  // namespace mindspore
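One subtlety in CompareInput above: shapes count as scalar-equivalent when they are empty or exactly {1}, so a client passing a {1}-shaped tensor for a shapeless graph scalar is not rejected; every other mismatch is. A standalone restatement of that rule (illustrative names, not commit code):

// Minimal re-statement of the shape check used by CompareInput.
#include <cstddef>
#include <iostream>
#include <vector>

// A shape denotes a scalar if it is empty or exactly {1}.
static bool IsScalarShape(const std::vector<size_t> &shape) {
  return shape.empty() || (shape.size() == 1 && shape[0] == 1);
}

// Shapes match if both denote scalars, or if they are element-wise equal.
static bool ShapesMatch(const std::vector<size_t> &a, const std::vector<size_t> &b) {
  if (IsScalarShape(a) && IsScalarShape(b)) {
    return true;
  }
  return a == b;
}

int main() {
  std::cout << ShapesMatch({}, {1}) << "\n";         // 1: both scalar
  std::cout << ShapesMatch({2, 3}, {2, 3}) << "\n";  // 1: equal
  std::cout << ShapesMatch({1}, {2}) << "\n";        // 0: rejected
}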
@@ -0,0 +1,53 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_BACKEND_SESSION_GPU_INFERENCE_SESSION_H
#define MINDSPORE_CCSRC_BACKEND_SESSION_GPU_INFERENCE_SESSION_H
#include <unordered_map>
#include <string>
#include <memory>
#include <vector>
#include <utility>
#include <stack>
#include <map>
#include <tuple>
#include <set>
#include "backend/session/gpu_session.h"
#include "backend/session/kernel_graph.h"
#include "backend/kernel_compiler/kernel.h"
#include "backend/session/session_factory.h"

namespace mindspore {
namespace session {
class GpuInferenceSession : public gpu::GPUSession {
 public:
  GpuInferenceSession() = default;
  ~GpuInferenceSession() = default;
  void LoadInputData(const std::shared_ptr<KernelGraph> &kernel_graph,
                     const std::vector<tensor::TensorPtr> &inputs_const) const;
  bool CheckModelInputs(uint32_t graph_id, const std::vector<tensor::TensorPtr> &inputs,
                        std::string *error_msg) const override;
  bool CompareInput(const tensor::TensorPtr &input, const ParameterPtr &parameter) const;
  template <typename T>
  std::string PrintInputShape(std::vector<T> shape) const;
  std::string InputsInfo(const std::vector<ParameterPtr> &paras, const std::vector<tensor::TensorPtr> &inputs) const;

 protected:
  GraphId CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) override;
};
MS_REG_SESSION(kGpuInferenceDevice, GpuInferenceSession);
}  // namespace session
}  // namespace mindspore
#endif  // MINDSPORE_CCSRC_BACKEND_SESSION_GPU_INFERENCE_SESSION_H
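MS_REG_SESSION(kGpuInferenceDevice, GpuInferenceSession) binds the class to the "GpuInference" key, which is how session::SessionFactory::Get().Create(kGpuInferenceDevice) — called by GPUGraphImpl::InitEnv later in this diff — obtains an instance. A simplified, self-contained sketch of that self-registering-factory idiom (not the real session_factory.h):

// Simplified model of the MS_REG_SESSION registration idiom (illustrative only).
#include <functional>
#include <map>
#include <memory>
#include <string>

struct Session {  // stand-in for session::SessionBasic
  virtual ~Session() = default;
};

class SessionFactory {
 public:
  static SessionFactory &Get() {
    static SessionFactory factory;
    return factory;
  }
  void Register(const std::string &name, std::function<std::shared_ptr<Session>()> creator) {
    creators_[name] = std::move(creator);
  }
  std::shared_ptr<Session> Create(const std::string &name) {
    auto it = creators_.find(name);
    return it == creators_.end() ? nullptr : it->second();
  }

 private:
  std::map<std::string, std::function<std::shared_ptr<Session>()>> creators_;
};

// What a macro like MS_REG_SESSION expands to: a static object whose
// constructor runs at load time and registers the creator under the key.
#define REG_SESSION(KEY, CLASS)                                                        \
  static const struct CLASS##Registrar {                                               \
    CLASS##Registrar() {                                                               \
      SessionFactory::Get().Register(KEY, [] { return std::make_shared<CLASS>(); });   \
    }                                                                                  \
  } g_##CLASS##_registrar;

struct GpuInferenceSession : Session {};
REG_SESSION("GpuInference", GpuInferenceSession)

int main() { return SessionFactory::Get().Create("GpuInference") ? 0 : 1; }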
@@ -15,9 +15,11 @@
 */
#include "backend/session/gpu_session.h"
+
+#include <string>
#include "backend/optimizer/common/helper.h"
#include "backend/optimizer/common/optimizer.h"
#include "backend/optimizer/common/pass_manager.h"
#include "backend/optimizer/common/common_backend_optimization.h"
#include "backend/optimizer/gpu/adam_weight_decay_fusion.h"
#include "backend/optimizer/gpu/adam_fusion.h"
#include "backend/optimizer/gpu/apply_momentum_weight_scale_fusion.h"
@@ -298,16 +300,31 @@ void GPUSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph) const

GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
  // Construct the graph; on success, graph_sum_ is incremented
-  auto graph_id = graph_sum_;
  auto graph = ConstructKernelGraph(lst, outputs);
  MS_EXCEPTION_IF_NULL(graph);
+  return CompileGraphImpl(graph);
+}
+
+GraphId GPUSession::CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) {
+  std::vector<KernelGraphPtr> all_graphs;
+  auto root_graph = ConstructKernelGraph(func_graph, &all_graphs);
+  MS_EXCEPTION_IF_NULL(root_graph);
+  if (all_graphs.size() != 1) {
+    MS_LOG(EXCEPTION) << "GPU backend does not support multi-graph scheduling, graph num: " << all_graphs.size();
+  }
+
+  opt::BackendCommonOptimization(root_graph);
+  return CompileGraphImpl(root_graph);
+}
+
+GraphId GPUSession::CompileGraphImpl(KernelGraphPtr graph) {
  // Prepare ms context info for dump .pb graph
  auto context_ptr = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context_ptr);
  bool save_graphs = context_ptr->get_param<bool>(MS_CTX_SAVE_GRAPHS_FLAG);
  // Dump .pb graph before graph optimization
  if (save_graphs) {
-    DumpIRProto(graph, "before_opt_" + std::to_string(graph_id));
+    DumpIRProto(graph, "before_opt_" + std::to_string(graph->graph_id()));
  }
  // Graph optimization irrelevant to device data format
  Optimize(graph);
@@ -326,7 +343,7 @@ GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs)
  AssignStream(graph);
  // Dump .pb graph before remove nop nodes
  if (save_graphs) {
-    DumpIRProto(graph, "before_removeNop_" + std::to_string(graph_id));
+    DumpIRProto(graph, "before_removeNop_" + std::to_string(graph->graph_id()));
  }
  // Update Graph Dynamic Shape Attr.
  UpdateGraphDynamicShapeAttr(NOT_NULL(graph));

@@ -343,7 +360,7 @@ GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs)
  SetSummaryNodes(graph.get());
  // Dump .pb graph after graph optimization
  if (save_graphs) {
-    DumpIRProto(graph, "after_opt_" + std::to_string(graph_id));
+    DumpIRProto(graph, "after_opt_" + std::to_string(graph->graph_id()));
  }
  // Set graph manager.
  MS_EXCEPTION_IF_NULL(context_);
@@ -361,9 +378,8 @@ GraphId GPUSession::CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs)
    debugger_->LoadGraphs(graph);
  }
#endif
-  MS_LOG(INFO) << "CompileGraph graph_id: " << graph_id;
-
-  return graph_id;
+  MS_LOG(INFO) << "CompileGraph graph_id: " << graph->graph_id();
+  return graph->graph_id();
}

void GPUSession::RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs,
@@ -37,6 +37,7 @@ class GPUSession : public SessionBasic {
 protected:
  void UnifyMindIR(const KernelGraphPtr &graph) override { return; }
  GraphId CompileGraphImpl(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) override;
+  GraphId CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) override;
  void RunGraphImpl(const GraphId &graph_id, const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) override;
  void BuildOpImpl(const OpRunInfo &op_run_info, const GraphInfo &graph_info,
                   const std::vector<tensor::TensorPtr> &input_tensors,

@@ -81,6 +82,8 @@ class GPUSession : public SessionBasic {
  void SyncValueNodeDeviceAddr(const std::shared_ptr<KernelGraph> &kernel_graph) const;

  void CleanValueNodeDeviceAddr(const std::shared_ptr<KernelGraph> &kernel_graph) const;
+
+  GraphId CompileGraphImpl(KernelGraphPtr kernel_graph);
};
using GPUSessionPtr = std::shared_ptr<GPUSession>;
MS_REG_SESSION(kGPUDevice, GPUSession);
@@ -15,10 +15,15 @@ if(ENABLE_ACL)
        "model/model_converter_utils/*.cc"
        "graph/acl/*.cc"
    )

endif()

if(ENABLE_D)
-    file(GLOB_RECURSE API_MS_INFER_SRC ${CMAKE_CURRENT_SOURCE_DIR} "python_utils.cc" "model/ms/*.cc" "graph/ms/*.cc")
+    file(GLOB_RECURSE API_MS_INFER_SRC ${CMAKE_CURRENT_SOURCE_DIR}
+        "python_utils.cc" "model/ms/*.cc" "graph/ascend/*.cc")
endif()

+if(ENABLE_GPU)
+    file(GLOB_RECURSE API_MS_INFER_SRC ${CMAKE_CURRENT_SOURCE_DIR} "python_utils.cc" "model/ms/*.cc" "graph/gpu/*.cc")
+endif()
+
set(MSLIB_SRC ${CMAKE_CURRENT_SOURCE_DIR}/types.cc

@@ -98,6 +103,15 @@ if(ENABLE_D)
    target_link_libraries(mindspore_shared_lib PRIVATE ${adump_server})
endif()

+if(ENABLE_GPU)
+    target_link_libraries(mindspore_shared_lib PRIVATE gpu_cuda_lib gpu_queue cublas
+                          ${CUDA_PATH}/lib64/libcurand.so
+                          ${CUDNN_LIBRARY_PATH}
+                          ${CUDA_PATH}/lib64/libcudart.so
+                          ${CUDA_PATH}/lib64/stubs/libcuda.so
+                          ${CUDA_PATH}/lib64/libcusolver.so)
+endif()
+
if(CMAKE_SYSTEM_NAME MATCHES "Linux")
    set(MINDSPORE_RPATH $ORIGIN)
    if(ENABLE_D)

@@ -110,7 +124,8 @@ if(CMAKE_SYSTEM_NAME MATCHES "Linux")
        set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/add-ons)
        set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe/op_tiling)
        set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling)
-        set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling)
+        set(MINDSPORE_RPATH
+            ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling)
    elseif(ENABLE_ACL)
        set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/atc/lib64)
        set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/atc/lib64)

@@ -121,7 +136,8 @@ if(CMAKE_SYSTEM_NAME MATCHES "Linux")
        set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/add-ons)
        set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/opp/op_impl/built-in/ai_core/tbe/op_tiling)
        set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling)
-        set(MINDSPORE_RPATH ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling)
+        set(MINDSPORE_RPATH
+            ${MINDSPORE_RPATH}:/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe/op_tiling)
    endif()

    set_target_properties(mindspore_shared_lib PROPERTIES INSTALL_RPATH ${MINDSPORE_RPATH})
@@ -13,7 +13,7 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#include "cxx_api/graph/ms/ms_graph_impl.h"
+#include "cxx_api/graph/ascend/ascend_graph_impl.h"
#include <algorithm>
#include "include/api/context.h"
#include "cxx_api/factory.h"
@@ -26,43 +26,9 @@
#include "runtime/device/kernel_runtime_manager.h"

namespace mindspore::api {
-API_FACTORY_REG(GraphCell::GraphImpl, Ascend910, MsGraphImpl);
+API_FACTORY_REG(GraphCell::GraphImpl, Ascend910, AscendGraphImpl);

-static DataType TransTypeId2InferDataType(TypeId type_id) {
-  const std::map<TypeId, api::DataType> id2type_map{
-    {TypeId::kNumberTypeBegin, api::kMsUnknown},   {TypeId::kNumberTypeBool, api::kMsBool},
-    {TypeId::kNumberTypeFloat64, api::kMsFloat64}, {TypeId::kNumberTypeInt8, api::kMsInt8},
-    {TypeId::kNumberTypeUInt8, api::kMsUint8},     {TypeId::kNumberTypeInt16, api::kMsInt16},
-    {TypeId::kNumberTypeUInt16, api::kMsUint16},   {TypeId::kNumberTypeInt32, api::kMsInt32},
-    {TypeId::kNumberTypeUInt32, api::kMsUint32},   {TypeId::kNumberTypeInt64, api::kMsInt64},
-    {TypeId::kNumberTypeUInt64, api::kMsUint64},   {TypeId::kNumberTypeFloat16, api::kMsFloat16},
-    {TypeId::kNumberTypeFloat32, api::kMsFloat32},
-  };
-
-  // cppcheck-suppress stlIfFind
-  if (auto it = id2type_map.find(type_id); it != id2type_map.end()) {
-    return it->second;
-  }
-
-  MS_LOG(WARNING) << "Unsupported data id " << type_id;
-  return api::kMsUnknown;
-}
-
-template <class T>
-inline static void ClearIfNotNull(T *vec) {
-  if (vec != nullptr) {
-    vec->clear();
-  }
-}
-
-template <class T, class U = std::vector<T>>
-inline static void PushbackIfNotNull(U *vec, T &&item) {
-  if (vec != nullptr) {
-    vec->emplace_back(item);
-  }
-}
-
-MsGraphImpl::MsGraphImpl()
+AscendGraphImpl::AscendGraphImpl()
    : session_impl_(nullptr),
      graph_id_(0),
      device_type_("Ascend"),
@@ -75,9 +41,9 @@ MsGraphImpl::MsGraphImpl()
      init_flag_(false),
      load_flag_(false) {}

-MsGraphImpl::~MsGraphImpl() { (void)FinalizeEnv(); }
+AscendGraphImpl::~AscendGraphImpl() { (void)FinalizeEnv(); }

-Status MsGraphImpl::InitEnv() {
+Status AscendGraphImpl::InitEnv() {
  if (init_flag_) {
    return SUCCESS;
  }
@@ -108,7 +74,7 @@ Status MsGraphImpl::InitEnv()
  return SUCCESS;
}

-Status MsGraphImpl::FinalizeEnv() {
+Status AscendGraphImpl::FinalizeEnv() {
  if (!init_flag_) {
    return SUCCESS;
  }
@@ -136,7 +102,7 @@ Status MsGraphImpl::FinalizeEnv()
  return SUCCESS;
}

-Status MsGraphImpl::CompileGraph(const std::shared_ptr<FuncGraph> &funcGraphPtr) {
+Status AscendGraphImpl::CompileGraph(const std::shared_ptr<FuncGraph> &funcGraphPtr) {
  MS_ASSERT(session_impl_ != nullptr);
  try {
    graph_id_ = session_impl_->CompileGraph(NOT_NULL(funcGraphPtr));
@@ -147,7 +113,7 @@ Status MsGraphImpl::CompileGraph(const std::shared_ptr<FuncGraph> &funcGraphPtr)
  }
}

-std::vector<tensor::TensorPtr> MsGraphImpl::RunGraph(const std::vector<tensor::TensorPtr> &inputs) {
+std::vector<tensor::TensorPtr> AscendGraphImpl::RunGraph(const std::vector<tensor::TensorPtr> &inputs) {
  try {
    VectorRef outputs;
    session_impl_->RunGraph(graph_id_, inputs, &outputs);
@@ -158,7 +124,7 @@ std::vector<tensor::TensorPtr> MsGraphImpl::RunGraph(const std::vector<tensor::TensorPtr> &inputs)
  }
}

-Status MsGraphImpl::CheckModelInputs(const std::vector<tensor::TensorPtr> &inputs) const {
+Status AscendGraphImpl::CheckModelInputs(const std::vector<tensor::TensorPtr> &inputs) const {
  MS_ASSERT(session_impl_ != nullptr);
  std::string error_msg;
  if (!session_impl_->CheckModelInputs(graph_id_, inputs, &error_msg)) {
@@ -167,7 +133,7 @@ Status MsGraphImpl::CheckModelInputs(const std::vector<tensor::TensorPtr> &inputs)
  return SUCCESS;
}

-Status MsGraphImpl::ExecuteModel(const std::vector<Buffer> &request, std::vector<Buffer> *reply) {
+Status AscendGraphImpl::ExecuteModel(const std::vector<Buffer> &request, std::vector<Buffer> *reply) {
  MS_EXCEPTION_IF_NULL(reply);
  if (context_ == nullptr) {
    MS_LOG(ERROR) << "rtCtx is nullptr";
@@ -206,7 +172,7 @@ Status MsGraphImpl::ExecuteModel(const std::vector<Buffer> &request, std::vector<Buffer> *reply)
  return SUCCESS;
}

-Status MsGraphImpl::GetInputsInfo(std::vector<std::string> *names, std::vector<std::vector<int64_t>> *shapes,
+Status AscendGraphImpl::GetInputsInfo(std::vector<std::string> *names, std::vector<std::vector<int64_t>> *shapes,
                                  std::vector<DataType> *data_types, std::vector<size_t> *mem_sizes) {
  if (!load_flag_) {
    Status ret = Load();
@@ -216,21 +182,21 @@ Status MsGraphImpl::GetInputsInfo(std::vector<std::string> *names, std::vector<std::vector<int64_t>> *shapes,
    }
  }

-  ClearIfNotNull(names);
-  ClearIfNotNull(shapes);
-  ClearIfNotNull(data_types);
-  ClearIfNotNull(mem_sizes);
+  GraphUtils::ClearIfNotNull(names);
+  GraphUtils::ClearIfNotNull(shapes);
+  GraphUtils::ClearIfNotNull(data_types);
+  GraphUtils::ClearIfNotNull(mem_sizes);
  for (size_t i = 0; i < inputs_.size(); i++) {
    auto &tensor = inputs_[i];
-    PushbackIfNotNull(names, input_names_[i]);
-    PushbackIfNotNull(shapes, tensor->shape());
-    PushbackIfNotNull(data_types, TransTypeId2InferDataType(tensor->data_type()));
-    PushbackIfNotNull(mem_sizes, tensor->Size());
+    GraphUtils::PushbackIfNotNull(names, input_names_[i]);
+    GraphUtils::PushbackIfNotNull(shapes, tensor->shape());
+    GraphUtils::PushbackIfNotNull(data_types, GraphUtils::TransTypeId2InferDataType(tensor->data_type()));
+    GraphUtils::PushbackIfNotNull(mem_sizes, tensor->Size());
  }
  return SUCCESS;
}

-Status MsGraphImpl::GetOutputsInfo(std::vector<std::string> *names, std::vector<std::vector<int64_t>> *shapes,
+Status AscendGraphImpl::GetOutputsInfo(std::vector<std::string> *names, std::vector<std::vector<int64_t>> *shapes,
                                   std::vector<DataType> *data_types, std::vector<size_t> *mem_sizes) {
  if (!load_flag_) {
    Status ret = Load();
@@ -240,22 +206,22 @@ Status MsGraphImpl::GetOutputsInfo(std::vector<std::string> *names, std::vector<std::vector<int64_t>> *shapes,
    }
  }

-  ClearIfNotNull(names);
-  ClearIfNotNull(shapes);
-  ClearIfNotNull(data_types);
-  ClearIfNotNull(mem_sizes);
+  GraphUtils::ClearIfNotNull(names);
+  GraphUtils::ClearIfNotNull(shapes);
+  GraphUtils::ClearIfNotNull(data_types);
+  GraphUtils::ClearIfNotNull(mem_sizes);
  for (size_t i = 0; i < outputs_.size(); i++) {
    auto &tensor = outputs_[i];
-    PushbackIfNotNull(names, output_names_[i]);
-    PushbackIfNotNull(shapes, tensor->shape());
-    PushbackIfNotNull(data_types, TransTypeId2InferDataType(tensor->data_type()));
-    PushbackIfNotNull(mem_sizes, tensor->Size());
+    GraphUtils::PushbackIfNotNull(names, output_names_[i]);
+    GraphUtils::PushbackIfNotNull(shapes, tensor->shape());
+    GraphUtils::PushbackIfNotNull(data_types, GraphUtils::TransTypeId2InferDataType(tensor->data_type()));
+    GraphUtils::PushbackIfNotNull(mem_sizes, tensor->Size());
  }

  return SUCCESS;
}

-Status MsGraphImpl::Load() {
+Status AscendGraphImpl::Load() {
  // check graph type
  if (graph_->ModelType() != ModelType::kMindIR) {
    MS_LOG(ERROR) << "Unsupported model type " << graph_->ModelType();
@@ -311,7 +277,7 @@ Status MsGraphImpl::Load()
  return SUCCESS;
}

-Status MsGraphImpl::Run(const std::vector<Buffer> &inputs, std::vector<Buffer> *outputs) {
+Status AscendGraphImpl::Run(const std::vector<Buffer> &inputs, std::vector<Buffer> *outputs) {
  MS_EXCEPTION_IF_NULL(outputs);
  if (!load_flag_) {
    Status ret = Load();
@@ -13,8 +13,8 @@
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
-#ifndef MINDSPORE_CCSRC_CXX_API_GRAPH_MS_MS_GRAPH_IMPL_H
-#define MINDSPORE_CCSRC_CXX_API_GRAPH_MS_MS_GRAPH_IMPL_H
+#ifndef MINDSPORE_CCSRC_CXX_API_GRAPH_MS_ASCEND_GRAPH_IMPL_H
+#define MINDSPORE_CCSRC_CXX_API_GRAPH_MS_ASCEND_GRAPH_IMPL_H
#include <functional>
#include <map>
#include <string>

@@ -28,12 +28,13 @@
#include "ir/anf.h"
#include "cxx_api/model/model_impl.h"
#include "runtime/context.h"
+#include "cxx_api/graph/graph_utils.h"

namespace mindspore::api {
-class MsGraphImpl : public GraphCell::GraphImpl {
+class AscendGraphImpl : public GraphCell::GraphImpl {
 public:
-  MsGraphImpl();
-  ~MsGraphImpl() override;
+  AscendGraphImpl();
+  ~AscendGraphImpl() override;

  Status Run(const std::vector<Buffer> &inputs, std::vector<Buffer> *outputs) override;
  Status Load() override;

@@ -63,4 +64,4 @@ class MsGraphImpl : public GraphCell::GraphImpl {
  bool load_flag_;
};
}  // namespace mindspore::api
-#endif  // MINDSPORE_CCSRC_CXX_API_GRAPH_MS_MS_GRAPH_IMPL_H
+#endif  // MINDSPORE_CCSRC_CXX_API_GRAPH_MS_ASCEND_GRAPH_IMPL_H
@@ -20,7 +20,7 @@ VMCallbackRegister &VMCallbackRegister::GetInstance()
  return instance;
}

-bool VMCallbackRegister::Registe(Status (*pRegProfCtrlCallback)(MsprofCtrlCallback),
+bool VMCallbackRegister::Register(Status (*pRegProfCtrlCallback)(MsprofCtrlCallback),
                                 Status (*pRegProfSetDeviceCallback)(MsprofSetDeviceCallback),
                                 Status (*pRegProfReporterCallback)(MsprofReporterCallback),
                                 Status (*pProfCommandHandle)(ProfCommandHandleType, void *, uint32_t)) {
@@ -0,0 +1,256 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include "cxx_api/graph/gpu/gpu_graph_impl.h"
#include <algorithm>
#include "include/api/context.h"
#include "cxx_api/factory.h"
#include "utils/log_adapter.h"
#include "mindspore/core/base/base_ref_utils.h"
#include "backend/session/session_factory.h"
#include "backend/session/executor_manager.h"
#include "runtime/device/kernel_runtime_manager.h"

namespace mindspore::api {
API_FACTORY_REG(GraphCell::GraphImpl, GPU, GPUGraphImpl);

GPUGraphImpl::GPUGraphImpl()
    : session_impl_(nullptr),
      graph_id_(0),
      device_id_(Context::Instance().GetDeviceID()),
      inputs_(),
      outputs_(),
      input_names_(),
      output_names_(),
      init_flag_(false),
      load_flag_(false) {}

Status GPUGraphImpl::InitEnv() {
  if (init_flag_) {
    MS_LOG(WARNING) << "Initialized again, return success.";
    return SUCCESS;
  }

  auto ms_context = MsContext::GetInstance();
  if (ms_context == nullptr) {
    MS_LOG(ERROR) << "Get Context failed!";
    return FAILED;
  }
  ms_context->set_param<int>(MS_CTX_EXECUTION_MODE, kGraphMode);
  ms_context->set_param<uint32_t>(MS_CTX_DEVICE_ID, device_id_);
  ms_context->set_param<std::string>(MS_CTX_DEVICE_TARGET, kGPUDevice);
  ms_context->set_param<bool>(MS_CTX_ENABLE_INFER_OPT, true);

  session_impl_ = session::SessionFactory::Get().Create(kGpuInferenceDevice);
  if (session_impl_ == nullptr) {
    MS_LOG(ERROR) << "Session create failed, please make sure target device " << kGpuInferenceDevice
                  << " is available.";
    return FAILED;
  }

  session_impl_->Init(device_id_);
  init_flag_ = true;
  return SUCCESS;
}

Status GPUGraphImpl::FinalizeEnv() {
  if (!init_flag_) {
    MS_LOG(WARNING) << "Never initialized before, return success.";
    return SUCCESS;
  }

  MS_LOG(INFO) << "Start finalize env";
  session::ExecutorManager::Instance().Clear();
  device::KernelRuntimeManager::Instance().ClearRuntimeResource();

  init_flag_ = false;
  MS_LOG(INFO) << "End finalize env";
  return SUCCESS;
}

Status GPUGraphImpl::Load() {
  // check graph type
  if (graph_->ModelType() != ModelType::kMindIR) {
    MS_LOG(ERROR) << "Unsupported model type " << graph_->ModelType();
    return INVALID_INPUTS;
  }

  const auto &graph_data = GraphImpl::MutableGraphData();
  MS_EXCEPTION_IF_NULL(graph_data);
  auto func_graph = graph_data->GetFuncGraph();

  // init
  Status ret = InitEnv();
  if (ret != SUCCESS) {
    MS_LOG(ERROR) << "InitEnv failed.";
    return FAILED;
  }

  ret = CompileGraph(func_graph);
  if (ret != SUCCESS) {
    MS_LOG(ERROR) << "Compile graph model failed";
    return FAILED;
  }
  session_impl_->GetModelInputsInfo(graph_id_, &inputs_, &input_names_);
  session_impl_->GetModelOutputsInfo(graph_id_, &outputs_, &output_names_);
  if (inputs_.empty() || inputs_.size() != input_names_.size()) {
    MS_LOG(ERROR) << "Get model inputs info failed";
    return FAILED;
  }
  if (outputs_.empty() || outputs_.size() != output_names_.size()) {
    MS_LOG(ERROR) << "Get model outputs info failed";
    return FAILED;
  }
  load_flag_ = true;
  return SUCCESS;
}

Status GPUGraphImpl::CompileGraph(const std::shared_ptr<FuncGraph> &funcGraphPtr) {
  MS_ASSERT(session_impl_ != nullptr);
  try {
    graph_id_ = session_impl_->CompileGraph(NOT_NULL(funcGraphPtr));
    return SUCCESS;
  } catch (std::exception &e) {
    MS_LOG(ERROR) << "CompileGraph failed: " << e.what();
    return FAILED;
  }
}

std::vector<tensor::TensorPtr> GPUGraphImpl::RunGraph(const std::vector<tensor::TensorPtr> &inputs) {
  try {
    VectorRef outputs;
    session_impl_->RunGraph(graph_id_, inputs, &outputs);
    return TransformVectorRefToMultiTensor(outputs);
  } catch (std::exception &e) {
    MS_LOG(ERROR) << "RunGraph failed: " << e.what();
    return std::vector<tensor::TensorPtr>();
  }
}

Status GPUGraphImpl::ExecuteModel(const std::vector<Buffer> &request, std::vector<Buffer> *reply) {
  MS_EXCEPTION_IF_NULL(reply);

  vector<tensor::TensorPtr> inputs;
  for (size_t i = 0; i < request.size(); i++) {
    auto &item = request[i];
    auto input = inputs_[i];
    if (input->Size() != item.DataSize()) {
      MS_LOG(ERROR) << "Input " << i << " data size " << item.DataSize() << " does not match model input data size "
                    << input->Size();
      return FAILED;
    }
    auto ret = memcpy_s(input->data_c(), input->Size(), item.Data(), item.DataSize());
    if (ret != SUCCESS) {
      MS_LOG(ERROR) << "Tensor copy failed";
      return FAILED;
    }
    inputs.push_back(input);
  }
  vector<tensor::TensorPtr> outputs = RunGraph(inputs);
  if (outputs.empty()) {
    MS_LOG(ERROR) << "Execute Model Failed";
    return FAILED;
  }
  reply->clear();
  std::transform(outputs.begin(), outputs.end(), std::back_inserter(*reply),
                 [](const tensor::TensorPtr &tensor) { return Buffer(tensor->data_c(), tensor->Size()); });
  return SUCCESS;
}

Status GPUGraphImpl::Run(const std::vector<Buffer> &inputs, std::vector<Buffer> *outputs) {
  MS_EXCEPTION_IF_NULL(outputs);
  if (!load_flag_) {
    Status ret = Load();
    if (ret != SUCCESS) {
      MS_LOG(ERROR) << "PrepareModel failed.";
      return ret;
    }
  }

  if (inputs.size() != inputs_.size()) {
    MS_LOG(ERROR) << "Inputs count does not match, required count " << inputs_.size() << ", given count "
                  << inputs.size();
    return INVALID_INPUTS;
  }

  for (size_t i = 0; i < inputs_.size(); ++i) {
    if (inputs[i].DataSize() != inputs_[i]->Size()) {
      MS_LOG(ERROR) << "Input " << i << " data size does not match, required size " << inputs_[i]->Size()
                    << ", given size " << inputs[i].DataSize();
      return INVALID_INPUTS;
    }
  }
  if (ExecuteModel(inputs, outputs) != SUCCESS) {
    MS_LOG(ERROR) << "Execute Model Failed";
    return FAILED;
  }
  if (outputs_.size() != outputs->size()) {
    MS_LOG(ERROR) << "Predict output size " << outputs->size() << " does not match output size got from model info "
                  << outputs_.size();
    return FAILED;
  }

  return SUCCESS;
}

Status GPUGraphImpl::GetInputsInfo(std::vector<std::string> *names, std::vector<std::vector<int64_t>> *shapes,
                                   std::vector<DataType> *data_types, std::vector<size_t> *mem_sizes) {
  if (!load_flag_) {
    Status ret = Load();
    if (ret != SUCCESS) {
      MS_LOG(ERROR) << "PrepareModel failed.";
      return ret;
    }
  }

  GraphUtils::ClearIfNotNull(names);
  GraphUtils::ClearIfNotNull(shapes);
  GraphUtils::ClearIfNotNull(data_types);
  GraphUtils::ClearIfNotNull(mem_sizes);
  for (size_t i = 0; i < inputs_.size(); i++) {
    auto &tensor = inputs_[i];
    GraphUtils::PushbackIfNotNull(names, input_names_[i]);
    GraphUtils::PushbackIfNotNull(shapes, tensor->shape());
    GraphUtils::PushbackIfNotNull(data_types, GraphUtils::TransTypeId2InferDataType(tensor->data_type()));
    GraphUtils::PushbackIfNotNull(mem_sizes, tensor->Size());
  }
  return SUCCESS;
}

Status GPUGraphImpl::GetOutputsInfo(std::vector<std::string> *names, std::vector<std::vector<int64_t>> *shapes,
                                    std::vector<DataType> *data_types, std::vector<size_t> *mem_sizes) {
  if (!load_flag_) {
    Status ret = Load();
    if (ret != SUCCESS) {
      MS_LOG(ERROR) << "PrepareModel failed.";
      return ret;
    }
  }

  GraphUtils::ClearIfNotNull(names);
  GraphUtils::ClearIfNotNull(shapes);
  GraphUtils::ClearIfNotNull(data_types);
  GraphUtils::ClearIfNotNull(mem_sizes);
  for (size_t i = 0; i < outputs_.size(); i++) {
    auto &tensor = outputs_[i];
    GraphUtils::PushbackIfNotNull(names, output_names_[i]);
    GraphUtils::PushbackIfNotNull(shapes, tensor->shape());
    GraphUtils::PushbackIfNotNull(data_types, GraphUtils::TransTypeId2InferDataType(tensor->data_type()));
    GraphUtils::PushbackIfNotNull(mem_sizes, tensor->Size());
  }

  return SUCCESS;
}

}  // namespace mindspore::api
@@ -0,0 +1,67 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_CXX_API_GRAPH_MS_GPU_GRAPH_IMPL_H
#define MINDSPORE_CCSRC_CXX_API_GRAPH_MS_GPU_GRAPH_IMPL_H
#include <string>
#include <vector>
#include <utility>
#include <memory>
#include "include/api/status.h"
#include "include/api/graph.h"
#include "cxx_api/graph/graph_impl.h"
#include "backend/session/session_basic.h"
#include "ir/anf.h"
#include "cxx_api/model/model_impl.h"
#include "cxx_api/graph/graph_utils.h"

namespace mindspore::api {
class GPUGraphImpl : public GraphCell::GraphImpl {
 public:
  GPUGraphImpl();
  ~GPUGraphImpl() override = default;

  Status Run(const std::vector<Buffer> &inputs, std::vector<Buffer> *outputs) override;
  Status Load() override;
  Status GetInputsInfo(std::vector<std::string> *names, std::vector<std::vector<int64_t>> *shapes,
                       std::vector<DataType> *data_types, std::vector<size_t> *mem_sizes) override;
  Status GetOutputsInfo(std::vector<std::string> *names, std::vector<std::vector<int64_t>> *shapes,
                        std::vector<DataType> *data_types, std::vector<size_t> *mem_sizes) override;

 private:
  Status InitEnv();
  Status FinalizeEnv();
  Status CompileGraph(const std::shared_ptr<FuncGraph> &funcGraphPtr);
  Status CheckModelInputs(const std::vector<tensor::TensorPtr> &inputs) const;
  std::vector<tensor::TensorPtr> RunGraph(const std::vector<tensor::TensorPtr> &inputs);
  Status ExecuteModel(const std::vector<Buffer> &inputs, std::vector<Buffer> *outputs);

  std::shared_ptr<session::SessionBasic> session_impl_;
  uint32_t graph_id_;
  std::string device_type_;
  uint32_t device_id_;
  std::vector<tensor::TensorPtr> inputs_;
  std::vector<tensor::TensorPtr> outputs_;
  std::vector<std::string> input_names_;
  std::vector<std::string> output_names_;
  bool init_flag_;
  bool load_flag_;

  // TensorRT
  uint32_t batch_size_;
  uint32_t workspace_size_;
};
}  // namespace mindspore::api
#endif  // MINDSPORE_CCSRC_CXX_API_GRAPH_MS_GPU_GRAPH_IMPL_H
@@ -0,0 +1,63 @@
/**
 * Copyright 2021 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef MINDSPORE_CCSRC_CXX_API_GRAPH_GRAPH_UTILS_H
#define MINDSPORE_CCSRC_CXX_API_GRAPH_GRAPH_UTILS_H
#include <map>
#include <vector>
#include "include/api/types.h"
#include "ir/dtype/type_id.h"
#include "utils/log_adapter.h"

namespace mindspore::api {
class GraphUtils {
 public:
  static DataType TransTypeId2InferDataType(TypeId type_id) {
    const std::map<TypeId, api::DataType> id2type_map{
      {TypeId::kNumberTypeBegin, api::kMsUnknown},   {TypeId::kNumberTypeBool, api::kMsBool},
      {TypeId::kNumberTypeFloat64, api::kMsFloat64}, {TypeId::kNumberTypeInt8, api::kMsInt8},
      {TypeId::kNumberTypeUInt8, api::kMsUint8},     {TypeId::kNumberTypeInt16, api::kMsInt16},
      {TypeId::kNumberTypeUInt16, api::kMsUint16},   {TypeId::kNumberTypeInt32, api::kMsInt32},
      {TypeId::kNumberTypeUInt32, api::kMsUint32},   {TypeId::kNumberTypeInt64, api::kMsInt64},
      {TypeId::kNumberTypeUInt64, api::kMsUint64},   {TypeId::kNumberTypeFloat16, api::kMsFloat16},
      {TypeId::kNumberTypeFloat32, api::kMsFloat32},
    };

    auto it = id2type_map.find(type_id);
    if (it != id2type_map.end()) {
      return it->second;
    }

    MS_LOG(WARNING) << "Unsupported data id " << type_id;
    return api::kMsUnknown;
  }

  template <class T>
  inline static void ClearIfNotNull(T *vec) {
    if (vec != nullptr) {
      vec->clear();
    }
  }

  template <class T, class U>
  inline static void PushbackIfNotNull(U *vec, T &&item) {
    if (vec != nullptr) {
      vec->emplace_back(item);
    }
  }
};
}  // namespace mindspore::api

#endif  // MINDSPORE_CCSRC_CXX_API_GRAPH_GRAPH_UTILS_H
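With this header, both AscendGraphImpl and GPUGraphImpl share one copy of these helpers instead of the file-local duplicates removed from ascend_graph_impl.cc above. A small assumed call-site sketch of the null-tolerant out-parameter pattern (the Demo function is illustrative, not commit code):

#include "cxx_api/graph/graph_utils.h"  // the header above
#include <string>
#include <vector>

void Demo() {
  using mindspore::api::GraphUtils;
  std::vector<std::string> names;
  // Callers pass nullptr for any out-parameter they do not need; the helpers
  // make that safe without a branch at every call site.
  GraphUtils::ClearIfNotNull(&names);
  GraphUtils::ClearIfNotNull<std::vector<size_t>>(nullptr);  // skipped out-param: no-op
  GraphUtils::PushbackIfNotNull(&names, std::string("input0"));
}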
@@ -22,6 +22,7 @@
namespace mindspore {
namespace api {
API_FACTORY_REG(ModelImpl, Ascend910, MsModel);
+API_FACTORY_REG(ModelImpl, GPU, MsModel);

Status MsModel::Build(const std::map<std::string, std::string> &) {
  MS_LOG(INFO) << "Start build model.";
@@ -21,6 +21,7 @@
namespace mindspore::api {
const char *kDeviceTypeAscend310 = "Ascend310";
const char *kDeviceTypeAscend910 = "Ascend910";
+const char *kDeviceTypeGpu = "GPU";

class DataImpl {
 public:
@@ -31,7 +31,7 @@ constexpr Status PROF_FAILED = 0xFFFFFFFF;
}  // namespace

Status RegProfCtrlCallback(MsprofCtrlCallback func) {
-  if (VMCallbackRegister::GetInstance().registed()) {
+  if (VMCallbackRegister::GetInstance().registered()) {
    return VMCallbackRegister::GetInstance().DoRegProfCtrlCallback(func);
  } else {
    return PROF_SUCCESS;

@@ -39,7 +39,7 @@ Status RegProfCtrlCallback(MsprofCtrlCallback func)
}

Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func) {
-  if (VMCallbackRegister::GetInstance().registed()) {
+  if (VMCallbackRegister::GetInstance().registered()) {
    return VMCallbackRegister::GetInstance().DoRegProfSetDeviceCallback(func);
  } else {
    return PROF_SUCCESS;

@@ -47,7 +47,7 @@ Status RegProfSetDeviceCallback(MsprofSetDeviceCallback func)
}

Status RegProfReporterCallback(MsprofReporterCallback func) {
-  if (VMCallbackRegister::GetInstance().registed()) {
+  if (VMCallbackRegister::GetInstance().registered()) {
    return VMCallbackRegister::GetInstance().DoRegProfReporterCallback(func);
  } else {
    return PROF_SUCCESS;

@@ -55,7 +55,7 @@ Status RegProfReporterCallback(MsprofReporterCallback func)
}

Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len) {
-  if (VMCallbackRegister::GetInstance().registed()) {
+  if (VMCallbackRegister::GetInstance().registered()) {
    return VMCallbackRegister::GetInstance().DoProfCommandHandle(type, data, len);
  } else {
    return PROF_SUCCESS;
@@ -69,16 +69,16 @@ VMCallbackRegister &VMCallbackRegister::GetInstance()
  return instance;
}

-bool VMCallbackRegister::Registe(Status (*pRegProfCtrlCallback)(MsprofCtrlCallback),
+bool VMCallbackRegister::Register(Status (*pRegProfCtrlCallback)(MsprofCtrlCallback),
                                 Status (*pRegProfSetDeviceCallback)(MsprofSetDeviceCallback),
                                 Status (*pRegProfReporterCallback)(MsprofReporterCallback),
                                 Status (*pProfCommandHandle)(ProfCommandHandleType, void *, uint32_t)) {
-  if (!registed_) {
+  if (!registered_) {
    pRegProfCtrlCallback_ = pRegProfCtrlCallback;
    pRegProfSetDeviceCallback_ = pRegProfSetDeviceCallback;
    pRegProfReporterCallback_ = pRegProfReporterCallback;
    pProfCommandHandle_ = pProfCommandHandle;
-    registed_ = true;
+    registered_ = true;
    ForceMsprofilerInit();
    return true;
  }
@@ -49,12 +49,12 @@ class __attribute__((visibility("default"))) VMCallbackRegister {
  static VMCallbackRegister &GetInstance();
  VMCallbackRegister(const VMCallbackRegister &) = delete;
  VMCallbackRegister &operator=(const VMCallbackRegister &) = delete;
-  bool Registe(Status (*pRegProfCtrlCallback)(MsprofCtrlCallback),
+  bool Register(Status (*pRegProfCtrlCallback)(MsprofCtrlCallback),
               Status (*pRegProfSetDeviceCallback)(MsprofSetDeviceCallback),
               Status (*pRegProfReporterCallback)(MsprofReporterCallback),
               Status (*pProfCommandHandle)(ProfCommandHandleType, void *, uint32_t));
  void ForceMsprofilerInit();
-  bool registed() { return registed_; }
+  bool registered() { return registered_; }
  Status DoRegProfCtrlCallback(MsprofCtrlCallback func) { return pRegProfCtrlCallback_(func); }
  Status DoRegProfSetDeviceCallback(MsprofSetDeviceCallback func) { return pRegProfSetDeviceCallback_(func); }
  Status DoRegProfReporterCallback(MsprofReporterCallback func) { return pRegProfReporterCallback_(func); }

@@ -64,7 +64,7 @@ class __attribute__((visibility("default"))) VMCallbackRegister {

 private:
  VMCallbackRegister()
-      : registed_(false),
+      : registered_(false),
        ms_profile_inited_(false),
        pRegProfCtrlCallback_(nullptr),
        pRegProfSetDeviceCallback_(nullptr),

@@ -72,7 +72,7 @@ class __attribute__((visibility("default"))) VMCallbackRegister {
        pProfCommandHandle_(nullptr) {}
  ~VMCallbackRegister() = default;

-  bool registed_;
+  bool registered_;
  bool ms_profile_inited_;
  Status (*pRegProfCtrlCallback_)(MsprofCtrlCallback);
  Status (*pRegProfSetDeviceCallback_)(MsprofSetDeviceCallback);
@@ -299,7 +299,7 @@ Status ProfCommandHandle(ProfCommandHandleType type, void *data, uint32_t len)

bool DoRegiste() {
  MS_LOG(INFO) << "VM profiling register start";
-  return VMCallbackRegister::GetInstance().Registe(RegProfCtrlCallback, RegProfSetDeviceCallback,
+  return VMCallbackRegister::GetInstance().Register(RegProfCtrlCallback, RegProfSetDeviceCallback,
                                                    RegProfReporterCallback, ProfCommandHandle);
}
static bool doRegiste = DoRegiste();
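Note how the last line does the wiring: the static initializer runs when the translation unit is loaded, so Register(...) executes before main without any explicit call from the rest of the program. A distilled sketch of the idiom (illustrative names, not commit code):

// Load-time registration via a static initializer (the pattern used above).
#include <iostream>

static bool DoRegister() {
  std::cout << "registered before main()\n";  // stands in for VMCallbackRegister::Register(...)
  return true;
}

// The value is never read; its initialization is the point.
static bool g_registered = DoRegister();

int main() { return g_registered ? 0 : 1; }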
@@ -41,6 +41,7 @@ const char kCPUDevice[] = "CPU";
const char kGPUDevice[] = "GPU";
const char kAscendDevice[] = "Ascend";
const char kDavinciInferenceDevice[] = "AscendInference";
+const char kGpuInferenceDevice[] = "GpuInference";
const char kDavinciDevice[] = "Davinci";
const char KNpuLog[] = "_npu_log";
const unsigned int MAX_CALL_DEPTH_DEFAULT = 1000;
@@ -51,7 +52,7 @@ const float kDefaultMaxDeviceMemory = 1024;

// enum definition for MindSpore Context Parameter
enum MsCtxParam : unsigned {
-  // paramater of type bool
+  // parameter of type bool
  MS_CTX_TYPE_BOOL_BEGIN,
  MS_CTX_ENABLE_AUTO_MIXED_PRECISION = MS_CTX_TYPE_BOOL_BEGIN,
  MS_CTX_CHECK_BPROP_FLAG,

@@ -74,14 +75,15 @@ enum MsCtxParam : unsigned {
  MS_CTX_ENABLE_PROFILING,
  MS_CTX_SAVE_GRAPHS_FLAG,
  MS_CTX_ENABLE_PARALLEL_SPLIT,
+  MS_CTX_ENABLE_INFER_OPT,
  MS_CTX_TYPE_BOOL_END,

-  // paramater of type int
+  // parameter of type int
  MS_CTX_TYPE_INT_BEGIN = MS_CTX_TYPE_BOOL_END,
  MS_CTX_EXECUTION_MODE = MS_CTX_TYPE_INT_BEGIN,
  MS_CTX_TYPE_INT_END,

-  // paramater of type uint32
+  // parameter of type uint32
  MS_CTX_TYPE_UINT32_BEGIN = MS_CTX_TYPE_INT_END,
  MS_CTX_DEVICE_ID = MS_CTX_TYPE_UINT32_BEGIN,
  MS_CTX_GE_REF,

@@ -89,12 +91,12 @@ enum MsCtxParam : unsigned {
  MS_CTX_TSD_REF,
  MS_CTX_TYPE_UINT32_END,

-  // paramater of type float
+  // parameter of type float
  MS_CTX_TYPE_FLOAT_BEGIN = MS_CTX_TYPE_UINT32_END,
  MS_CTX_MAX_DEVICE_MEMORY = MS_CTX_TYPE_FLOAT_BEGIN,
  MS_CTX_TYPE_FLOAT_END,

-  // paramater of type string
+  // parameter of type string
  MS_CTX_TYPE_STRING_BEGIN = MS_CTX_TYPE_FLOAT_END,
  MS_CTX_DEVICE_TARGET = MS_CTX_TYPE_STRING_BEGIN,
  MS_CTX_GRAPH_MEMORY_MAX_SIZE,