!27929 consider the case where inputs and outputs of a host kernel are on CPU

Merge pull request !27929 from lingyunli63/refine_memcpy_of_host_kernel
i-robot 2021-12-23 08:01:51 +00:00 committed by Gitee
commit 1a474138da
6 changed files with 130 additions and 26 deletions
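The common thread in this change: host kernels (DynamicShape, DynamicReshape, DynamicBroadcastGradientArgs) compute their results on the host, so when the relevant device addresses are already CPU-resident the kernels can finish with a bounds-checked memcpy_s instead of routing the bytes through the Ascend sync paths. A minimal, framework-free sketch of that dispatch pattern follows; every type in it is a hypothetical stand-in, not MindSpore's real DeviceAddress API.

#include <cstring>
#include <iostream>
#include <stdexcept>
#include <vector>

// Hypothetical stand-ins for MindSpore's device-address machinery; only the
// branching pattern mirrors the patch.
enum class DeviceAddressType { kCPU, kAscend };

struct Address {
  DeviceAddressType type;
  std::vector<char> buf;  // stand-in for the raw pointer/size pair
};

// memcpy_s-style copy: fail on a too-small destination instead of overflowing.
void CopyChecked(Address *dst, const Address &src, size_t nbytes) {
  if (nbytes > dst->buf.size()) throw std::runtime_error("dest buffer too small");
  std::memcpy(dst->buf.data(), src.buf.data(), nbytes);
}

void ExecuteHostKernel(Address *output, const Address &input, size_t nbytes) {
  if (input.type == DeviceAddressType::kCPU && output->type == DeviceAddressType::kCPU) {
    CopyChecked(output, input, nbytes);  // new fast path: plain host copy
  } else {
    // Where the real kernels build a temporary AscendDeviceAddress and call
    // SyncDeviceToDevice / SyncHostToDevice through the device runtime.
    std::cout << "sync via device runtime\n";
  }
}

int main() {
  Address in{DeviceAddressType::kCPU, std::vector<char>(16, 1)};
  Address out{DeviceAddressType::kCPU, std::vector<char>(16, 0)};
  ExecuteHostKernel(&out, in, in.buf.size());
  std::cout << static_cast<int>(out.buf[0]) << "\n";  // prints 1: the bytes arrived
}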


@@ -99,17 +99,24 @@ void DynamicReshapeKernel::Execute() {
size_t input_size_byte = LongToSize(arr_prod) * abstract::TypeIdSize(type_x);
auto output_addr = AnfAlgo::GetOutputAddr(cnode, 0);
MS_EXCEPTION_IF_NULL(output_addr);
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
auto temp_device_address = std::make_shared<device::ascend::AscendDeviceAddress>(
address_x->GetMutablePtr(), input_size_byte, address_x->format(), address_x->type_id(), kAscendDevice, device_id);
if (!output_addr->SyncDeviceToDevice(temp_device_address.get())) {
MS_LOG(EXCEPTION) << "Host Reshape sync device to device failed.";
if (address_x->DeviceType() == device::DeviceAddressType::kCPU) {
auto ret =
memcpy_s(const_cast<void *>(output_addr->GetPtr()), output_addr->GetSize(), address_x->GetPtr(), input_size_byte);
if (ret != EOK) {
MS_LOG(EXCEPTION) << "Execute DynamicReshapeKernel memcpy_s failed";
}
} else {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
auto temp_device_address = std::make_shared<device::ascend::AscendDeviceAddress>(
address_x->GetMutablePtr(), input_size_byte, address_x->format(), address_x->type_id(), kAscendDevice, device_id);
if (!output_addr->SyncDeviceToDevice(temp_device_address.get())) {
MS_LOG(EXCEPTION) << "Host Reshape sync device to device failed.";
}
MS_LOG(INFO) << "Execute host ReshapeKernel End";
}
MS_LOG(INFO) << "Execute host ReshapeKernel End";
}
device::DynamicKernelPtr DynamicReshapeKernelMod::GenDynamicKernel(const CNodePtr &cnode_ptr, void *stream_ptr) {
return std::make_shared<DynamicReshapeKernel>(stream_ptr, cnode_ptr);
}
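A note on reading the flattened hunk above: both versions are shown. The run from auto ms_context through the SyncDeviceToDevice check is the old unconditional body, which survives verbatim as the new else branch; CPU-resident inputs now take the memcpy_s branch instead. memcpy_s is the bounds-checked copy (the destination capacity is its second argument) whose errno_t result is EOK on success, hence the ret != EOK check.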


@@ -47,16 +47,24 @@ void DynamicShapeKernel::Execute() {
auto output_addr = AnfAlgo::GetOutputAddr(cnode, 0);
MS_EXCEPTION_IF_NULL(output_addr);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetCurrentKernelRuntime();
MS_EXCEPTION_IF_NULL(runtime_instance);
auto ret = runtime_instance->SyncStream();
if (!ret) {
MS_LOG(EXCEPTION) << "Sync stream error!";
if (output_addr->DeviceType() == device::DeviceAddressType::kCPU) {
auto ret = memcpy_s(const_cast<void *>(output_addr->GetPtr()), output_addr->GetSize(),
output_tensor_for_sync->data_c(), LongToSize(output_tensor_for_sync->data().nbytes()));
if (ret != EOK) {
MS_LOG(EXCEPTION) << "Execute DynamicShapeKernel memcpy_s failed!";
}
} else {
auto runtime_instance = device::KernelRuntimeManager::Instance().GetCurrentKernelRuntime();
MS_EXCEPTION_IF_NULL(runtime_instance);
auto ret = runtime_instance->SyncStream();
if (!ret) {
MS_LOG(EXCEPTION) << "Sync stream error!";
}
output_addr->SyncHostToDevice(output_shape, LongToSize(output_tensor_for_sync->data().nbytes()),
output_tensor_for_sync->data_type(), output_tensor_for_sync->data_c(),
output_tensor_for_sync->device_info().host_format_);
}
output_addr->SyncHostToDevice(output_shape, LongToSize(output_tensor_for_sync->data().nbytes()),
output_tensor_for_sync->data_type(), output_tensor_for_sync->data_c(),
output_tensor_for_sync->device_info().host_format_);
MS_LOG(INFO) << "Execute DynamicShapeKernel End";
}
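The shape kernel follows the same pattern: the runtime shape has already been materialized on the host in output_tensor_for_sync, so a CPU output address only needs the checked copy, while a device output still synchronizes the stream and goes through SyncHostToDevice. As in the previous file, the flattened view shows the SyncHostToDevice call twice: the re-indented copy inside the new else branch and the old top-level call it replaces. A framework-free sketch of what gets copied, with hypothetical buffers standing in for the tensor and the output address:

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

int main() {
  // The input's runtime shape, e.g. one step of a (32, None) dynamic column.
  std::vector<int64_t> runtime_shape = {32, 6};
  size_t nbytes = runtime_shape.size() * sizeof(int64_t);
  std::vector<int64_t> output(runtime_shape.size());       // CPU output buffer
  if (nbytes > output.size() * sizeof(int64_t)) return 1;  // memcpy_s-style capacity guard
  std::memcpy(output.data(), runtime_shape.data(), nbytes);
  for (auto d : output) std::cout << d << ' ';             // prints: 32 6
  std::cout << '\n';
}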


@@ -24,20 +24,15 @@
namespace mindspore {
namespace kernel {
static const std::set<std::string> host_kernel = {
prim::kPrimDynamicShape->name(), prim::kPrimDynamicBroadcastGradientArgs->name(), prim::kPrimDynamicReshape->name()};
void HostMetadataInfo(const CNodePtr &kernel_node, std::vector<std::shared_ptr<KernelBuildInfo>> *kernel_info_list) {
MS_LOG(INFO) << "HostMetadataInfo.";
MS_EXCEPTION_IF_NULL(kernel_node);
MS_EXCEPTION_IF_NULL(kernel_info_list);
std::string op_name = AnfAlgo::GetCNodeName(kernel_node);
if (host_kernel.find(op_name) == host_kernel.end()) {
MS_LOG(DEBUG) << "Host dose not have op [" << op_name << "]";
if (!AnfAlgo::IsHostKernel(kernel_node)) {
MS_LOG(DEBUG) << "Host dose not have op [" << kernel_node->DebugString() << "]";
return;
}
std::vector<std::string> inputs_format{};
std::vector<TypeId> inputs_type{};
size_t input_num = AnfAlgo::GetInputTensorNum(kernel_node);


@@ -2200,6 +2200,39 @@ void AnfRuntimeAlgorithm::GetAllFatherRealNode(const AnfNodePtr &anf_node, std::
}
}
bool AnfRuntimeAlgorithm::IsHostKernel(const CNodePtr &kernel_node) {
const std::set<std::string> host_kernel = {prim::kPrimDynamicShape->name(), prim::kPrimDynamicReshape->name(),
prim::kPrimDynamicBroadcastGradientArgs->name()};
auto op_name = AnfAlgo::GetCNodeName(kernel_node);
if (host_kernel.find(op_name) == host_kernel.end()) {
return false;
}
return true;
}
namespace {
// Host kernel with inputs on host
bool SkipDataSync(const CNodePtr &node, const std::map<uint32_t, tensor::TensorPtr> &depend_tensors) {
if (!AnfAlgo::IsHostKernel(node)) {
return false;
}
auto input_size = AnfAlgo::GetInputTensorNum(node);
for (size_t i = 0; i < input_size; ++i) {
auto input_with_index = AnfAlgo::GetPrevNodeOutput(node, i);
auto real_input = input_with_index.first;
auto iter_tensor = depend_tensors.find(i);
if (iter_tensor != depend_tensors.end()) {
auto output_addr = AnfAlgo::GetOutputAddr(real_input, 0);
MS_EXCEPTION_IF_NULL(output_addr);
if (output_addr->DeviceType() != device::DeviceAddressType::kCPU) {
return false;
}
}
}
return true;
}
} // namespace
void AnfRuntimeAlgorithm::InferShape(const CNodePtr &node, std::map<uint32_t, tensor::TensorPtr> *depend_tensors) {
MS_EXCEPTION_IF_NULL(node);
MS_LOG(INFO) << "InferShape start, node:" << node->DebugString();
@@ -2222,8 +2255,10 @@ void AnfRuntimeAlgorithm::InferShape(const CNodePtr &node, std::map<uint32_t, te
if (iter_tensor != depend_tensors->end()) {
auto tensor_ptr = iter_tensor->second;
MS_EXCEPTION_IF_NULL(tensor_ptr);
// sync data from device to host
tensor_ptr->data_sync();
if (!SkipDataSync(node, *depend_tensors)) {
// sync data from device to host
tensor_ptr->data_sync();
}
auto real_abs = real_input->abstract();
if (real_abs->isa<abstract::AbstractTensor>()) {
real_input->abstract()->set_value(tensor_ptr);
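Two things to note in this file. IsHostKernel centralizes the op set that host_kernel_metadata.cc previously kept to itself (the earlier hunk drops that file's local host_kernel set and op-name lookup in favor of the new call), so one membership test now backs both kernel-info selection and SkipDataSync. And SkipDataSync is what the guarded data_sync() above pays off: tensor_ptr->data_sync() issues a device-to-host copy for each depend tensor, which is wasted work once a host kernel's depended-on outputs already live on the CPU. A runnable sketch of the predicate, with a plain map of device types standing in for the node and depend_tensors plumbing:

#include <cstdint>
#include <iostream>
#include <map>

enum class DeviceAddressType { kCPU, kAscend };

// Skip the device-to-host sync only if this is a host kernel and every depend
// input we track is already CPU-resident; any device-resident input forces a sync.
bool SkipDataSync(bool is_host_kernel, const std::map<uint32_t, DeviceAddressType> &depend_inputs) {
  if (!is_host_kernel) return false;
  for (const auto &kv : depend_inputs) {
    if (kv.second != DeviceAddressType::kCPU) return false;
  }
  return true;
}

int main() {
  std::cout << SkipDataSync(true, {{0, DeviceAddressType::kCPU}}) << '\n';     // 1: copy skipped
  std::cout << SkipDataSync(true, {{0, DeviceAddressType::kAscend}}) << '\n';  // 0: must data_sync()
  std::cout << SkipDataSync(false, {{0, DeviceAddressType::kCPU}}) << '\n';    // 0: not a host kernel
}

Note that the real predicate only inspects inputs that actually appear in depend_tensors; indices missing from the map are not checked and do not block the skip.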


@@ -297,6 +297,7 @@ class AnfRuntimeAlgorithm {
static std::vector<int64_t> GetOutputMaxShape(const AnfNodePtr &anf_node, size_t index);
static std::vector<int64_t> GetOutputMinShape(const AnfNodePtr &anf_node, size_t index);
static bool IsNodeDynamicShape(const AnfNodePtr &node);
static bool IsHostKernel(const CNodePtr &node);
static void InferShape(const CNodePtr &node, std::map<uint32_t, tensor::TensorPtr> *depend_tensors = nullptr);
static void AddArgList(AbstractBasePtrList *args_spec_list, const AnfNodePtr &cnode_input,
const AnfNodePtr &real_input, size_t index);


@@ -0,0 +1,58 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
import numpy as np
import pytest
import mindspore.context as context
import mindspore.nn as nn
import mindspore.dataset as ds
from mindspore.ops import operations as P
from mindspore import Model
context.set_context(mode=context.GRAPH_MODE,
device_target="Ascend")
def dataset_generator():
for i in range(1, 10):
yield(np.ones((32, 2*i), dtype=np.float32), np.ones((32, 2*i), dtype=np.float32))
class Net(nn.Cell):
def __init__(self):
super(Net, self).__init__()
self.unique = P.Unique()
self.shape = P.DynamicShape()
self.reshape = P.Reshape()
self.add = P.Add()
def construct(self, x, y):
val = self.add(x, y)
size = self.shape(val)
res = self.reshape(val, size)
return res
@pytest.mark.level0
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.env_onecard
def test_shape():
"""
Feature: dynamic shape
Description: dynamic shape input data set
Expectation: success
"""
network = Net()
dataset = ds.GeneratorDataset(dataset_generator, ["data1", "data2"])
dataset.set_dynamic_columns(columns={"data1": [32, None], "data2": [32, None]})
model = Model(network)
model.train(1, dataset, sink_size=1)
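The new test drives exactly this path: dataset_generator yields pairs whose second dimension grows from 2 to 18, set_dynamic_columns marks that dimension as dynamic, and the network's DynamicShape plus Reshape then run as host kernels during the single sink-size-1 training step, exercising the refined copy paths added above.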