From 633e18df4d3de15f0115173b4376aadbac9656ec Mon Sep 17 00:00:00 2001 From: limingqi107 Date: Sun, 4 Jul 2021 19:03:33 +0800 Subject: [PATCH] fix bug of host device from different graph --- .../runtime/framework/actor/actor_common.cc | 20 +++++++++++++ .../runtime/framework/actor/actor_common.h | 4 +++ .../runtime/framework/actor/copy_actor.cc | 16 ----------- .../runtime/framework/actor/copy_actor.h | 2 -- .../framework/actor/data_source_actor.cc | 6 ++-- .../runtime/framework/graph_scheduler.cc | 28 +++++++++---------- 6 files changed, 42 insertions(+), 34 deletions(-) diff --git a/mindspore/ccsrc/runtime/framework/actor/actor_common.cc b/mindspore/ccsrc/runtime/framework/actor/actor_common.cc index fb00e49bb2b..9af9b0216a6 100644 --- a/mindspore/ccsrc/runtime/framework/actor/actor_common.cc +++ b/mindspore/ccsrc/runtime/framework/actor/actor_common.cc @@ -113,5 +113,25 @@ bool IsGatherActor(const AnfNodePtr &front_node, } return false; } + +bool Copy(DeviceTensor *dst_device_tensor, const DeviceTensor *src_device_tensor) { + MS_EXCEPTION_IF_NULL(dst_device_tensor); + MS_EXCEPTION_IF_NULL(src_device_tensor); + + // Exist the size alignment in some device, so get the min device size. + size_t copy_size = std::min(src_device_tensor->GetSize(), dst_device_tensor->GetSize()); + + if (src_device_tensor->DeviceType() == device::DeviceAddressType::kCPU) { + // CPU device tensor copy to other device tensor. + return dst_device_tensor->SyncHostToDevice(copy_size, src_device_tensor->GetPtr()); + } else if (dst_device_tensor->DeviceType() == device::DeviceAddressType::kCPU) { + // Other device tensor copy to CPU device tensor. + return src_device_tensor->SyncDeviceToHost(copy_size, dst_device_tensor->GetMutablePtr()); + } else { + MS_LOG(ERROR) << "Invalid device type, src device type: " << src_device_tensor->DeviceType() + << ", dst device type: " << dst_device_tensor->DeviceType(); + return false; + } +} } // namespace runtime } // namespace mindspore diff --git a/mindspore/ccsrc/runtime/framework/actor/actor_common.h b/mindspore/ccsrc/runtime/framework/actor/actor_common.h index 59789686ce9..6aaed45f6d4 100644 --- a/mindspore/ccsrc/runtime/framework/actor/actor_common.h +++ b/mindspore/ccsrc/runtime/framework/actor/actor_common.h @@ -22,6 +22,7 @@ #include #include #include +#include #include "mindrt/include/actor/op_actor.h" #include "runtime/device/device_address.h" #include "backend/session/kernel_graph.h" @@ -86,6 +87,9 @@ bool IsPersistentDeviceTensor(const AnfNodePtr &node); // Judge whether the front node is in a gather actor. bool IsGatherActor(const AnfNodePtr &front_node, const std::unordered_map *> &actor_name_to_actor); + +// Copy data from src_device_tensor to dst_device_tensor. +bool Copy(DeviceTensor *dst_device_tensor, const DeviceTensor *src_device_tensor); } // namespace runtime } // namespace mindspore diff --git a/mindspore/ccsrc/runtime/framework/actor/copy_actor.cc b/mindspore/ccsrc/runtime/framework/actor/copy_actor.cc index 3a59c19d9b3..10510227630 100644 --- a/mindspore/ccsrc/runtime/framework/actor/copy_actor.cc +++ b/mindspore/ccsrc/runtime/framework/actor/copy_actor.cc @@ -89,22 +89,6 @@ void CopyActor::OnMemoryAllocFinish(OpContext *context) { SendOutput(context); } -bool CopyActor::Copy(DeviceTensor *dst_device_tensor, const DeviceTensor *src_device_tensor) { - MS_EXCEPTION_IF_NULL(dst_device_tensor); - MS_EXCEPTION_IF_NULL(src_device_tensor); - - if (src_device_tensor->DeviceType() == device::DeviceAddressType::kCPU) { - // CPU device tensor copy to other device tensor. - return dst_device_tensor->SyncHostToDevice(src_device_tensor->GetSize(), src_device_tensor->GetPtr()); - } else if (dst_device_tensor->DeviceType() == device::DeviceAddressType::kCPU) { - // Other device tensor copy to CPU device tensor. - return src_device_tensor->SyncDeviceToHost(dst_device_tensor->GetSize(), dst_device_tensor->GetMutablePtr()); - } else { - MS_LOG(ERROR) << "Invalid device type for copy actor: " << GetAID().Name(); - return false; - } -} - bool CopyActor::CheckCopyCondition(OpContext *context) const { MS_EXCEPTION_IF_NULL(context); if (input_datas_num_ != 0) { diff --git a/mindspore/ccsrc/runtime/framework/actor/copy_actor.h b/mindspore/ccsrc/runtime/framework/actor/copy_actor.h index d9968b00b24..2d9361078d3 100644 --- a/mindspore/ccsrc/runtime/framework/actor/copy_actor.h +++ b/mindspore/ccsrc/runtime/framework/actor/copy_actor.h @@ -65,8 +65,6 @@ class CopyActor : public MemoryAwareActor { // Fetch the device tensor for copy. void FetchDeviceTensor(OpContext *context); - // Copy data from src_device_tensor to dst_device_tensor. - bool Copy(DeviceTensor *dst_device_tensor, const DeviceTensor *src_device_tensor); // Send output data and output controls when finish copy. void SendOutput(OpContext *context) const; // Erase input data and input controls when finish copy. diff --git a/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc b/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc index 313f8b28571..a969591c733 100644 --- a/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc +++ b/mindspore/ccsrc/runtime/framework/actor/data_source_actor.cc @@ -258,13 +258,15 @@ void HostQueueDataSourceActor::OnMemoryAllocFinish(OpContext *cont MS_EXCEPTION_IF_NULL(host_tensor); MS_EXCEPTION_IF_NULL(device_tensor); auto tensor_device_address = std::dynamic_pointer_cast(host_tensor->device_address()); + // Sync data from host_tensor_device_address to device_tensor. if (tensor_device_address != nullptr) { - if (tensor_device_address.get() != device_tensor) { - MS_LOG(EXCEPTION) << "The device tensor of host queue node should be equal to device address of input tensor"; + if ((tensor_device_address.get() != device_tensor) && (!Copy(device_tensor, tensor_device_address.get()))) { + SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "Copy data failed."); } continue; } + // Sync data from host_tensor to device_tensor. if (!device_tensor->SyncHostToDevice(trans::GetRuntimePaddingShape(data_nodes_[i], 0), LongToSize(host_tensor->data().nbytes()), host_tensor->data_type(), host_tensor->data_c(), host_tensor->device_info().host_format_)) { diff --git a/mindspore/ccsrc/runtime/framework/graph_scheduler.cc b/mindspore/ccsrc/runtime/framework/graph_scheduler.cc index e04ddfd9afc..90f788066dd 100644 --- a/mindspore/ccsrc/runtime/framework/graph_scheduler.cc +++ b/mindspore/ccsrc/runtime/framework/graph_scheduler.cc @@ -229,15 +229,8 @@ void PrepareDataForWeightNode(const AnfNodePtr &backend_node, const AnfNodePtr & } MS_LOG(INFO) << "Prepare device data for weight node:" << backend_node->fullname_with_scope() << ", device type:" << another_device_type; - if (host_tensor_address->DeviceType() == device::DeviceAddressType::kCPU) { - // CPU device tensor copy to other device tensor. - (void)another_device_tensor->SyncHostToDevice(host_tensor_address->GetSize(), host_tensor_address->GetPtr()); - } else if (another_device_tensor->DeviceType() == device::DeviceAddressType::kCPU) { - // Other device tensor copy to CPU device tensor. - (void)host_tensor_address->SyncDeviceToHost(another_device_tensor->GetSize(), - another_device_tensor->GetMutablePtr()); - } else { - MS_LOG(EXCEPTION) << "Invalid device type for sync data."; + if (!Copy(another_device_tensor.get(), host_tensor_address.get())) { + MS_LOG(EXCEPTION) << "Sync data error."; } } } @@ -310,10 +303,11 @@ void PrepareDataForHostDataSourceActor(const std::unordered_mapsecond] = tensor; - auto device_address = std::dynamic_pointer_cast(tensor->device_address()); - if (device_address != nullptr) { - AnfAlgo::SetOutputAddr(device_address, 0, node.get()); - return; + auto tensor_address = std::dynamic_pointer_cast(tensor->device_address()); + auto device_address = AnfAlgo::GetMutableOutputAddr(node, 0, false); + MS_EXCEPTION_IF_NULL(device_address); + if ((tensor_address != nullptr) && (tensor_address->DeviceType() == device_address->DeviceType())) { + AnfAlgo::SetOutputAddr(tensor_address, 0, node.get()); } } @@ -1554,8 +1548,14 @@ void GraphScheduler::LinkDataArrowForCopyActor(OpActor *from_actor // Set the member of the copy actor. MS_EXCEPTION_IF_NULL(from_device_tensor); + auto to_kernel_mod = AnfAlgo::GetKernelMod(to_kernel_with_input_idx.first); + MS_EXCEPTION_IF_NULL(to_kernel_mod); + auto input_sizes = to_kernel_mod->GetInputSizeList(); + if (to_input_index >= input_sizes.size()) { + MS_LOG(EXCEPTION) << "To input index(" << to_input_index << ") is out of size: " << input_sizes.size(); + } copy_actor->output_ = to_devcie_context->CreateDeviceAddress( - nullptr, from_device_tensor->GetSize(), from_device_tensor->format(), from_device_tensor->type_id()); + nullptr, input_sizes[to_input_index], from_device_tensor->format(), from_device_tensor->type_id()); MS_EXCEPTION_IF_NULL(from_devcie_context); copy_actor->input_device_context_ = from_devcie_context; copy_actor->output_device_context_ = to_devcie_context;