Fix bugs in CPU actor runtime
commit b25d00731c
parent c3a9b08624
@@ -60,11 +60,10 @@ bool SparseTensorDenseMatmulCPUKernel<I, T>::Launch(const std::vector<kernel::Ad
   auto a_values = reinterpret_cast<T *>(inputs[1]->addr);
   auto b = reinterpret_cast<T *>(inputs[3]->addr);
   auto out = reinterpret_cast<T *>(outputs[0]->addr);
-  const size_t output_length = outputs[0]->size / sizeof(T);
   const size_t indices_length = inputs[0]->size / sizeof(I);
   const size_t values_length = inputs[1]->size / sizeof(T);
   const size_t b_length = inputs[3]->size / sizeof(T);
-  if (memset_s(out, output_length, 0, output_length) != EOK) {
+  if (memset_s(out, outputs[0]->size, 0, outputs[0]->size) != EOK) {
     MS_LOG(EXCEPTION) << "Memset Failed!";
   }
@@ -55,10 +55,9 @@ bool SparseToDenseCPUKernel<I, T>::Launch(const std::vector<kernel::AddressPtr>
   auto indices_addr = reinterpret_cast<I *>(inputs[0]->addr);
   auto values_addr = reinterpret_cast<T *>(inputs[1]->addr);
   auto output_addr = reinterpret_cast<T *>(outputs[0]->addr);
-  const size_t output_length = outputs[0]->size / sizeof(T);
   const size_t indices_length = inputs[0]->size / sizeof(I);
   const size_t values_length = inputs[1]->size / sizeof(T);
-  if (memset_s(output_addr, output_length, 0, output_length) != EOK) {
+  if (memset_s(output_addr, outputs[0]->size, 0, outputs[0]->size) != EOK) {
     MS_LOG(EXCEPTION) << "Memset Failed!";
   }
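Both kernels above had the same unit mismatch: output_length counts elements (bytes divided by sizeof(T)), while the securec memset_s(dest, destMax, c, count) takes both its destination capacity and its fill count in bytes. For any element type wider than one byte, the old calls therefore zeroed only 1/sizeof(T) of the output buffer and left stale data behind. A minimal standalone sketch of the effect, with std::memset standing in for memset_s and an illustrative buffer (not MindSpore code):

    #include <cstddef>
    #include <cstdio>
    #include <cstring>

    int main() {
      float out[8];
      std::memset(out, 0xFF, sizeof(out));  // pretend the output buffer holds stale data

      // Old call shape: count was the ELEMENT count (8), so only the first
      // 8 bytes -- two floats -- are zeroed; out[2..7] keep their stale bits.
      const std::size_t output_length = sizeof(out) / sizeof(float);
      std::memset(out, 0, output_length);
      std::printf("after element-count fill: out[2] = %f\n", out[2]);  // non-zero (NaN bit pattern)

      // Fixed call shape: count is the BYTE count (outputs[0]->size here
      // would be 32), so the whole buffer is cleared.
      std::memset(out, 0, sizeof(out));
      std::printf("after byte-count fill:    out[2] = %f\n", out[2]);  // 0.000000
      return 0;
    }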
@@ -44,8 +44,13 @@ bool CPUDeviceAddress::SyncDeviceToHost(const ShapeVector &, size_t size, TypeId
     MS_LOG(DEBUG) << "host_ptr is equal to ptr_, request ignored.";
     return true;
   }

   if (type == type_id_) {
-    auto ret_code = memcpy_s(host_ptr, size, ptr_, size_);
+    if ((size == 0) || (size_ == 0) || (size > size_)) {
+      MS_LOG(INFO) << "No need sync, host size: " << size << ", device size: " << size_;
+      return true;
+    }
+    auto ret_code = memcpy_s(host_ptr, size, ptr_, size);
     if (ret_code != EOK) {
       MS_LOG(ERROR) << "Failed to copy tensor!";
       return false;
@@ -78,7 +83,11 @@ bool CPUDeviceAddress::SyncHostToDevice(const ShapeVector & /* shape */, size_t
   }

   if (type == type_id_) {
-    auto ret_code = memcpy_s(ptr_, size_, host_ptr, size);
+    if ((size == 0) || (size_ == 0) || (size > size_)) {
+      MS_LOG(INFO) << "No need sync, host size: " << size << ", device size: " << size_;
+      return true;
+    }
+    auto ret_code = memcpy_s(ptr_, size, host_ptr, size);
     if (ret_code != EOK) {
       MS_LOG(ERROR) << "Failed to copy tensor!";
       return false;
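The two CPUDeviceAddress hunks make the same pair of changes. First, a guard turns empty or oversized requests into a logged no-op instead of an error. Second, the copy count changes from the device-side size_ to the requested size: securec's memcpy_s(dest, destMax, src, count) rejects any count larger than destMax, so the old SyncDeviceToHost call failed outright whenever the device allocation was larger than the host tensor. A sketch of the fixed control flow, with std::memcpy standing in for memcpy_s and illustrative parameter names (host_size/device_size are not MindSpore API):

    #include <cstddef>
    #include <cstring>
    #include <iostream>

    // Sketch only: mirrors the guard + copy order of the fixed SyncDeviceToHost.
    bool SyncDeviceToHostSketch(void *host_ptr, std::size_t host_size,
                                const void *device_ptr, std::size_t device_size) {
      // Guard added by the commit: nothing to copy, or the request cannot fit.
      if (host_size == 0 || device_size == 0 || host_size > device_size) {
        std::cout << "No need sync, host size: " << host_size
                  << ", device size: " << device_size << "\n";
        return true;  // treated as success, not failure
      }
      // Copy exactly the bytes the host asked for (the old code passed the
      // device size as the count, which memcpy_s rejects whenever it exceeds
      // the host buffer's capacity).
      std::memcpy(host_ptr, device_ptr, host_size);
      return true;
    }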
@@ -33,10 +33,16 @@ namespace device {
 namespace gpu {
 bool GPUDeviceAddress::SyncDeviceToHost(size_t size, void *host_ptr) const {
   MS_EXCEPTION_IF_NULL(host_ptr);
   if (ptr_ == nullptr) {
     MS_LOG(ERROR) << "The device address is null!";
     return false;
   }
+  bool need_sync = (size != 0) && (size_ != 0) && (size <= size_);
+  if (!need_sync) {
+    MS_LOG(INFO) << "No need sync, host size: " << size << ", device size: " << size_;
+    return true;
+  }
+
   auto &stream = GPUDeviceManager::GetInstance().default_stream();
   MS_EXCEPTION_IF_NULL(stream);
   auto ret = GPUDeviceManager::GetInstance().SyncStream(stream);
@@ -53,8 +59,13 @@ bool GPUDeviceAddress::SyncDeviceToHost(size_t size, void *host_ptr) const {

 bool GPUDeviceAddress::SyncHostToDevice(size_t size, const void *host_ptr) const {
   MS_EXCEPTION_IF_NULL(host_ptr);
   if (ptr_ == nullptr) {
     MS_LOG(ERROR) << "The device address is null!";
     return false;
   }
+  bool need_sync = (size != 0) && (size_ != 0) && (size <= size_);
+  if (!need_sync) {
+    MS_LOG(INFO) << "No need sync, host size: " << size << ", device size: " << size_;
+    return true;
+  }
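The GPU hunks add the same size check, spelled as an explicit need_sync boolean and placed before the default-stream synchronization, so a degenerate request returns without forcing a full SyncStream. The predicate is now duplicated across the four CPU and GPU sync paths; a hypothetical consolidation (not part of the commit, just a sketch of the shared rule) could look like:

    #include <cstddef>

    // Hypothetical helper: the size rule shared by the CPU and GPU
    // SyncDeviceToHost/SyncHostToDevice paths after this commit.
    inline bool NeedSync(std::size_t host_size, std::size_t device_size) {
      return host_size != 0 && device_size != 0 && host_size <= device_size;
    }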
@@ -140,8 +140,13 @@ void DeviceQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *co
   }

   // Copy data from device queue by data kernel launching.
-  auto ret =
-    device_context_->LaunchKernel(data_kernel_, launch_info_.inputs_, launch_info_.workspaces_, launch_info_.outputs_);
+  bool ret = true;
+  try {
+    ret = device_context_->LaunchKernel(data_kernel_, launch_info_.inputs_, launch_info_.workspaces_,
+                                        launch_info_.outputs_);
+  } catch (const std::exception &e) {
+    MsException::Instance().SetException();
+  }
   if (!ret) {
     std::string error_info = "Launch kernel failed: " + data_kernel_->ToString();
     SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), error_info);
@@ -147,8 +147,13 @@ void KernelActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *context) {

   PreLaunchKernel(context);

-  auto ret = device_context_->LaunchKernel(kernel_, launch_info_.inputs_, launch_info_.workspaces_,
-                                           launch_info_.outputs_, is_dynamic_shape_);
+  bool ret = true;
+  try {
+    ret = device_context_->LaunchKernel(kernel_, launch_info_.inputs_, launch_info_.workspaces_, launch_info_.outputs_,
+                                        is_dynamic_shape_);
+  } catch (const std::exception &e) {
+    MsException::Instance().SetException();
+  }
   if (!ret) {
     std::string error_info = "Launch kernel failed: " + kernel_->ToString();
     SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), error_info);
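Both actor hunks wrap LaunchKernel the same way. Actors run on worker threads, and a C++ exception that escapes a thread's entry function terminates the process, so the catch block parks the exception via MsException::Instance().SetException() rather than letting it unwind. Note that ret is initialized to true, so a throwing launch does not take the SET_OPCONTEXT_FAIL_RET_WITH_ERROR path here; the stored exception surfaces later on the launching thread, as the GraphScheduler::Run hunk below shows (see the sketch after it).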
@@ -25,11 +25,9 @@ TensorPtr CreateOutputTensor(const AnfNodePtr &output_node, size_t output_index,
   MS_LOG(INFO) << "Create output tensor, output node: " << output_node->fullname_with_scope()
                << ", output index: " << output_index << ", output position: " << output_position;

-  // Create host tensor.
-  auto type_id = AnfAlgo::GetOutputDeviceDataType(output_node, output_index);
-  if (type_id == kTypeUnknown) {
-    type_id = AnfAlgo::GetOutputInferDataType(output_node, output_index);
-  }
+  // Create host tensor. The output tensor should use the infer type; it will be handled correctly by tensor data sync
+  // when the infer type is not equal to the device type.
+  auto type_id = AnfAlgo::GetOutputInferDataType(output_node, output_index);
   std::vector<int64_t> temp_shape;
   auto shape = AnfAlgo::GetOutputInferShape(output_node, output_index);
   (void)std::copy(shape.begin(), shape.end(), std::back_inserter(temp_shape));
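Dropping the GetOutputDeviceDataType branch means the host tensor is always created with the front-end infer type. This matters when kernel selection picked a different physical type on the device (for example, a float16 kernel for an output the graph infers as float32): the device-to-host data sync performs the conversion, so building the tensor with the device type would have exposed the wrong dtype to the caller.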
@@ -579,11 +579,8 @@ bool GraphScheduler::Run(const ActorSet *actor_set, GraphExecutionStrategy strat
   // Get the run result.
   auto result_future = result[0].GetFuture();
   result_future.Wait();
-  if (!result_future.IsOK()) {
-    return false;
-  }
-
-  return true;
+  MsException::Instance().CheckException();
+  return result_future.IsOK();
 }

 ActorSet *GraphScheduler::Fetch(const ActorInfo &actor_info) const {
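Run now rethrows whatever the actors parked: after the future completes, MsException::Instance().CheckException() is called before the future's status is returned directly. A minimal, self-contained sketch of this capture-and-rethrow pattern; MsExceptionLike is an illustrative stand-in for MindSpore's MsException, which may differ in detail:

    #include <exception>
    #include <iostream>
    #include <mutex>
    #include <stdexcept>
    #include <thread>

    // Illustrative stand-in for MsException: one slot that carries an
    // exception from a worker thread to the thread that checks results.
    class MsExceptionLike {
     public:
      static MsExceptionLike &Instance() {
        static MsExceptionLike instance;
        return instance;
      }
      // Worker side (the actors' catch blocks): remember the in-flight exception.
      void SetException() {
        std::lock_guard<std::mutex> lock(mutex_);
        exception_ptr_ = std::current_exception();
      }
      // Scheduler side (GraphScheduler::Run): rethrow the stored exception, if any.
      void CheckException() {
        std::exception_ptr tmp;
        {
          std::lock_guard<std::mutex> lock(mutex_);
          tmp = exception_ptr_;
          exception_ptr_ = nullptr;
        }
        if (tmp) std::rethrow_exception(tmp);
      }

     private:
      std::mutex mutex_;
      std::exception_ptr exception_ptr_;
    };

    int main() {
      std::thread worker([] {
        try {
          throw std::runtime_error("Launch kernel failed");  // stands in for a throwing LaunchKernel
        } catch (const std::exception &) {
          MsExceptionLike::Instance().SetException();  // park it; never let it escape the thread
        }
      });
      worker.join();  // stands in for result_future.Wait()
      try {
        MsExceptionLike::Instance().CheckException();  // rethrows on this thread
      } catch (const std::exception &e) {
        std::cout << "propagated: " << e.what() << "\n";
      }
      return 0;
    }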