diff --git a/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.cc b/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.cc index 640607b9e54..7ef1cb31571 100644 --- a/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.cc +++ b/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include "kernel/kernel.h" #include "device/cpu/cpu_device_address.h" #include "utils/context/ms_context.h" @@ -139,8 +140,12 @@ DeviceAddressPtr CPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t return std::make_shared(device_ptr, device_size, format, type_id); } -BaseRef CPUKernelRuntime::CreatTensorForOutput(const AnfNodePtr &input_node, size_t index, - const std::unordered_map &input_map) { +BaseRef CPUKernelRuntime::CreatTensorForOutput(const session::KernelWithIndex &kernel_with_index, + const std::unordered_map &input_map, + std::set *bound_addresses, + std::vector *need_sync_outputs) { + auto &input_node = kernel_with_index.first; + auto index = kernel_with_index.second; MS_EXCEPTION_IF_NULL(input_node); if (input_node->isa() && AnfAlgo::GetCNodeName(input_node) == prim::kPrimMakeTuple->name()) { auto cnode = input_node->cast(); @@ -148,7 +153,7 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(const AnfNodePtr &input_node, siz VectorRef ret; for (size_t i = 1; i < cnode->inputs().size(); i++) { auto item_with_index = AnfAlgo::VisitKernelWithReturnType(cnode->input(i), 0); - auto out = CreatTensorForOutput(item_with_index.first, item_with_index.second, input_map); + auto out = CreatTensorForOutput(item_with_index, input_map, bound_addresses, need_sync_outputs); ret.push_back(out); } return ret; @@ -169,11 +174,13 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(const AnfNodePtr &input_node, siz type_id = GetCPUSupportOutputTypeId(type_id); tensor::TensorPtr tensor = std::make_shared(type_id, temp_shape); MS_EXCEPTION_IF_NULL(tensor); - if (address->ref_count_ > 0 && address->ptr_ != nullptr) { + if (bound_addresses->find(address) != bound_addresses->end()) { tensor->set_device_address(address); + need_sync_outputs->emplace_back(tensor); } else { address->ptr_ = tensor->data_c(true); address->ref_count_ = INIT_NODE_REF; + (void)bound_addresses->insert(address); } tensor->set_dirty(false); return tensor; @@ -187,7 +194,8 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(const AnfNodePtr &input_node, siz } void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph, - const std::vector &inputs, VectorRef *outputs) { + const std::vector &inputs, VectorRef *outputs, + std::vector *need_sync_outputs) { MS_EXCEPTION_IF_NULL(kernel_graph); MS_EXCEPTION_IF_NULL(outputs); // bind input ptr @@ -195,10 +203,8 @@ void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph, if (input_nodes.size() != inputs.size()) { MS_LOG(EXCEPTION) << "Input size not equal to input node size!"; } - std::unordered_map input_map; size_t input_idx = 0; - size_t type_size = sizeof(float); for (auto &item : input_nodes) { MS_EXCEPTION_IF_NULL(item); input_map[item.get()] = inputs[input_idx]; @@ -212,7 +218,8 @@ void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph, (void)tensor->data_sync(); } std::vector data_shape = tensor->shape(); - size_t tensor_size = std::accumulate(data_shape.begin(), data_shape.end(), type_size, std::multiplies()); + size_t tensor_size = + std::accumulate(data_shape.begin(), data_shape.end(), sizeof(float), std::multiplies()); if (tensor->data_type() == kNumberTypeFloat32 || tensor->data_type() == kNumberTypeInt32) { address->ptr_ = tensor->data_c(false); } else { @@ -223,18 +230,17 @@ void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph, } tensor->set_dirty(true); } - address->ref_count_ = INIT_NODE_REF; tensor->set_device_address(address); } input_idx++; } - // new output and bind ptr + std::set bound_addresses; auto output_nodes = kernel_graph->outputs(); for (const auto &item : output_nodes) { auto item_with_index = AnfAlgo::VisitKernelWithReturnType(item, 0, true); - auto out = CreatTensorForOutput(item_with_index.first, item_with_index.second, input_map); + auto out = CreatTensorForOutput(item_with_index, input_map, &bound_addresses, need_sync_outputs); outputs->push_back(std::move(out)); } } diff --git a/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.h b/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.h index ac63f55d3ee..27dcefdba91 100644 --- a/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.h +++ b/mindspore/ccsrc/device/cpu/cpu_kernel_runtime.h @@ -20,10 +20,12 @@ #include #include #include +#include #include "device/kernel_runtime.h" #include "session/kernel_graph.h" #include "session/session_basic.h" #include "device/cpu/cpu_resource_manager.h" +#include "session/anf_runtime_algorithm.h" #include "utils/any.h" namespace mindspore { namespace device { @@ -37,7 +39,7 @@ class CPUKernelRuntime : public KernelRuntime { bool Run(session::KernelGraph *graph) override; void AssignKernelAddress(session::KernelGraph *kernel_graph); void BindInputOutput(const session::KernelGraph *kernel_graph, const std::vector &inputs, - VectorRef *outputs); + VectorRef *outputs, std::vector *need_sync_outputs); void IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs); void DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs); @@ -47,8 +49,10 @@ class CPUKernelRuntime : public KernelRuntime { TypeId type_id) override; private: - BaseRef CreatTensorForOutput(const AnfNodePtr &input_node, size_t index, - const std::unordered_map &input_map); + BaseRef CreatTensorForOutput(const session::KernelWithIndex &kernel_with_index, + const std::unordered_map &input_map, + std::set *bound_addresses, + std::vector *need_sync_outputs); void AssignValueNodeAddress(session::KernelGraph *kernel_graph); void AssignInputNodeAddress(const session::KernelGraph *kernel_graph); void AssignKernelOutputAddress(const session::KernelGraph *kernel_graph); diff --git a/mindspore/ccsrc/kernel/cpu/gather_cpu_kernel.cc b/mindspore/ccsrc/kernel/cpu/gather_cpu_kernel.cc index cb311043ac9..9117a533c8b 100644 --- a/mindspore/ccsrc/kernel/cpu/gather_cpu_kernel.cc +++ b/mindspore/ccsrc/kernel/cpu/gather_cpu_kernel.cc @@ -74,8 +74,8 @@ void GatherV2CPUKernel::CopyDataToOutput(const std::vector & size_t dim2, float **output_addr, size_t *buff_size) { auto input_addr = reinterpret_cast(inputs[0]->addr); auto indices_addr = reinterpret_cast(inputs[1]->addr); - - for (size_t i = 0; i < output_shape_[axis_]; ++i) { + size_t elem_num = inputs[1]->size / 4; + for (size_t i = 0; i < elem_num; ++i) { size_t index = IntToSize(indices_addr[i]); size_t pos = 0; if (axis_ == 3) { diff --git a/mindspore/ccsrc/session/cpu_session.cc b/mindspore/ccsrc/session/cpu_session.cc index 8d6bc0f2b9e..e70e5510227 100644 --- a/mindspore/ccsrc/session/cpu_session.cc +++ b/mindspore/ccsrc/session/cpu_session.cc @@ -63,7 +63,8 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector need_sync_outputs; + runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs, &need_sync_outputs); MS_LOG(INFO) << "Run graph start"; predictmodel::StepConvertWeight(inputs); auto execution_order = kernel_graph->execution_order(); @@ -82,6 +83,9 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vectordata_sync(); + } if (enable_summary) { Summary(kernel_graph.get());