!2394 sync cpu output if needed

Merge pull request !2394 from kisnwang/add-cpu-outputsync
This commit is contained in:
mindspore-ci-bot 2020-06-20 17:44:11 +08:00 committed by Gitee
commit 9969a9b07d
4 changed files with 31 additions and 17 deletions

View File

@ -21,6 +21,7 @@
#include <utility> #include <utility>
#include <functional> #include <functional>
#include <unordered_map> #include <unordered_map>
#include <set>
#include "kernel/kernel.h" #include "kernel/kernel.h"
#include "device/cpu/cpu_device_address.h" #include "device/cpu/cpu_device_address.h"
#include "utils/context/ms_context.h" #include "utils/context/ms_context.h"
@ -139,8 +140,12 @@ DeviceAddressPtr CPUKernelRuntime::CreateDeviceAddress(void *device_ptr, size_t
return std::make_shared<CPUDeviceAddress>(device_ptr, device_size, format, type_id); return std::make_shared<CPUDeviceAddress>(device_ptr, device_size, format, type_id);
} }
BaseRef CPUKernelRuntime::CreatTensorForOutput(const AnfNodePtr &input_node, size_t index, BaseRef CPUKernelRuntime::CreatTensorForOutput(const session::KernelWithIndex &kernel_with_index,
const std::unordered_map<AnfNode *, tensor::TensorPtr> &input_map) { const std::unordered_map<AnfNode *, tensor::TensorPtr> &input_map,
std::set<DeviceAddressPtr> *bound_addresses,
std::vector<tensor::TensorPtr> *need_sync_outputs) {
auto &input_node = kernel_with_index.first;
auto index = kernel_with_index.second;
MS_EXCEPTION_IF_NULL(input_node); MS_EXCEPTION_IF_NULL(input_node);
if (input_node->isa<CNode>() && AnfAlgo::GetCNodeName(input_node) == prim::kPrimMakeTuple->name()) { if (input_node->isa<CNode>() && AnfAlgo::GetCNodeName(input_node) == prim::kPrimMakeTuple->name()) {
auto cnode = input_node->cast<CNodePtr>(); auto cnode = input_node->cast<CNodePtr>();
@ -148,7 +153,7 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(const AnfNodePtr &input_node, siz
VectorRef ret; VectorRef ret;
for (size_t i = 1; i < cnode->inputs().size(); i++) { for (size_t i = 1; i < cnode->inputs().size(); i++) {
auto item_with_index = AnfAlgo::VisitKernelWithReturnType(cnode->input(i), 0); auto item_with_index = AnfAlgo::VisitKernelWithReturnType(cnode->input(i), 0);
auto out = CreatTensorForOutput(item_with_index.first, item_with_index.second, input_map); auto out = CreatTensorForOutput(item_with_index, input_map, bound_addresses, need_sync_outputs);
ret.push_back(out); ret.push_back(out);
} }
return ret; return ret;
@ -169,11 +174,13 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(const AnfNodePtr &input_node, siz
type_id = GetCPUSupportOutputTypeId(type_id); type_id = GetCPUSupportOutputTypeId(type_id);
tensor::TensorPtr tensor = std::make_shared<tensor::Tensor>(type_id, temp_shape); tensor::TensorPtr tensor = std::make_shared<tensor::Tensor>(type_id, temp_shape);
MS_EXCEPTION_IF_NULL(tensor); MS_EXCEPTION_IF_NULL(tensor);
if (address->ref_count_ > 0 && address->ptr_ != nullptr) { if (bound_addresses->find(address) != bound_addresses->end()) {
tensor->set_device_address(address); tensor->set_device_address(address);
need_sync_outputs->emplace_back(tensor);
} else { } else {
address->ptr_ = tensor->data_c(true); address->ptr_ = tensor->data_c(true);
address->ref_count_ = INIT_NODE_REF; address->ref_count_ = INIT_NODE_REF;
(void)bound_addresses->insert(address);
} }
tensor->set_dirty(false); tensor->set_dirty(false);
return tensor; return tensor;
@ -187,7 +194,8 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(const AnfNodePtr &input_node, siz
} }
void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph, void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph,
const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) { const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs,
std::vector<tensor::TensorPtr> *need_sync_outputs) {
MS_EXCEPTION_IF_NULL(kernel_graph); MS_EXCEPTION_IF_NULL(kernel_graph);
MS_EXCEPTION_IF_NULL(outputs); MS_EXCEPTION_IF_NULL(outputs);
// bind input ptr // bind input ptr
@ -195,10 +203,8 @@ void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph,
if (input_nodes.size() != inputs.size()) { if (input_nodes.size() != inputs.size()) {
MS_LOG(EXCEPTION) << "Input size not equal to input node size!"; MS_LOG(EXCEPTION) << "Input size not equal to input node size!";
} }
std::unordered_map<AnfNode *, tensor::TensorPtr> input_map; std::unordered_map<AnfNode *, tensor::TensorPtr> input_map;
size_t input_idx = 0; size_t input_idx = 0;
size_t type_size = sizeof(float);
for (auto &item : input_nodes) { for (auto &item : input_nodes) {
MS_EXCEPTION_IF_NULL(item); MS_EXCEPTION_IF_NULL(item);
input_map[item.get()] = inputs[input_idx]; input_map[item.get()] = inputs[input_idx];
@ -212,7 +218,8 @@ void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph,
(void)tensor->data_sync(); (void)tensor->data_sync();
} }
std::vector<int> data_shape = tensor->shape(); std::vector<int> data_shape = tensor->shape();
size_t tensor_size = std::accumulate(data_shape.begin(), data_shape.end(), type_size, std::multiplies<size_t>()); size_t tensor_size =
std::accumulate(data_shape.begin(), data_shape.end(), sizeof(float), std::multiplies<size_t>());
if (tensor->data_type() == kNumberTypeFloat32 || tensor->data_type() == kNumberTypeInt32) { if (tensor->data_type() == kNumberTypeFloat32 || tensor->data_type() == kNumberTypeInt32) {
address->ptr_ = tensor->data_c(false); address->ptr_ = tensor->data_c(false);
} else { } else {
@ -223,18 +230,17 @@ void CPUKernelRuntime::BindInputOutput(const session::KernelGraph *kernel_graph,
} }
tensor->set_dirty(true); tensor->set_dirty(true);
} }
address->ref_count_ = INIT_NODE_REF; address->ref_count_ = INIT_NODE_REF;
tensor->set_device_address(address); tensor->set_device_address(address);
} }
input_idx++; input_idx++;
} }
// new output and bind ptr // new output and bind ptr
std::set<DeviceAddressPtr> bound_addresses;
auto output_nodes = kernel_graph->outputs(); auto output_nodes = kernel_graph->outputs();
for (const auto &item : output_nodes) { for (const auto &item : output_nodes) {
auto item_with_index = AnfAlgo::VisitKernelWithReturnType(item, 0, true); auto item_with_index = AnfAlgo::VisitKernelWithReturnType(item, 0, true);
auto out = CreatTensorForOutput(item_with_index.first, item_with_index.second, input_map); auto out = CreatTensorForOutput(item_with_index, input_map, &bound_addresses, need_sync_outputs);
outputs->push_back(std::move(out)); outputs->push_back(std::move(out));
} }
} }

View File

@ -20,10 +20,12 @@
#include <vector> #include <vector>
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
#include <set>
#include "device/kernel_runtime.h" #include "device/kernel_runtime.h"
#include "session/kernel_graph.h" #include "session/kernel_graph.h"
#include "session/session_basic.h" #include "session/session_basic.h"
#include "device/cpu/cpu_resource_manager.h" #include "device/cpu/cpu_resource_manager.h"
#include "session/anf_runtime_algorithm.h"
#include "utils/any.h" #include "utils/any.h"
namespace mindspore { namespace mindspore {
namespace device { namespace device {
@ -37,7 +39,7 @@ class CPUKernelRuntime : public KernelRuntime {
bool Run(session::KernelGraph *graph) override; bool Run(session::KernelGraph *graph) override;
void AssignKernelAddress(session::KernelGraph *kernel_graph); void AssignKernelAddress(session::KernelGraph *kernel_graph);
void BindInputOutput(const session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs, void BindInputOutput(const session::KernelGraph *kernel_graph, const std::vector<tensor::TensorPtr> &inputs,
VectorRef *outputs); VectorRef *outputs, std::vector<tensor::TensorPtr> *need_sync_outputs);
void IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs); void IncreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs);
void DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs); void DecreaseSummaryRefCount(const session::NamedSummaryOutputs &summary_outputs);
@ -47,8 +49,10 @@ class CPUKernelRuntime : public KernelRuntime {
TypeId type_id) override; TypeId type_id) override;
private: private:
BaseRef CreatTensorForOutput(const AnfNodePtr &input_node, size_t index, BaseRef CreatTensorForOutput(const session::KernelWithIndex &kernel_with_index,
const std::unordered_map<AnfNode *, tensor::TensorPtr> &input_map); const std::unordered_map<AnfNode *, tensor::TensorPtr> &input_map,
std::set<DeviceAddressPtr> *bound_addresses,
std::vector<tensor::TensorPtr> *need_sync_outputs);
void AssignValueNodeAddress(session::KernelGraph *kernel_graph); void AssignValueNodeAddress(session::KernelGraph *kernel_graph);
void AssignInputNodeAddress(const session::KernelGraph *kernel_graph); void AssignInputNodeAddress(const session::KernelGraph *kernel_graph);
void AssignKernelOutputAddress(const session::KernelGraph *kernel_graph); void AssignKernelOutputAddress(const session::KernelGraph *kernel_graph);

View File

@ -74,8 +74,8 @@ void GatherV2CPUKernel::CopyDataToOutput(const std::vector<kernel::AddressPtr> &
size_t dim2, float **output_addr, size_t *buff_size) { size_t dim2, float **output_addr, size_t *buff_size) {
auto input_addr = reinterpret_cast<float *>(inputs[0]->addr); auto input_addr = reinterpret_cast<float *>(inputs[0]->addr);
auto indices_addr = reinterpret_cast<int *>(inputs[1]->addr); auto indices_addr = reinterpret_cast<int *>(inputs[1]->addr);
size_t elem_num = inputs[1]->size / 4;
for (size_t i = 0; i < output_shape_[axis_]; ++i) { for (size_t i = 0; i < elem_num; ++i) {
size_t index = IntToSize(indices_addr[i]); size_t index = IntToSize(indices_addr[i]);
size_t pos = 0; size_t pos = 0;
if (axis_ == 3) { if (axis_ == 3) {

View File

@ -63,7 +63,8 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
auto &kernel_graph = graphs_[graph_id]; auto &kernel_graph = graphs_[graph_id];
MS_EXCEPTION_IF_NULL(kernel_graph); MS_EXCEPTION_IF_NULL(kernel_graph);
MS_LOG(INFO) << "Bind input output address"; MS_LOG(INFO) << "Bind input output address";
runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs); std::vector<tensor::TensorPtr> need_sync_outputs;
runtime_.BindInputOutput(kernel_graph.get(), inputs, outputs, &need_sync_outputs);
MS_LOG(INFO) << "Run graph start"; MS_LOG(INFO) << "Run graph start";
predictmodel::StepConvertWeight(inputs); predictmodel::StepConvertWeight(inputs);
auto execution_order = kernel_graph->execution_order(); auto execution_order = kernel_graph->execution_order();
@ -82,6 +83,9 @@ void CPUSession::RunGraph(const GraphId &graph_id, const std::vector<tensor::Ten
if (!ret) { if (!ret) {
MS_LOG(EXCEPTION) << "Run graph failed"; MS_LOG(EXCEPTION) << "Run graph failed";
} }
for (auto output : need_sync_outputs) {
(void)output->data_sync();
}
if (enable_summary) { if (enable_summary) {
Summary(kernel_graph.get()); Summary(kernel_graph.get());