!22867 PyNative RunOp Performance Optimize

Merge pull request !22867 from caifubi/master-pynative-ascend-performance
i-robot 2021-09-06 01:24:02 +00:00 committed by Gitee
commit 682d0c7d4f
5 changed files with 102 additions and 9 deletions
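
In short, the change caches each node's output format, output device dtype, and output tensor size as OpRuntimeInfo user data while the single-op graph is built (PreBuildOp), so the per-launch RunOpMallocPre path can read the cached values instead of re-querying AnfAlgo on every run. A minimal standalone sketch of that build-once, read-many pattern follows; the types and Query* helpers are hypothetical stand-ins, not the MindSpore classes touched by this diff.

// build_once_read_many.cpp -- illustrative only; Node and the Query* helpers are
// hypothetical stand-ins for AnfNode and the AnfAlgo::GetOutput* queries in the diff below.
#include <cstddef>
#include <memory>
#include <string>
#include <vector>

struct OpRuntimeInfo {
  std::vector<std::string> output_format;
  std::vector<int> output_type;  // stand-in for TypeId
  std::vector<size_t> output_tensor_size;
};

struct Node {
  std::shared_ptr<OpRuntimeInfo> runtime_info;  // filled once at build time
};

// Expensive queries, executed only while building the single-op graph.
std::string QueryOutputFormat(const Node &, size_t) { return "DefaultFormat"; }
int QueryOutputType(const Node &, size_t) { return 0; }
size_t QueryOutputSize(const Node &, size_t) { return 1024; }

// Build time: run the queries once and cache the results on the node.
void CacheOutputInfo(Node *node, size_t output_num) {
  auto info = std::make_shared<OpRuntimeInfo>();
  for (size_t i = 0; i < output_num; ++i) {
    info->output_format.push_back(QueryOutputFormat(*node, i));
    info->output_type.push_back(QueryOutputType(*node, i));
    info->output_tensor_size.push_back(QueryOutputSize(*node, i));
  }
  node->runtime_info = std::move(info);
}

// Hot path: every op launch reads the cached values, no re-query.
size_t CachedOutputSize(const Node &node, size_t index) {
  return node.runtime_info->output_tensor_size.at(index);
}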

View File

@@ -2394,5 +2394,26 @@ void AnfRuntimeAlgorithm::CacheAddrForAtomicClean(const AnfNodePtr &node, kernel
}
kernel_mod->set_inputs_addr(kernel_inputs);
}

std::string OpRuntimeInfo::output_format(size_t index) const {
if (index >= output_format_.size()) {
MS_LOG(EXCEPTION) << "Invalid index:" << index << " total output_format:" << output_format_.size();
}
return output_format_[index];
}

TypeId OpRuntimeInfo::output_type(size_t index) const {
if (index >= output_type_.size()) {
MS_LOG(EXCEPTION) << "Invalid index:" << index << " total output_type:" << output_type_.size();
}
return output_type_[index];
}

size_t OpRuntimeInfo::output_tensor_size(size_t index) const {
if (index >= output_tensor_size_.size()) {
MS_LOG(EXCEPTION) << "Invalid index:" << index << " total output_tensor_size:" << output_tensor_size_.size();
}
return output_tensor_size_[index];
}
} // namespace session
} // namespace mindspore

View File

@@ -58,6 +58,28 @@ struct KernelWithIndexCmp {
}
};

class OpRuntimeInfo {
public:
OpRuntimeInfo(std::vector<std::string> output_format, std::vector<TypeId> output_type,
std::vector<size_t> output_tensor_size)
: output_format_(std::move(output_format)),
output_type_(std::move(output_type)),
output_tensor_size_(std::move(output_tensor_size)) {}
~OpRuntimeInfo() = default;
// Key for user data.
constexpr static char key[] = "OpRuntimeInfo";
std::string output_format(size_t index) const;
TypeId output_type(size_t index) const;
size_t output_tensor_size(size_t index) const;

private:
std::vector<std::string> output_format_;
std::vector<TypeId> output_type_;
std::vector<size_t> output_tensor_size_;
};

class AnfRuntimeAlgorithm {
public:
static AnfNodePtr MakeMonadValueNode(const KernelGraphPtr &kg);
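
The constexpr static key member is what lets a node carry the cached info as typed user data: the set_user_data<OpRuntimeInfo> and user_data<OpRuntimeInfo> calls used later in this diff presumably look the object up by T::key. A minimal sketch of such a string-keyed store, written under that assumption (UserDataHolder is hypothetical, not the real AnfNode API):

// user_data_sketch.cpp -- hypothetical simplification of a per-node user-data store.
#include <map>
#include <memory>
#include <string>

class UserDataHolder {
 public:
  // T must expose `constexpr static char key[]`, as OpRuntimeInfo does above.
  template <typename T>
  void set_user_data(const std::shared_ptr<T> &value) {
    data_[T::key] = value;
  }

  template <typename T>
  std::shared_ptr<T> user_data() const {
    auto it = data_.find(T::key);
    return it == data_.end() ? nullptr : std::static_pointer_cast<T>(it->second);
  }

 private:
  std::map<std::string, std::shared_ptr<void>> data_;
};

// Usage mirrors the diff: holder.set_user_data<OpRuntimeInfo>(info); auto cached = holder.user_data<OpRuntimeInfo>();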

View File

@@ -50,6 +50,7 @@
#include "runtime/device/ascend/ascend_stream_assign.h"
#include "backend/session/anf_runtime_algorithm.h"
#include "utils/ms_utils.h"
#include "utils/utils.h"
#include "utils/context/graph_kernel_flags.h"
#include "backend/optimizer/common/helper.h"
#include "runtime/device/kernel_runtime_manager.h"
@@ -866,9 +867,54 @@ KernelGraphPtr AscendSession::PreBuildOp(const OpRunInfo &op_run_info,
opt::RunOpAscendBackendIRFusionOptimization(graph);
SelectKernel(*graph);
RunOpHardwareOptimize(graph);
CacheCNodeOutputInfo(*graph);
return graph;
}

void AscendSession::CacheCNodeOutputInfo(const KernelGraph &graph) const {
auto &nodes = graph.execution_order();
for (auto const &node : nodes) {
MS_EXCEPTION_IF_NULL(node);
std::vector<std::string> formats;
std::vector<TypeId> types;
std::vector<size_t> tensor_sizes;
auto output_num = AnfAlgo::GetOutputTensorNum(node);
for (size_t i = 0; i < output_num; ++i) {
std::string output_format = AnfAlgo::GetOutputFormat(node, i);
auto output_type = AnfAlgo::GetOutputDeviceDataType(node, i);
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(node, i);
formats.emplace_back(output_format);
types.emplace_back(output_type);
tensor_sizes.emplace_back(tensor_size);
}
node->set_user_data<OpRuntimeInfo>(std::make_shared<OpRuntimeInfo>(formats, types, tensor_sizes));
}
auto &inputs = graph.inputs();
for (const auto &input : inputs) {
MS_EXCEPTION_IF_NULL(input);
if (!input->isa<Parameter>()) {
continue;
}
std::vector<std::string> formats;
std::vector<TypeId> types;
std::vector<size_t> tensor_sizes;
auto output_size = AnfAlgo::GetOutputTensorNum(input);
for (size_t index = 0; index < output_size; index++) {
auto format = AnfAlgo::GetOutputFormat(input, index);
auto type_id = AnfAlgo::GetOutputDeviceDataType(input, index);
if (type_id == kTypeUnknown) {
type_id = AnfAlgo::GetOutputInferDataType(input, index);
}
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(input, index);
formats.emplace_back(format);
types.emplace_back(type_id);
tensor_sizes.emplace_back(tensor_size);
}
input->set_user_data<OpRuntimeInfo>(std::make_shared<OpRuntimeInfo>(formats, types, tensor_sizes));
}
}

void AscendSession::GetOpInputStubTensors(const CNodePtr &cnode, const std::map<AnfNodePtr, size_t> &parameter_index,
const std::vector<tensor::TensorPtr> &graph_inputs,
const std::map<KernelWithIndex, OutputTensorInfo> &node_output_info,
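
CacheCNodeOutputInfo above fills the cache in two passes: one over the execution-order kernels and one over the graph's Parameter inputs, where a device dtype may not have been selected yet, hence the fallback from GetOutputDeviceDataType to GetOutputInferDataType. The same fallback in isolation, with stand-in types (illustrative only, ChooseOutputType is hypothetical):

// dtype_fallback.cpp -- stand-in TypeId/kTypeUnknown; the real code uses
// AnfAlgo::GetOutputDeviceDataType and AnfAlgo::GetOutputInferDataType.
#include <cstdint>

using TypeId = int32_t;
constexpr TypeId kTypeUnknown = 0;

// Prefer the kernel-selected device dtype; fall back to the inferred dtype when none is set.
TypeId ChooseOutputType(TypeId device_type, TypeId inferred_type) {
  return device_type == kTypeUnknown ? inferred_type : device_type;
}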

View File

@@ -131,6 +131,7 @@ class AscendSession : public SessionBasic {
void LoadGraphsToDbg(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
void AssignStaticMemory(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
void UpdateRefOutputMap(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
void CacheCNodeOutputInfo(const KernelGraph &graph) const;
KernelGraphPtr PreBuildOp(const OpRunInfo &op_run_info, const std::vector<tensor::TensorPtr> &input_tensors,
const std::vector<int64_t> &tensors_mask);
void GetOpInputStubTensors(const CNodePtr &cnode, const std::map<AnfNodePtr, size_t> &parameter_index,

View File

@@ -99,9 +99,12 @@ void KernelRuntime::RunOpMallocPre(const session::KernelGraph &graph,
for (const auto &node : nodes) {
auto output_num = AnfAlgo::GetOutputTensorNum(node);
for (size_t i = 0; i < output_num; ++i) {
std::string output_format = AnfAlgo::GetOutputFormat(node, i);
auto output_type = AnfAlgo::GetOutputDeviceDataType(node, i);
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(node, i);
MS_EXCEPTION_IF_NULL(node);
auto runtime_info = node->user_data<session::OpRuntimeInfo>();
MS_EXCEPTION_IF_NULL(runtime_info);
auto const &output_format = runtime_info->output_format(i);
auto output_type = runtime_info->output_type(i);
auto tensor_size = runtime_info->output_tensor_size(i);
// Create DeviceAddress without ptr.
// Get real device ptr after KernelBuild finish.
auto device_address = CreateDeviceAddress(nullptr, tensor_size, output_format, output_type);
@@ -130,13 +133,13 @@ void KernelRuntime::RunOpMallocPre(const session::KernelGraph &graph,
AnfAlgo::SetOutputAddr(output_address, index, item.get());
continue;
}
TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(item, index);
if (output_type_id == kTypeUnknown) {
output_type_id = AnfAlgo::GetOutputInferDataType(item, index);
}
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(item, index);
auto op_runtime_info = item->user_data<session::OpRuntimeInfo>();
MS_EXCEPTION_IF_NULL(op_runtime_info);
TypeId output_type_id = op_runtime_info->output_type(index);
auto output_tensor_size = op_runtime_info->output_tensor_size(index);
auto output_format = op_runtime_info->output_format(index);
auto device_address =
CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id, {item, index});
CreateDeviceAddress(nullptr, output_tensor_size, output_format, output_type_id, {item, index});
AnfAlgo::SetOutputAddr(device_address, index, item.get());
current_tensor->set_device_address(device_address);
current_tensor->set_sync_status(kNeedSyncHostToDevice);
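
On the consumer side, RunOpMallocPre now only dereferences the cached OpRuntimeInfo, and the MS_EXCEPTION_IF_NULL guard makes the ordering explicit: CacheCNodeOutputInfo must have run during PreBuildOp before any malloc-pre pass. A standalone sketch of that consumer step with simplified stand-in types (the real code creates a MindSpore DeviceAddress with a null pointer and binds actual device memory after kernel build):

// malloc_pre_sketch.cpp -- illustrative consumer of the cached info; not MindSpore code.
#include <cstddef>
#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

struct OpRuntimeInfo {
  std::vector<std::string> output_format;
  std::vector<int> output_type;
  std::vector<size_t> output_tensor_size;
};

struct DeviceAddress {
  void *ptr;  // left null here; bound to real device memory after kernel build
  size_t size;
  std::string format;
  int type;
};

std::shared_ptr<DeviceAddress> MakePlaceholderAddress(const std::shared_ptr<OpRuntimeInfo> &info, size_t index) {
  if (info == nullptr) {
    // Mirrors MS_EXCEPTION_IF_NULL: the cache must be populated at build time.
    throw std::runtime_error("OpRuntimeInfo was not cached for this node");
  }
  return std::make_shared<DeviceAddress>(DeviceAddress{nullptr, info->output_tensor_size.at(index),
                                                       info->output_format.at(index), info->output_type.at(index)});
}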