forked from mindspore-Ecosystem/mindspore
!22867 PyNative RunOp Performance Optimize
Merge pull request !22867 from caifubi/master-pynative-ascend-performance
This commit is contained in:
commit
682d0c7d4f
|
@ -2394,5 +2394,26 @@ void AnfRuntimeAlgorithm::CacheAddrForAtomicClean(const AnfNodePtr &node, kernel
|
||||||
}
|
}
|
||||||
kernel_mod->set_inputs_addr(kernel_inputs);
|
kernel_mod->set_inputs_addr(kernel_inputs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::string OpRuntimeInfo::output_format(size_t index) const {
|
||||||
|
if (index >= output_format_.size()) {
|
||||||
|
MS_LOG(EXCEPTION) << "Invalid index:" << index << " total output_format:" << output_format_.size();
|
||||||
|
}
|
||||||
|
return output_format_[index];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return the cached device data type of output `index`.
// An out-of-range index is a hard error (MS_LOG(EXCEPTION) throws).
TypeId OpRuntimeInfo::output_type(size_t index) const {
  const auto total = output_type_.size();
  if (index >= total) {
    MS_LOG(EXCEPTION) << "Invalid index:" << index << " total output_type:" << total;
  }
  return output_type_[index];
}
|
||||||
|
|
||||||
|
size_t OpRuntimeInfo::output_tensor_size(size_t index) const {
|
||||||
|
if (index >= output_tensor_size_.size()) {
|
||||||
|
MS_LOG(EXCEPTION) << "Invalid index::" << index << " total output_tensor_size:" << output_tensor_size_.size();
|
||||||
|
}
|
||||||
|
return output_tensor_size_[index];
|
||||||
|
}
|
||||||
} // namespace session
|
} // namespace session
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
|
@ -58,6 +58,28 @@ struct KernelWithIndexCmp {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Per-node cache of output meta information (format, device data type,
// tensor memory size). Attached to an AnfNode as user data so the PyNative
// RunOp path can read these values instead of re-querying AnfAlgo each time.
class OpRuntimeInfo {
 public:
  // The three vectors are parallel: entry i describes output i.
  // Taken by value and moved, so callers may std::move their vectors in.
  OpRuntimeInfo(std::vector<std::string> output_format, std::vector<TypeId> output_type,
                std::vector<size_t> output_tensor_size)
      : output_format_(std::move(output_format)),
        output_type_(std::move(output_type)),
        output_tensor_size_(std::move(output_tensor_size)) {}
  ~OpRuntimeInfo() = default;

  // Key for user data.
  constexpr static char key[] = "OpRuntimeInfo";

  // Bounds-checked accessors; an out-of-range index raises MS_LOG(EXCEPTION).
  std::string output_format(size_t index) const;
  TypeId output_type(size_t index) const;
  size_t output_tensor_size(size_t index) const;

 private:
  std::vector<std::string> output_format_;  // device format per output
  std::vector<TypeId> output_type_;         // device data type per output
  std::vector<size_t> output_tensor_size_;  // tensor memory size per output
};
|
||||||
|
|
||||||
class AnfRuntimeAlgorithm {
|
class AnfRuntimeAlgorithm {
|
||||||
public:
|
public:
|
||||||
static AnfNodePtr MakeMonadValueNode(const KernelGraphPtr &kg);
|
static AnfNodePtr MakeMonadValueNode(const KernelGraphPtr &kg);
|
||||||
|
|
|
@ -50,6 +50,7 @@
|
||||||
#include "runtime/device/ascend/ascend_stream_assign.h"
|
#include "runtime/device/ascend/ascend_stream_assign.h"
|
||||||
#include "backend/session/anf_runtime_algorithm.h"
|
#include "backend/session/anf_runtime_algorithm.h"
|
||||||
#include "utils/ms_utils.h"
|
#include "utils/ms_utils.h"
|
||||||
|
#include "utils/utils.h"
|
||||||
#include "utils/context/graph_kernel_flags.h"
|
#include "utils/context/graph_kernel_flags.h"
|
||||||
#include "backend/optimizer/common/helper.h"
|
#include "backend/optimizer/common/helper.h"
|
||||||
#include "runtime/device/kernel_runtime_manager.h"
|
#include "runtime/device/kernel_runtime_manager.h"
|
||||||
|
@ -866,9 +867,54 @@ KernelGraphPtr AscendSession::PreBuildOp(const OpRunInfo &op_run_info,
|
||||||
opt::RunOpAscendBackendIRFusionOptimization(graph);
|
opt::RunOpAscendBackendIRFusionOptimization(graph);
|
||||||
SelectKernel(*graph);
|
SelectKernel(*graph);
|
||||||
RunOpHardwareOptimize(graph);
|
RunOpHardwareOptimize(graph);
|
||||||
|
CacheCNodeOutputInfo(*graph);
|
||||||
return graph;
|
return graph;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void AscendSession::CacheCNodeOutputInfo(const KernelGraph &graph) const {
|
||||||
|
auto &nodes = graph.execution_order();
|
||||||
|
for (auto const &node : nodes) {
|
||||||
|
std::vector<std::string> formats;
|
||||||
|
std::vector<TypeId> types;
|
||||||
|
std::vector<size_t> tensor_sizes;
|
||||||
|
auto output_num = AnfAlgo::GetOutputTensorNum(node);
|
||||||
|
for (size_t i = 0; i < output_num; ++i) {
|
||||||
|
std::string output_format = AnfAlgo::GetOutputFormat(node, i);
|
||||||
|
auto output_type = AnfAlgo::GetOutputDeviceDataType(node, i);
|
||||||
|
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(node, i);
|
||||||
|
formats.emplace_back(output_format);
|
||||||
|
types.emplace_back(output_type);
|
||||||
|
tensor_sizes.emplace_back(tensor_size);
|
||||||
|
}
|
||||||
|
MS_EXCEPTION_IF_NULL(node);
|
||||||
|
node->set_user_data<OpRuntimeInfo>(std::make_shared<OpRuntimeInfo>(formats, types, tensor_sizes));
|
||||||
|
}
|
||||||
|
|
||||||
|
auto &inputs = graph.inputs();
|
||||||
|
for (const auto &input : inputs) {
|
||||||
|
MS_EXCEPTION_IF_NULL(input);
|
||||||
|
if (!input->isa<Parameter>()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
std::vector<std::string> formats;
|
||||||
|
std::vector<TypeId> types;
|
||||||
|
std::vector<size_t> tensor_sizes;
|
||||||
|
auto output_size = AnfAlgo::GetOutputTensorNum(input);
|
||||||
|
for (size_t index = 0; index < output_size; index++) {
|
||||||
|
auto format = AnfAlgo::GetOutputFormat(input, index);
|
||||||
|
auto type_id = AnfAlgo::GetOutputDeviceDataType(input, index);
|
||||||
|
if (type_id == kTypeUnknown) {
|
||||||
|
type_id = AnfAlgo::GetOutputInferDataType(input, index);
|
||||||
|
}
|
||||||
|
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(input, index);
|
||||||
|
formats.emplace_back(format);
|
||||||
|
types.emplace_back(type_id);
|
||||||
|
tensor_sizes.emplace_back(tensor_size);
|
||||||
|
}
|
||||||
|
input->set_user_data<OpRuntimeInfo>(std::make_shared<OpRuntimeInfo>(formats, types, tensor_sizes));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void AscendSession::GetOpInputStubTensors(const CNodePtr &cnode, const std::map<AnfNodePtr, size_t> ¶meter_index,
|
void AscendSession::GetOpInputStubTensors(const CNodePtr &cnode, const std::map<AnfNodePtr, size_t> ¶meter_index,
|
||||||
const std::vector<tensor::TensorPtr> &graph_inputs,
|
const std::vector<tensor::TensorPtr> &graph_inputs,
|
||||||
const std::map<KernelWithIndex, OutputTensorInfo> &node_output_info,
|
const std::map<KernelWithIndex, OutputTensorInfo> &node_output_info,
|
||||||
|
|
|
@ -131,6 +131,7 @@ class AscendSession : public SessionBasic {
|
||||||
void LoadGraphsToDbg(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
|
void LoadGraphsToDbg(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
|
||||||
void AssignStaticMemory(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
|
void AssignStaticMemory(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
|
||||||
void UpdateRefOutputMap(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
|
void UpdateRefOutputMap(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
|
||||||
|
void CacheCNodeOutputInfo(const KernelGraph &graph) const;
|
||||||
KernelGraphPtr PreBuildOp(const OpRunInfo &op_run_info, const std::vector<tensor::TensorPtr> &input_tensors,
|
KernelGraphPtr PreBuildOp(const OpRunInfo &op_run_info, const std::vector<tensor::TensorPtr> &input_tensors,
|
||||||
const std::vector<int64_t> &tensors_mask);
|
const std::vector<int64_t> &tensors_mask);
|
||||||
void GetOpInputStubTensors(const CNodePtr &cnode, const std::map<AnfNodePtr, size_t> ¶meter_index,
|
void GetOpInputStubTensors(const CNodePtr &cnode, const std::map<AnfNodePtr, size_t> ¶meter_index,
|
||||||
|
|
|
@ -99,9 +99,12 @@ void KernelRuntime::RunOpMallocPre(const session::KernelGraph &graph,
|
||||||
for (const auto &node : nodes) {
|
for (const auto &node : nodes) {
|
||||||
auto output_num = AnfAlgo::GetOutputTensorNum(node);
|
auto output_num = AnfAlgo::GetOutputTensorNum(node);
|
||||||
for (size_t i = 0; i < output_num; ++i) {
|
for (size_t i = 0; i < output_num; ++i) {
|
||||||
std::string output_format = AnfAlgo::GetOutputFormat(node, i);
|
MS_EXCEPTION_IF_NULL(node);
|
||||||
auto output_type = AnfAlgo::GetOutputDeviceDataType(node, i);
|
auto runtime_info = node->user_data<session::OpRuntimeInfo>();
|
||||||
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(node, i);
|
MS_EXCEPTION_IF_NULL(runtime_info);
|
||||||
|
auto const &output_format = runtime_info->output_format(i);
|
||||||
|
auto output_type = runtime_info->output_type(i);
|
||||||
|
auto tensor_size = runtime_info->output_tensor_size(i);
|
||||||
// Create DeviceAddress without ptr.
|
// Create DeviceAddress without ptr.
|
||||||
// Get real device ptr after KernelBuild finish.
|
// Get real device ptr after KernelBuild finish.
|
||||||
auto device_address = CreateDeviceAddress(nullptr, tensor_size, output_format, output_type);
|
auto device_address = CreateDeviceAddress(nullptr, tensor_size, output_format, output_type);
|
||||||
|
@ -130,13 +133,13 @@ void KernelRuntime::RunOpMallocPre(const session::KernelGraph &graph,
|
||||||
AnfAlgo::SetOutputAddr(output_address, index, item.get());
|
AnfAlgo::SetOutputAddr(output_address, index, item.get());
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(item, index);
|
auto op_runtime_info = item->user_data<session::OpRuntimeInfo>();
|
||||||
if (output_type_id == kTypeUnknown) {
|
|
||||||
output_type_id = AnfAlgo::GetOutputInferDataType(item, index);
|
TypeId output_type_id = op_runtime_info->output_type(index);
|
||||||
}
|
auto output_tensor_size = op_runtime_info->output_tensor_size(index);
|
||||||
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(item, index);
|
auto output_format = op_runtime_info->output_format(index);
|
||||||
auto device_address =
|
auto device_address =
|
||||||
CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id, {item, index});
|
CreateDeviceAddress(nullptr, output_tensor_size, output_format, output_type_id, {item, index});
|
||||||
AnfAlgo::SetOutputAddr(device_address, index, item.get());
|
AnfAlgo::SetOutputAddr(device_address, index, item.get());
|
||||||
current_tensor->set_device_address(device_address);
|
current_tensor->set_device_address(device_address);
|
||||||
current_tensor->set_sync_status(kNeedSyncHostToDevice);
|
current_tensor->set_sync_status(kNeedSyncHostToDevice);
|
||||||
|
|
Loading…
Reference in New Issue