forked from mindspore-Ecosystem/mindspore
!22867 PyNative RunOp Performance Optimize
Merge pull request !22867 from caifubi/master-pynative-ascend-performance
This commit is contained in:
commit
682d0c7d4f
|
@ -2394,5 +2394,26 @@ void AnfRuntimeAlgorithm::CacheAddrForAtomicClean(const AnfNodePtr &node, kernel
|
|||
}
|
||||
kernel_mod->set_inputs_addr(kernel_inputs);
|
||||
}
|
||||
|
||||
std::string OpRuntimeInfo::output_format(size_t index) const {
|
||||
if (index >= output_format_.size()) {
|
||||
MS_LOG(EXCEPTION) << "Invalid index:" << index << " total output_format:" << output_format_.size();
|
||||
}
|
||||
return output_format_[index];
|
||||
}
|
||||
|
||||
// Returns the cached device data type for output slot `index`.
// Raises MS_LOG(EXCEPTION) when the slot index is out of range.
TypeId OpRuntimeInfo::output_type(size_t index) const {
  const auto total = output_type_.size();
  if (index < total) {
    return output_type_[index];
  }
  MS_LOG(EXCEPTION) << "Invalid index:" << index << " total output_type:" << total;
}
|
||||
|
||||
// Returns the cached output tensor memory size (bytes) for output slot `index`.
// Raises MS_LOG(EXCEPTION) when the slot index is out of range.
size_t OpRuntimeInfo::output_tensor_size(size_t index) const {
  if (index >= output_tensor_size_.size()) {
    // Fix: message previously read "Invalid index::" (double colon), which was
    // inconsistent with output_format()/output_type().
    MS_LOG(EXCEPTION) << "Invalid index:" << index << " total output_tensor_size:" << output_tensor_size_.size();
  }
  return output_tensor_size_[index];
}
|
||||
} // namespace session
|
||||
} // namespace mindspore
|
||||
|
|
|
@ -58,6 +58,28 @@ struct KernelWithIndexCmp {
|
|||
}
|
||||
};
|
||||
|
||||
// Per-node cache of output device information (format, device data type and
// tensor memory size). Attached to AnfNodes as user data (see key below) so
// that later RunOp memory allocation can read the values back instead of
// re-querying AnfRuntimeAlgorithm on every launch.
class OpRuntimeInfo {
 public:
  // Takes the per-output vectors by value and moves them into the members.
  // Index i of each vector describes output i of the node.
  OpRuntimeInfo(std::vector<std::string> output_format, std::vector<TypeId> output_type,
                std::vector<size_t> output_tensor_size)
      : output_format_(std::move(output_format)),
        output_type_(std::move(output_type)),
        output_tensor_size_(std::move(output_tensor_size)) {}
  ~OpRuntimeInfo() = default;

  // Key for user data.
  constexpr static char key[] = "OpRuntimeInfo";

  // Accessors; each raises MS_LOG(EXCEPTION) when index is out of range.
  std::string output_format(size_t index) const;
  TypeId output_type(size_t index) const;
  size_t output_tensor_size(size_t index) const;

 private:
  std::vector<std::string> output_format_;
  std::vector<TypeId> output_type_;
  std::vector<size_t> output_tensor_size_;
};
|
||||
|
||||
class AnfRuntimeAlgorithm {
|
||||
public:
|
||||
static AnfNodePtr MakeMonadValueNode(const KernelGraphPtr &kg);
|
||||
|
|
|
@ -50,6 +50,7 @@
|
|||
#include "runtime/device/ascend/ascend_stream_assign.h"
|
||||
#include "backend/session/anf_runtime_algorithm.h"
|
||||
#include "utils/ms_utils.h"
|
||||
#include "utils/utils.h"
|
||||
#include "utils/context/graph_kernel_flags.h"
|
||||
#include "backend/optimizer/common/helper.h"
|
||||
#include "runtime/device/kernel_runtime_manager.h"
|
||||
|
@ -866,9 +867,54 @@ KernelGraphPtr AscendSession::PreBuildOp(const OpRunInfo &op_run_info,
|
|||
opt::RunOpAscendBackendIRFusionOptimization(graph);
|
||||
SelectKernel(*graph);
|
||||
RunOpHardwareOptimize(graph);
|
||||
CacheCNodeOutputInfo(*graph);
|
||||
return graph;
|
||||
}
|
||||
|
||||
void AscendSession::CacheCNodeOutputInfo(const KernelGraph &graph) const {
|
||||
auto &nodes = graph.execution_order();
|
||||
for (auto const &node : nodes) {
|
||||
std::vector<std::string> formats;
|
||||
std::vector<TypeId> types;
|
||||
std::vector<size_t> tensor_sizes;
|
||||
auto output_num = AnfAlgo::GetOutputTensorNum(node);
|
||||
for (size_t i = 0; i < output_num; ++i) {
|
||||
std::string output_format = AnfAlgo::GetOutputFormat(node, i);
|
||||
auto output_type = AnfAlgo::GetOutputDeviceDataType(node, i);
|
||||
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(node, i);
|
||||
formats.emplace_back(output_format);
|
||||
types.emplace_back(output_type);
|
||||
tensor_sizes.emplace_back(tensor_size);
|
||||
}
|
||||
MS_EXCEPTION_IF_NULL(node);
|
||||
node->set_user_data<OpRuntimeInfo>(std::make_shared<OpRuntimeInfo>(formats, types, tensor_sizes));
|
||||
}
|
||||
|
||||
auto &inputs = graph.inputs();
|
||||
for (const auto &input : inputs) {
|
||||
MS_EXCEPTION_IF_NULL(input);
|
||||
if (!input->isa<Parameter>()) {
|
||||
continue;
|
||||
}
|
||||
std::vector<std::string> formats;
|
||||
std::vector<TypeId> types;
|
||||
std::vector<size_t> tensor_sizes;
|
||||
auto output_size = AnfAlgo::GetOutputTensorNum(input);
|
||||
for (size_t index = 0; index < output_size; index++) {
|
||||
auto format = AnfAlgo::GetOutputFormat(input, index);
|
||||
auto type_id = AnfAlgo::GetOutputDeviceDataType(input, index);
|
||||
if (type_id == kTypeUnknown) {
|
||||
type_id = AnfAlgo::GetOutputInferDataType(input, index);
|
||||
}
|
||||
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(input, index);
|
||||
formats.emplace_back(format);
|
||||
types.emplace_back(type_id);
|
||||
tensor_sizes.emplace_back(tensor_size);
|
||||
}
|
||||
input->set_user_data<OpRuntimeInfo>(std::make_shared<OpRuntimeInfo>(formats, types, tensor_sizes));
|
||||
}
|
||||
}
|
||||
|
||||
void AscendSession::GetOpInputStubTensors(const CNodePtr &cnode, const std::map<AnfNodePtr, size_t> ¶meter_index,
|
||||
const std::vector<tensor::TensorPtr> &graph_inputs,
|
||||
const std::map<KernelWithIndex, OutputTensorInfo> &node_output_info,
|
||||
|
|
|
@ -131,6 +131,7 @@ class AscendSession : public SessionBasic {
|
|||
void LoadGraphsToDbg(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
|
||||
void AssignStaticMemory(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
|
||||
void UpdateRefOutputMap(const NotNull<KernelGraphPtr> graph, NotNull<std::set<KernelGraphPtr> *> memo) const;
|
||||
void CacheCNodeOutputInfo(const KernelGraph &graph) const;
|
||||
KernelGraphPtr PreBuildOp(const OpRunInfo &op_run_info, const std::vector<tensor::TensorPtr> &input_tensors,
|
||||
const std::vector<int64_t> &tensors_mask);
|
||||
void GetOpInputStubTensors(const CNodePtr &cnode, const std::map<AnfNodePtr, size_t> ¶meter_index,
|
||||
|
|
|
@ -99,9 +99,12 @@ void KernelRuntime::RunOpMallocPre(const session::KernelGraph &graph,
|
|||
for (const auto &node : nodes) {
|
||||
auto output_num = AnfAlgo::GetOutputTensorNum(node);
|
||||
for (size_t i = 0; i < output_num; ++i) {
|
||||
std::string output_format = AnfAlgo::GetOutputFormat(node, i);
|
||||
auto output_type = AnfAlgo::GetOutputDeviceDataType(node, i);
|
||||
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(node, i);
|
||||
MS_EXCEPTION_IF_NULL(node);
|
||||
auto runtime_info = node->user_data<session::OpRuntimeInfo>();
|
||||
MS_EXCEPTION_IF_NULL(runtime_info);
|
||||
auto const &output_format = runtime_info->output_format(i);
|
||||
auto output_type = runtime_info->output_type(i);
|
||||
auto tensor_size = runtime_info->output_tensor_size(i);
|
||||
// Create DeviceAddress without ptr.
|
||||
// Get real device ptr after KernelBuild finish.
|
||||
auto device_address = CreateDeviceAddress(nullptr, tensor_size, output_format, output_type);
|
||||
|
@ -130,13 +133,13 @@ void KernelRuntime::RunOpMallocPre(const session::KernelGraph &graph,
|
|||
AnfAlgo::SetOutputAddr(output_address, index, item.get());
|
||||
continue;
|
||||
}
|
||||
TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(item, index);
|
||||
if (output_type_id == kTypeUnknown) {
|
||||
output_type_id = AnfAlgo::GetOutputInferDataType(item, index);
|
||||
}
|
||||
auto tensor_size = AnfAlgo::GetOutputTensorMemSize(item, index);
|
||||
auto op_runtime_info = item->user_data<session::OpRuntimeInfo>();
|
||||
|
||||
TypeId output_type_id = op_runtime_info->output_type(index);
|
||||
auto output_tensor_size = op_runtime_info->output_tensor_size(index);
|
||||
auto output_format = op_runtime_info->output_format(index);
|
||||
auto device_address =
|
||||
CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id, {item, index});
|
||||
CreateDeviceAddress(nullptr, output_tensor_size, output_format, output_type_id, {item, index});
|
||||
AnfAlgo::SetOutputAddr(device_address, index, item.get());
|
||||
current_tensor->set_device_address(device_address);
|
||||
current_tensor->set_sync_status(kNeedSyncHostToDevice);
|
||||
|
|
Loading…
Reference in New Issue