!27977 The Transpose of the DynamicGRUV2 operator does not need to be executed

Merge pull request !27977 from caifubi/master-test-dynamic-gruv2
i-robot 2021-12-22 06:41:13 +00:00 committed by Gitee
commit ebae4ac5ec
3 changed files with 35 additions and 5 deletions
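
In short: the Transpose that this pass inserts for the HWCN weight format conversion (used by DynamicRNN/DynamicGRUV2) only changes the shape, so launching its kernel is wasted work. The pass now tags that Transpose with a "nop_op" attribute, PreprocessBeforeRunSingleOpGraph collects tagged nodes into nop_op_to_memcpy_, and LaunchKernel serves them with an async device-to-device copy instead of a kernel launch. A toy model of that three-step flow, with hypothetical stand-in types rather than MindSpore APIs:

// Toy model (hypothetical names, not MindSpore APIs):
// 1) tag the shape-only Transpose, 2) collect tagged nodes, 3) copy instead of launching.
#include <map>
#include <set>
#include <string>
#include <vector>

struct ToyNode { std::string name; std::map<std::string, bool> attrs; };

int main() {
  ToyNode transpose{"Transpose", {}};
  transpose.attrs["nop_op"] = true;  // 1) mark as shape-only

  std::vector<ToyNode *> execution_order{&transpose};
  std::set<ToyNode *> nop_op_to_memcpy;
  for (ToyNode *node : execution_order) {
    if (node->attrs.count("nop_op") != 0) {
      nop_op_to_memcpy.insert(node);  // 2) collect
    }
  }

  std::vector<float> input{1.0f, 2.0f, 3.0f};
  std::vector<float> output(input.size());
  for (ToyNode *node : execution_order) {
    if (nop_op_to_memcpy.count(node) != 0) {
      output = input;  // 3) the "launch" degenerates to a plain copy
    }
  }
  return 0;
}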

View File

@@ -59,6 +59,7 @@ CNodePtr Insert(const FuncGraphPtr &func_graph, const CNodePtr &cnode) {
NewTransOpNode(func_graph, AnfAlgo::GetInputNode(transdata_node->cast<CNodePtr>(), 0), cnode, kernel_select,
false, prim::kPrimTranspose->name(), std::vector<int64_t>{2, 3, 1, 0});
MS_EXCEPTION_IF_NULL(new_transpose_node);
// This Transpose operator only changes the shape; it is not expected to change the data layout.
AnfAlgo::SetNodeAttr("nop_op", MakeValue(true), new_transpose_node);
RefreshKernelBuildInfo(input_format, kOpFormat_HWCN, new_transpose_node);
// trans hwcn to output_format
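
The "nop_op" attribute set above is the hand-off to the runtime: the AscendDeviceContext changes below check for it and, when present, replace the kernel launch with a memcpy. A short sketch of that round trip, a fragment rather than a compilable unit, using only the two calls that appear in this diff (transpose_cnode stands in for the CNode created by this pass):

// Producer side (this pass): mark the Transpose as shape-only.
AnfAlgo::SetNodeAttr("nop_op", MakeValue(true), transpose_cnode);

// Consumer side (AscendDeviceContext): if the attribute is present, remember the node
// so LaunchKernel can redirect it to MemoryCopyAsync instead of launching the kernel.
if (AnfAlgo::HasNodeAttr("nop_op", transpose_cnode)) {
  nop_op_to_memcpy_.insert(transpose_cnode);
}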

View File

@@ -18,6 +18,7 @@
#include <algorithm>
#include <set>
#include <unordered_map>
#include "acl/acl_rt.h"
#include "backend/optimizer/ascend/ascend_backend_optimization.h"
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
#include "utils/context/graph_kernel_flags.h"
@@ -633,8 +634,9 @@ void AscendDeviceContext::OptimizeSingleOpGraph(const KernelGraphPtr &graph) con
void AscendDeviceContext::PreprocessBeforeRunSingleOpGraph(const KernelGraphPtr &graph) const {
MS_EXCEPTION_IF_NULL(graph);
const auto &nodes = graph->execution_order();
// Remove placeholder
for (const auto &node : nodes) {
// Remove placeholder
auto op_name = AnfAlgo::GetCNodeName(node);
static const std::set<std::string> place_holder_nodes = {kDynamicRNNOpName, kDynamicGRUV2OpName};
auto iter = place_holder_nodes.find(op_name);
@@ -652,6 +654,11 @@ void AscendDeviceContext::PreprocessBeforeRunSingleOpGraph(const KernelGraphPtr
}
node->set_inputs(new_inputs);
}
// Record the nop ops whose launch should be replaced by a memcpy
if (op_name == prim::kPrimTranspose->name() && AnfAlgo::HasNodeAttr("nop_op", node)) {
nop_op_to_memcpy_.insert(node);
}
}
device::ascend::InsertAtomicCleanOps(nodes, &node_atomics_persistent_cache_);
@@ -677,7 +684,7 @@ std::shared_ptr<Bucket> AscendDeviceContext::CreateBucket(uint32_t bucket_id, ui
return bucket;
}
bool AscendDeviceContext::SyncRuning() const {
bool AscendDeviceContext::PySyncRuning() const {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
if ((ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) &&
@@ -687,6 +694,23 @@ bool AscendDeviceContext::SyncRuning() const {
return true;
}
bool AscendDeviceContext::MemoryCopyAsync(const CNodePtr &node, const vector<AddressPtr> &inputs,
const vector<AddressPtr> &outputs) const {
if (inputs.size() != 1 || outputs.size() != 1) {
MS_LOG(ERROR) << "Kernel " << node->fullname_with_scope() << " input output size should be 1 but"
<< " input size is:" << inputs.size() << " output size is:" << outputs.size();
return false;
}
aclError status = aclrtMemcpyAsync(outputs[0]->addr, outputs[0]->size, inputs[0]->addr, inputs[0]->size,
ACL_MEMCPY_DEVICE_TO_DEVICE, compute_stream_);
if (status != ACL_ERROR_NONE) {
MS_LOG(ERROR) << "MemCpyAsync op aclrtMemcpyAsync failed, ret:" << status;
return false;
}
return PySyncRuning();
}
bool AscendDeviceContext::LaunchKernel(const CNodePtr &kernel, const vector<AddressPtr> &inputs,
const vector<AddressPtr> &workspace, const vector<AddressPtr> &outputs,
bool is_dynamic_shape) const {
@@ -712,7 +736,7 @@ bool AscendDeviceContext::LaunchKernel(const CNodePtr &kernel, const vector<Addr
dynamic_kernel->UpdateArgs();
dynamic_kernel->Execute();
dynamic_kernel->PostExecute();
return SyncRuning();
return PySyncRuning();
}
std::vector<AddressPtr> real_inputs;
@@ -732,13 +756,16 @@ bool AscendDeviceContext::LaunchKernel(const CNodePtr &kernel, const vector<Addr
}
std::lock_guard<std::mutex> locker(launch_mutex_);
if (nop_op_to_memcpy_.find(kernel) != nop_op_to_memcpy_.end()) {
return MemoryCopyAsync(kernel, real_inputs, outputs);
}
auto ret = kernel_mod->Launch(real_inputs, workspace, outputs, compute_stream_);
if (!ret) {
MS_LOG(ERROR) << "Launch kernel failed, kernel full name: " << kernel->fullname_with_scope();
return false;
}
return SyncRuning();
return PySyncRuning();
}
void AscendDeviceContext::BindDeviceToCurrentThread() const {
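
For context on the new MemoryCopyAsync: aclrtMemcpyAsync only enqueues the copy on the given stream and returns immediately, which is why the method still ends with PySyncRuning so the PyNative path can synchronize when required. A small standalone sketch of the same call pattern (hypothetical helper name and buffers; assumes ACL has been initialized and a stream created):

#include <cstddef>
#include "acl/acl_rt.h"

// Copy `size` bytes between two device buffers on `stream`, then wait for completion.
// MemoryCopyAsync above only does the enqueue step and leaves synchronization to
// PySyncRuning / normal stream ordering.
bool DeviceToDeviceCopy(void *dst, const void *src, size_t size, aclrtStream stream) {
  aclError status = aclrtMemcpyAsync(dst, size, src, size, ACL_MEMCPY_DEVICE_TO_DEVICE, stream);
  if (status != ACL_ERROR_NONE) {
    return false;
  }
  // Only needed when the caller must observe the copied data right away.
  return aclrtSynchronizeStream(stream) == ACL_ERROR_NONE;
}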

View File

@@ -136,7 +136,8 @@ class AscendDeviceContext : public DeviceContext {
void LoadModel(const NotNull<KernelGraphPtr> &root_graph) const;
void UpdateExecOrder(const KernelGraphPtr &graph) const;
static bool IsGraphMode();
bool SyncRuning() const;
bool PySyncRuning() const;
bool MemoryCopyAsync(const CNodePtr &node, const vector<AddressPtr> &inputs, const vector<AddressPtr> &outputs) const;
void ReportErrorMessage() const;
void ReportWarningMessage() const;
@@ -161,6 +162,7 @@ class AscendDeviceContext : public DeviceContext {
// Persistent cache for single op execution.
// node_atomics_ will be cleaned up in CompileGraph.
mutable std::map<CNodePtr, std::vector<CNodePtr>> node_atomics_persistent_cache_;
mutable std::set<CNodePtr> nop_op_to_memcpy_;
// Some NOP nodes have been hidden from the execution order and do not have an output device address. This function
// creates output device addresses for these nodes; the output device address is the same as the input device address.
void AssignOutputNopNodeDeviceAddress(const KernelGraphPtr &graph) const;
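
A side note on the new member: nop_op_to_memcpy_ is declared mutable, like node_atomics_persistent_cache_ just above it, because the preprocessing and launch methods of AscendDeviceContext are const and still need to populate this cache. A minimal self-contained sketch of that pattern (hypothetical class and type names):

#include <memory>
#include <set>

struct CNodeStub {};
using CNodeStubPtr = std::shared_ptr<CNodeStub>;

class DeviceContextSketch {
 public:
  // A const method can still record bookkeeping state in a mutable member.
  void Preprocess(const CNodeStubPtr &node) const { nop_op_to_memcpy_.insert(node); }

  bool ShouldMemcpy(const CNodeStubPtr &node) const { return nop_op_to_memcpy_.count(node) != 0; }

 private:
  mutable std::set<CNodeStubPtr> nop_op_to_memcpy_;
};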