!27977 The Transpose of the DynamicGRUV2 operator does not need to be executed
Merge pull request !27977 from caifubi/master-test-dynamic-gruv2
Commit ebae4ac5ec
@@ -59,6 +59,7 @@ CNodePtr Insert(const FuncGraphPtr &func_graph, const CNodePtr &cnode) {
      NewTransOpNode(func_graph, AnfAlgo::GetInputNode(transdata_node->cast<CNodePtr>(), 0), cnode, kernel_select,
                     false, prim::kPrimTranspose->name(), std::vector<int64_t>{2, 3, 1, 0});
    MS_EXCEPTION_IF_NULL(new_transpose_node);
    // This Transpose operator only changes the shape; it is not expected to change the data arrangement.
    AnfAlgo::SetNodeAttr("nop_op", MakeValue(true), new_transpose_node);
    RefreshKernelBuildInfo(input_format, kOpFormat_HWCN, new_transpose_node);
    // trans hwcn to output_format
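For context on the permutation above: a Transpose with perm {2, 3, 1, 0} maps output dimension i to input dimension perm[i], which is the shape change toward HWCN that the hunk refers to. A minimal, self-contained sketch of that mapping; PermuteShape is a hypothetical helper and the NCHW example shape is an assumption for illustration, not taken from the commit:

// Sketch only: what perm {2, 3, 1, 0} does to a shape. Only the shape is expected to
// change; the commit marks the node "nop_op" and later replaces its launch with a memcpy.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> PermuteShape(const std::vector<int64_t> &shape, const std::vector<int64_t> &perm) {
  std::vector<int64_t> out(perm.size());
  for (size_t i = 0; i < perm.size(); ++i) {
    out[i] = shape[static_cast<size_t>(perm[i])];  // output dim i comes from input dim perm[i]
  }
  return out;
}

int main() {
  std::vector<int64_t> nchw = {16, 32, 3, 3};  // assumed NCHW example shape
  for (int64_t d : PermuteShape(nchw, {2, 3, 1, 0})) {
    std::cout << d << " ";  // prints "3 3 32 16", i.e. HWCN
  }
  std::cout << std::endl;
  return 0;
}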
@@ -18,6 +18,7 @@
#include <algorithm>
#include <set>
#include <unordered_map>
#include "acl/acl_rt.h"
#include "backend/optimizer/ascend/ascend_backend_optimization.h"
#include "backend/optimizer/graph_kernel/graph_kernel_optimization.h"
#include "utils/context/graph_kernel_flags.h"
@@ -633,8 +634,9 @@ void AscendDeviceContext::OptimizeSingleOpGraph(const KernelGraphPtr &graph) con
void AscendDeviceContext::PreprocessBeforeRunSingleOpGraph(const KernelGraphPtr &graph) const {
  MS_EXCEPTION_IF_NULL(graph);
  const auto &nodes = graph->execution_order();
  // Remove placeholder

  for (const auto &node : nodes) {
    // Remove placeholder
    auto op_name = AnfAlgo::GetCNodeName(node);
    static const std::set<std::string> place_holder_nodes = {kDynamicRNNOpName, kDynamicGRUV2OpName};
    auto iter = place_holder_nodes.find(op_name);
@@ -652,6 +654,11 @@ void AscendDeviceContext::PreprocessBeforeRunSingleOpGraph(const KernelGraphPtr
      }
      node->set_inputs(new_inputs);
    }

    // Save the nop_op nodes that need a memcpy.
    if (op_name == prim::kPrimTranspose->name() && AnfAlgo::HasNodeAttr("nop_op", node)) {
      nop_op_to_memcpy_.insert(node);
    }
  }

  device::ascend::InsertAtomicCleanOps(nodes, &node_atomics_persistent_cache_);
@@ -677,7 +684,7 @@ std::shared_ptr<Bucket> AscendDeviceContext::CreateBucket(uint32_t bucket_id, ui
  return bucket;
}

bool AscendDeviceContext::SyncRuning() const {
bool AscendDeviceContext::PySyncRuning() const {
  auto ms_context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(ms_context);
  if ((ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kPynativeMode) &&
@@ -687,6 +694,23 @@ bool AscendDeviceContext::SyncRuning() const {
  return true;
}

bool AscendDeviceContext::MemoryCopyAsync(const CNodePtr &node, const vector<AddressPtr> &inputs,
                                          const vector<AddressPtr> &outputs) const {
  if (inputs.size() != 1 || outputs.size() != 1) {
    MS_LOG(ERROR) << "Kernel " << node->fullname_with_scope() << " input output size should be 1 but"
                  << " input size is:" << inputs.size() << " output size is:" << outputs.size();
    return false;
  }

  aclError status = aclrtMemcpyAsync(outputs[0]->addr, outputs[0]->size, inputs[0]->addr, inputs[0]->size,
                                     ACL_MEMCPY_DEVICE_TO_DEVICE, compute_stream_);
  if (status != ACL_ERROR_NONE) {
    MS_LOG(ERROR) << "MemCpyAsync op aclrtMemcpyAsync failed, ret:" << status;
    return false;
  }
  return PySyncRuning();
}

bool AscendDeviceContext::LaunchKernel(const CNodePtr &kernel, const vector<AddressPtr> &inputs,
                                       const vector<AddressPtr> &workspace, const vector<AddressPtr> &outputs,
                                       bool is_dynamic_shape) const {
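MemoryCopyAsync only enqueues the copy on compute_stream_; whether the caller then waits is delegated to PySyncRuning(). A standalone sketch of the same ACL runtime calls, assuming a valid stream and two device buffers of `size` bytes; the helper name is illustrative, and it synchronizes the stream explicitly instead of reusing the device context's sync policy:

// Standalone sketch, not part of the commit: a device-to-device copy with the ACL runtime.
#include <cstddef>
#include "acl/acl_rt.h"

bool CopyDeviceToDevice(void *dst, const void *src, size_t size, aclrtStream stream) {
  // Enqueue an asynchronous device-to-device copy on the given stream.
  aclError ret = aclrtMemcpyAsync(dst, size, src, size, ACL_MEMCPY_DEVICE_TO_DEVICE, stream);
  if (ret != ACL_ERROR_NONE) {
    return false;
  }
  // Block until the copy finishes; the commit instead defers synchronization to PySyncRuning().
  return aclrtSynchronizeStream(stream) == ACL_ERROR_NONE;
}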
@@ -712,7 +736,7 @@ bool AscendDeviceContext::LaunchKernel(const CNodePtr &kernel, const vector<Addr
    dynamic_kernel->UpdateArgs();
    dynamic_kernel->Execute();
    dynamic_kernel->PostExecute();
    return SyncRuning();
    return PySyncRuning();
  }

  std::vector<AddressPtr> real_inputs;
@@ -732,13 +756,16 @@ bool AscendDeviceContext::LaunchKernel(const CNodePtr &kernel, const vector<Addr
  }

  std::lock_guard<std::mutex> locker(launch_mutex_);
  if (nop_op_to_memcpy_.find(kernel) != nop_op_to_memcpy_.end()) {
    return MemoryCopyAsync(kernel, real_inputs, outputs);
  }
  auto ret = kernel_mod->Launch(real_inputs, workspace, outputs, compute_stream_);
  if (!ret) {
    MS_LOG(ERROR) << "Launch kernel failed, kernel full name: " << kernel->fullname_with_scope();
    return false;
  }

  return SyncRuning();
  return PySyncRuning();
}

void AscendDeviceContext::BindDeviceToCurrentThread() const {
@@ -136,7 +136,8 @@ class AscendDeviceContext : public DeviceContext {
  void LoadModel(const NotNull<KernelGraphPtr> &root_graph) const;
  void UpdateExecOrder(const KernelGraphPtr &graph) const;
  static bool IsGraphMode();
  bool SyncRuning() const;
  bool PySyncRuning() const;
  bool MemoryCopyAsync(const CNodePtr &node, const vector<AddressPtr> &inputs, const vector<AddressPtr> &outputs) const;

  void ReportErrorMessage() const;
  void ReportWarningMessage() const;
@@ -161,6 +162,7 @@ class AscendDeviceContext : public DeviceContext {
  // Persistent cache for single op execution.
  // node_atomics_ will be cleaned up in CompileGraph.
  mutable std::map<CNodePtr, std::vector<CNodePtr>> node_atomics_persistent_cache_;
  mutable std::set<CNodePtr> nop_op_to_memcpy_;
  // Some NOP nodes are hidden from the execution order and have no output device address. This function creates
  // output device addresses for these nodes; each output device address is the same as the corresponding input
  // device address.
  void AssignOutputNopNodeDeviceAddress(const KernelGraphPtr &graph) const;
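The new nop_op_to_memcpy_ member is mutable because the methods that fill and read it (PreprocessBeforeRunSingleOpGraph, LaunchKernel) are const member functions, matching the existing node_atomics_persistent_cache_. A minimal standard-C++ sketch of that pattern; the class and names below are illustrative, not from the commit:

// Sketch only: why a cache touched from const member functions must be mutable.
#include <set>

class LaunchCache {
 public:
  // Both methods are const, mirroring the device context's API; the insert is
  // still allowed because the member below is declared mutable.
  void Remember(int node_id) const { nop_nodes_.insert(node_id); }
  bool IsNop(int node_id) const { return nop_nodes_.count(node_id) != 0; }

 private:
  mutable std::set<int> nop_nodes_;
};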