replace memcpy_async with tensor move

laiyongqiang 2021-04-15 11:24:52 +08:00
parent e349e5523f
commit 1533435015
17 changed files with 199 additions and 199 deletions

View File

@@ -100,13 +100,13 @@
#include "backend/optimizer/ascend/buffer_fusion/reduce_eltwise_fusion_pass.h"
#include "backend/optimizer/ascend/buffer_fusion/segment_eltwise_fusion_pass.h"
#include "backend/optimizer/ascend/format_type/deal_ref_and_split_unsupported_transdata.h"
#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_hccl_op.h"
#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_cascade.h"
#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.h"
#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_cascade.h"
#include "backend/optimizer/ascend/enhancer/insert_pad_for_nms_with_mask.h"
#include "backend/optimizer/ascend/format_type/insert_transdata_for_runop.h"
#include "backend/optimizer/ascend/enhancer/getnext_memcpy_elimination.h"
#include "backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.h"
#include "backend/optimizer/ascend/ir_fission/addn_fission.h"
#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_getnext.h"
#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.h"
#include "backend/optimizer/ascend/ir_fission/batch_norm_grad_infer_fission.h"
#include "backend/optimizer/ascend/ir_fission/split_fission.h"
#include "backend/optimizer/ascend/ir_fission/splitv_fission.h"
@@ -292,11 +292,11 @@ void AscendBackendIRFusionOptimization(const std::shared_ptr<session::KernelGrap
if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) && context_ptr->get_param<bool>(MS_CTX_ENABLE_LOOP_SINK) &&
ConfigManager::GetInstance().iter_num() > 1) {
ir_fusion_pm->AddPass(std::make_shared<InsertMemcpyAsyncForGetNext>());
ir_fusion_pm->AddPass(std::make_shared<InsertTensorMoveForGetNext>());
ir_fusion_pm->AddPass(std::make_shared<GetitemTuple>());
ir_fusion_pm->AddPass(std::make_shared<EraseVisitAttr>());
}
ir_fusion_pm->AddPass(std::make_shared<InsertMemcpyAsyncForHcclOp>());
ir_fusion_pm->AddPass(std::make_shared<InsertTensorMoveForHcclOp>());
ir_fusion_pm->AddPass(std::make_shared<InsertTranspose>());
ir_fusion_pm->AddPass(std::make_shared<GetitemTuple>());
ir_fusion_pm->AddPass(std::make_shared<EraseVisitAttr>());
@@ -370,7 +370,7 @@ void AscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kern
other_pm->AddPass(std::make_shared<ReduceScatterFusion>());
other_pm->AddPass(std::make_shared<SplitInputsForReduceScatter>());
other_pm->AddPass(std::make_shared<BroadcastFusion>());
other_pm->AddPass(std::make_shared<InsertMemcpyAsyncForCascade>());
other_pm->AddPass(std::make_shared<InsertTensorMoveForCascade>());
other_pm->AddPass(std::make_shared<ParameterTransOpFusion>());
other_pm->AddPass(std::make_shared<RefreshParameterFormat>());
other_pm->AddPass(std::make_shared<SplitOpOptimizer>());
@@ -387,7 +387,7 @@ void AscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kern
other2_pm->AddPass(std::make_shared<CommonSubexpressionElimination>());
if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) && context_ptr->get_param<bool>(MS_CTX_ENABLE_LOOP_SINK) &&
ConfigManager::GetInstance().iter_num() > 1) {
other2_pm->AddPass(std::make_shared<GetnextMemcpyElimination>());
other2_pm->AddPass(std::make_shared<GetnextTensorMoveElimination>());
}
other2_pm->AddPass(std::make_shared<CheckConsistency>());
optimizer2->AddPassManager(other2_pm);
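
For orientation before the per-pass files below: the changes in this file only swap which pass objects get registered; the relative order of passes is untouched, and that order is what the PassManager executes. A minimal self-contained sketch of the registration pattern, using toy Pass/PassManager/Graph stand-ins rather than MindSpore's real classes:

#include <iostream>
#include <memory>
#include <string>
#include <vector>

struct Graph { std::vector<std::string> log; };

struct Pass {
  explicit Pass(std::string name) : name_(std::move(name)) {}
  virtual ~Pass() = default;
  virtual void Run(Graph *g) { g->log.push_back(name_); }  // real passes rewrite the graph
  std::string name_;
};

struct PassManager {
  void AddPass(std::shared_ptr<Pass> pass) { passes_.push_back(std::move(pass)); }
  void Run(Graph *g) {
    for (auto &pass : passes_) pass->Run(g);  // insertion order is execution order
  }
  std::vector<std::shared_ptr<Pass>> passes_;
};

int main() {
  Graph g;
  PassManager ir_fusion_pm;
  // Same relative order as the ir_fusion_pm hunk above.
  ir_fusion_pm.AddPass(std::make_shared<Pass>("InsertTensorMoveForGetNext"));
  ir_fusion_pm.AddPass(std::make_shared<Pass>("GetitemTuple"));
  ir_fusion_pm.AddPass(std::make_shared<Pass>("EraseVisitAttr"));
  ir_fusion_pm.AddPass(std::make_shared<Pass>("InsertTensorMoveForHcclOp"));
  ir_fusion_pm.Run(&g);
  for (const auto &name : g.log) std::cout << name << "\n";
  return 0;
}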

View File

@@ -383,10 +383,10 @@ CNodePtr InsertCastForInput(const FuncGraphPtr &func_graph, const CNodePtr &cnod
return new_node;
}
AnfNodePtr CreateMemcpyAsyncOp(const FuncGraphPtr &graph, const AnfNodePtr &node) {
AnfNodePtr CreateTensorMoveOp(const FuncGraphPtr &graph, const AnfNodePtr &node) {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(node);
auto prim = std::make_shared<Primitive>(kMemCpyAsyncOpName);
auto prim = std::make_shared<Primitive>(kTensorMoveOpName);
std::vector<AnfNodePtr> new_node_inputs = {NewValueNode(prim), node};
auto new_node = graph->NewCNode(new_node_inputs);
MS_EXCEPTION_IF_NULL(new_node);
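
Every insertion pass in this commit creates its copy node through this one renamed helper. A self-contained sketch of the same construction shape, with toy Node/Graph stand-ins for AnfNode/FuncGraph (TensorMove is modeled as a one-input op whose output is a fresh buffer):

#include <memory>
#include <string>
#include <vector>

// Stand-ins for AnfNodePtr / FuncGraphPtr; the real types live in MindSpore.
struct Node {
  std::string op;                           // primitive name, e.g. "TensorMove"
  std::vector<std::shared_ptr<Node>> inputs;
};
using NodePtr = std::shared_ptr<Node>;

struct Graph {
  NodePtr NewCNode(std::string op, std::vector<NodePtr> inputs) {
    auto node = std::make_shared<Node>();
    node->op = std::move(op);
    node->inputs = std::move(inputs);
    return node;
  }
};

// Mirrors CreateTensorMoveOp above: wrap `input` in a single-input copy node.
NodePtr CreateTensorMove(Graph *graph, const NodePtr &input) {
  if (graph == nullptr || input == nullptr) return nullptr;  // real code throws via MS_EXCEPTION_IF_NULL
  return graph->NewCNode("TensorMove", {input});
}

The real helper additionally attaches abstracts and leaves kernel selection to the caller, as the cascade pass further down shows.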

View File

@@ -108,7 +108,7 @@ AnfNodePtr InsertTransOpForOutput(const FuncGraphPtr &func_graph, const AnfNodeP
CNodePtr InsertCastForInput(const FuncGraphPtr &func_graph, const CNodePtr &cnode);
AnfNodePtr CreateMemcpyAsyncOp(const FuncGraphPtr &graph, const AnfNodePtr &node);
AnfNodePtr CreateTensorMoveOp(const FuncGraphPtr &graph, const AnfNodePtr &node);
AnfNodePtr AddTransOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const KernelSelectPtr &kernel_select, size_t insert_index, bool is_insert_input);

View File

@@ -14,49 +14,49 @@
* limitations under the License.
*/
#include "backend/optimizer/ascend/enhancer/getnext_memcpy_elimination.h"
#include "backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.h"
#include <memory>
#include "backend/session/anf_runtime_algorithm.h"
#include "frontend/optimizer/opt.h"
namespace mindspore::opt {
const BaseRef GetnextMemcpyElimination::DefinePattern() const {
auto prim_memcpy = std::make_shared<Primitive>(kMemCpyAsyncOpName);
const BaseRef GetnextTensorMoveElimination::DefinePattern() const {
auto prim_tensor_move = std::make_shared<Primitive>(kTensorMoveOpName);
VarPtr x = std::make_shared<SeqVar>();
VectorRef memcpy_async({prim_memcpy, x});
return memcpy_async;
VectorRef tensor_move({prim_tensor_move, x});
return tensor_move;
}
const AnfNodePtr GetnextMemcpyElimination::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
const AnfNodePtr GetnextTensorMoveElimination::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
const EquivPtr &equiv) const {
if (graph == nullptr || node == nullptr || equiv == nullptr) {
return nullptr;
}
auto memcpy_cnode = node->cast<CNodePtr>();
if (memcpy_cnode == nullptr) {
auto tensor_move_node = node->cast<CNodePtr>();
if (tensor_move_node == nullptr) {
return nullptr;
}
// 1. memcpy has attr kAttrLabelForInsertStreamActive
if (!AnfAlgo::HasNodeAttr(kAttrLabelForInsertStreamActive, memcpy_cnode)) {
// 1. tensor move has attr kAttrLabelForInsertStreamActive
if (!AnfAlgo::HasNodeAttr(kAttrLabelForInsertStreamActive, tensor_move_node)) {
MS_LOG(DEBUG) << "node has no label_for_insert_stream_active attr";
return nullptr;
}
// 2. memcpy's output has only one user next_node
// 2. tensor move's output has only one user next_node
auto manager = graph->manager();
MS_EXCEPTION_IF_NULL(manager);
if (manager->node_users().find(memcpy_cnode) == manager->node_users().end()) {
MS_LOG(EXCEPTION) << "memcpy has no output in manager";
if (manager->node_users().find(tensor_move_node) == manager->node_users().end()) {
MS_LOG(EXCEPTION) << "tensor move has no output in manager";
}
auto next_nodes = manager->node_users()[memcpy_cnode];
auto next_nodes = manager->node_users()[tensor_move_node];
if (next_nodes.size() > 1) {
MS_LOG(DEBUG) << "node's output has more than one users";
return nullptr;
}
// 3. next_node is not nop node, not graph output and it has only one input which is memcpy's output
// 3. next_node is not nop node, not graph output and it has only one input which is tensor move's output
for (auto &item : next_nodes) {
auto next_node = item.first->cast<CNodePtr>();
if (opt::IsNopNode(next_node)) {
@@ -77,6 +77,6 @@ const AnfNodePtr GetnextMemcpyElimination::Process(const FuncGraphPtr &graph, co
AnfAlgo::SetNodeAttr(kAttrLabelForInsertStreamActive, MakeValue(true), next_node);
}
return memcpy_cnode->input(1);
return tensor_move_node->input(1);
}
} // namespace mindspore::opt
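
The numbered comments in Process() spell out when the TensorMove after GetNext can be dropped: it must carry the stream-active label, feed exactly one user, and that user must be a real single-input kernel; only then is it bypassed by returning its input. A toy restatement of those checks with plain structs instead of CNode/FuncGraphManager (the "next_node is not a graph output" part of check 3 is elided):

#include <vector>

// Stand-in node: fields model the attr/manager queries used above.
struct Node {
  bool label_for_insert_stream_active = false;
  bool is_nop = false;
  std::vector<Node *> inputs;
  std::vector<Node *> users;
};

// Returns the node that should replace the TensorMove (its single input),
// or nullptr when the TensorMove has to stay.
Node *TryEliminateGetNextTensorMove(Node *tensor_move) {
  if (tensor_move == nullptr || tensor_move->inputs.empty()) return nullptr;
  // 1. must have been inserted with the stream-active label
  if (!tensor_move->label_for_insert_stream_active) return nullptr;
  // 2. its output must have exactly one user
  if (tensor_move->users.size() != 1) return nullptr;
  // 3. that user must be a real single-input kernel, not a nop node
  Node *next_node = tensor_move->users.front();
  if (next_node == nullptr || next_node->is_nop || next_node->inputs.size() != 1) return nullptr;
  return tensor_move->inputs.front();
}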

View File

@@ -13,21 +13,21 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_MEMCPY_ELIMINATION_H
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_MEMCPY_ELIMINATION_H
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_TENSORMOVE_ELIMINATION_H
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_TENSORMOVE_ELIMINATION_H
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class GetnextMemcpyElimination : public PatternProcessPass {
class GetnextTensorMoveElimination : public PatternProcessPass {
public:
explicit GetnextMemcpyElimination(bool multigraph = true)
: PatternProcessPass("getnext_memcpy_elimination", multigraph) {}
~GetnextMemcpyElimination() override = default;
explicit GetnextTensorMoveElimination(bool multigraph = true)
: PatternProcessPass("getnext_tensormove_elimination", multigraph) {}
~GetnextTensorMoveElimination() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_MEMCPY_ELIMINATION_H
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_TENSORMOVE_ELIMINATION_H

View File

@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_cascade.h"
#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_cascade.h"
#include <vector>
#include "utils/utils.h"
#include "backend/session/anf_runtime_algorithm.h"
@@ -69,36 +69,36 @@ bool IsPartOutputsOfHcclOp(const AnfNodePtr &node, const CNodePtr &cur_hccl, con
}
} // namespace
AnfNodePtr InsertMemcpyAsyncForCascade::InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const {
AnfNodePtr InsertTensorMoveForCascade::InsertTensorMove(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(hccl_node);
std::vector<AnfNodePtr> memcpy_async_list;
std::vector<AnfNodePtr> tensor_move_list;
std::vector<AnfNodePtr> new_inputs = {hccl_node->input(0)};
for (size_t i = 1; i < hccl_node->size(); ++i) {
auto input = hccl_node->input(i);
MS_EXCEPTION_IF_NULL(input);
// when input is also a hccl op and just part outputs of it linking with cur_hccl_op
if (IsPartOutputsOfHcclOp(input, hccl_node, graph)) {
auto memcpy_async = CreateMemcpyAsyncOp(graph, input);
if (memcpy_async == nullptr) {
MS_LOG(EXCEPTION) << "Create memcpy_async op failed."
auto tensor_move = CreateTensorMoveOp(graph, input);
if (tensor_move == nullptr) {
MS_LOG(EXCEPTION) << "Create tensor_move op failed."
<< " trace: " << trace::DumpSourceLines(hccl_node);
}
if (AnfAlgo::IsNodeDynamicShape(input)) {
AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), memcpy_async);
AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), tensor_move);
}
auto kernel_info = std::make_shared<device::KernelInfo>();
memcpy_async->set_kernel_info(kernel_info);
tensor_move->set_kernel_info(kernel_info);
MS_EXCEPTION_IF_NULL(kernel_select_);
kernel_select_->SelectKernel(memcpy_async->cast<CNodePtr>());
new_inputs.push_back(memcpy_async);
memcpy_async_list.push_back(memcpy_async);
kernel_select_->SelectKernel(tensor_move->cast<CNodePtr>());
new_inputs.push_back(tensor_move);
tensor_move_list.push_back(tensor_move);
} else {
new_inputs.push_back(input);
}
}
if (!memcpy_async_list.empty()) {
if (!tensor_move_list.empty()) {
CNodePtr new_hccl_node = std::make_shared<CNode>(*hccl_node);
new_hccl_node->set_inputs(new_inputs);
return new_hccl_node;
@@ -106,8 +106,8 @@ AnfNodePtr InsertMemcpyAsyncForCascade::InsertMemcpyAsync(const FuncGraphPtr &gr
return nullptr;
}
const AnfNodePtr InsertMemcpyAsyncForCascade::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &) const {
const AnfNodePtr InsertTensorMoveForCascade::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &) const {
if (func_graph == nullptr || node == nullptr || !node->isa<CNode>()) {
return nullptr;
}
@@ -115,7 +115,7 @@ const AnfNodePtr InsertMemcpyAsyncForCascade::Process(const FuncGraphPtr &func_g
if (!AnfAlgo::IsCommunicationOp(node)) {
return nullptr;
}
return InsertMemcpyAsync(func_graph, cnode);
return InsertTensorMove(func_graph, cnode);
}
} // namespace opt
} // namespace mindspore
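
The cascade pass rebuilds the HCCL node's input list, wrapping only inputs that are themselves communication ops whose outputs are partially consumed, and returns a replacement node only when something changed. A sketch of that loop shape with toy types; IsPartOutputsOfHcclOp is reduced to a stub, where the real check walks TupleGetItem users:

#include <memory>
#include <string>
#include <vector>

struct Node {
  std::string op;
  bool is_hccl = false;
  std::vector<std::shared_ptr<Node>> inputs;  // data inputs only; primitive input(0) elided
};
using NodePtr = std::shared_ptr<Node>;

// Stub for IsPartOutputsOfHcclOp in the hunk above.
bool IsPartOutputsOfHcclOp(const NodePtr &input, const NodePtr & /*cur_hccl*/) {
  return input != nullptr && input->is_hccl;  // simplified
}

// Shape of InsertTensorMove above: rebuild the input list, wrapping the
// qualifying inputs, and return a replacement node only if anything changed.
NodePtr InsertTensorMoveForCascade(const NodePtr &hccl_node) {
  bool changed = false;
  std::vector<NodePtr> new_inputs;
  for (const auto &input : hccl_node->inputs) {
    if (IsPartOutputsOfHcclOp(input, hccl_node)) {
      auto tensor_move = std::make_shared<Node>();
      tensor_move->op = "TensorMove";
      tensor_move->inputs = {input};
      new_inputs.push_back(tensor_move);
      changed = true;
    } else {
      new_inputs.push_back(input);
    }
  }
  if (!changed) return nullptr;  // pass leaves the node alone
  auto new_hccl = std::make_shared<Node>(*hccl_node);
  new_hccl->inputs = std::move(new_inputs);
  return new_hccl;
}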

View File

@@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_CASCADE_H_
#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_CASCADE_H_
#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_TENSORMOVE_ASYNC_FOR_CASCADE_H_
#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_TENSORMOVE_ASYNC_FOR_CASCADE_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
@@ -22,18 +22,18 @@
namespace mindspore {
namespace opt {
class InsertMemcpyAsyncForCascade : public PatternProcessPass {
class InsertTensorMoveForCascade : public PatternProcessPass {
public:
explicit InsertMemcpyAsyncForCascade(bool multigraph = true)
: PatternProcessPass("insert_memcpy_async_for_cascade", multigraph),
explicit InsertTensorMoveForCascade(bool multigraph = true)
: PatternProcessPass("insert_tensor_move_for_cascade", multigraph),
kernel_select_(std::make_shared<KernelSelect>()) {}
~InsertMemcpyAsyncForCascade() override = default;
~InsertTensorMoveForCascade() override = default;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
private:
AnfNodePtr InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const;
AnfNodePtr InsertTensorMove(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const;
KernelSelectPtr kernel_select_;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_OP_CASCADE_H_
#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_TENSORMOVE_ASYNC_FOR_OP_CASCADE_H_

View File

@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_getnext.h"
#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.h"
#include <vector>
#include <memory>
#include "backend/optimizer/ascend/ascend_helper.h"
@@ -22,14 +22,14 @@
namespace mindspore {
namespace opt {
AnfNodePtr InsertMemcpyAsyncForGetNextOutputs(const FuncGraphPtr &func_graph, const AnfNodePtr &node) {
AnfNodePtr InsertTensorMoveForGetNextOutputs(const FuncGraphPtr &func_graph, const AnfNodePtr &node) {
if (func_graph == nullptr || node == nullptr) {
return nullptr;
}
size_t output_num = AnfAlgo::GetOutputTensorNum(node);
if (output_num == 0) {
MS_LOG(DEBUG) << "Output number is zero, no need to insert memcpy_async!";
MS_LOG(DEBUG) << "Output number is zero, no need to insert tensor_move!";
return node;
}
@@ -39,9 +39,9 @@ AnfNodePtr InsertMemcpyAsyncForGetNextOutputs(const FuncGraphPtr &func_graph, co
for (size_t output_index = 0; output_index < output_num; ++output_index) {
auto tuple_get_item = CreatTupleGetItemNode(func_graph, node, output_index);
auto new_node = CreateMemcpyAsyncOp(func_graph, tuple_get_item);
auto new_node = CreateTensorMoveOp(func_graph, tuple_get_item);
if (new_node == nullptr) {
MS_LOG(EXCEPTION) << "Create memcpy_async op failed!";
MS_LOG(EXCEPTION) << "Create tensor move op failed!";
}
if (AnfAlgo::IsNodeDynamicShape(tuple_get_item)) {
AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), new_node);
@@ -53,15 +53,15 @@ AnfNodePtr InsertMemcpyAsyncForGetNextOutputs(const FuncGraphPtr &func_graph, co
return make_tuple;
}
const BaseRef InsertMemcpyAsyncForGetNext::DefinePattern() const {
const BaseRef InsertTensorMoveForGetNext::DefinePattern() const {
std::shared_ptr<Var> Xs = std::make_shared<SeqVar>();
auto prim = std::make_shared<Primitive>(kGetNextOpName);
return VectorRef({prim, Xs});
}
const AnfNodePtr InsertMemcpyAsyncForGetNext::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &) const {
const AnfNodePtr InsertTensorMoveForGetNext::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &) const {
if (func_graph == nullptr || node == nullptr || !AnfAlgo::IsRealKernel(node)) {
return nullptr;
}
@@ -73,7 +73,7 @@ const AnfNodePtr InsertMemcpyAsyncForGetNext::Process(const FuncGraphPtr &func_g
}
AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), cnode);
return InsertMemcpyAsyncForGetNextOutputs(func_graph, cnode);
return InsertTensorMoveForGetNextOutputs(func_graph, cnode);
}
} // namespace opt
} // namespace mindspore

View File

@@ -14,22 +14,22 @@
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_GETNEXT_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_GETNEXT_H_
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_GETNEXT_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_GETNEXT_H_
#include "backend/optimizer/common/optimizer.h"
namespace mindspore {
namespace opt {
class InsertMemcpyAsyncForGetNext : public PatternProcessPass {
class InsertTensorMoveForGetNext : public PatternProcessPass {
public:
explicit InsertMemcpyAsyncForGetNext(bool multigraph = true)
: PatternProcessPass("insert_memcpy_async_for_getnext", multigraph) {}
~InsertMemcpyAsyncForGetNext() override = default;
explicit InsertTensorMoveForGetNext(bool multigraph = true)
: PatternProcessPass("insert_tensor_move_for_getnext", multigraph) {}
~InsertTensorMoveForGetNext() override = default;
const BaseRef DefinePattern() const override;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_GETNEXT_H_
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_GETNEXT_H_

View File

@@ -13,7 +13,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_hccl_op.h"
#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.h"
#include <vector>
#include <set>
#include <string>
@@ -25,9 +25,9 @@
namespace mindspore {
namespace opt {
namespace {
// insert memcpy for some cnode even if not a Ref cnode
const std::set<std::string> kNeedInsertMemcpyOpSet = {kLambNextMVOpName, kLambNextMVWithDecayOpName,
kLambUpdateWithLROpName};
// insert tensormove for some cnode even if not a Ref cnode
const std::set<std::string> kNeedInsertTensorMoveOpSet = {kLambNextMVOpName, kLambNextMVWithDecayOpName,
kLambUpdateWithLROpName};
bool IsParameterOrValueNode(const AnfNodePtr &node) {
MS_EXCEPTION_IF_NULL(node);
@@ -43,7 +43,7 @@ bool IsParameterOrValueNode(const AnfNodePtr &node) {
// NodeUsersMap, for node B input i use node A, it will be one item in map with key: A, and value: (B, i)
bool IsNodeOutPutUsedByOtherRealKernel(const AnfNodeIndexSet &node_users) {
if (node_users.size() == 1) {
MS_LOG(INFO) << "This node only used once, no need to insert memcpy node.";
MS_LOG(INFO) << "This node only used once, no need to insert tensormove node.";
return false;
}
for (const auto &node_pair : node_users) {
@@ -53,13 +53,13 @@ bool IsNodeOutPutUsedByOtherRealKernel(const AnfNodeIndexSet &node_users) {
return true;
}
}
MS_LOG(INFO) << "This node used by other node, but the node is not real kernel, no need to insert memcpy node.";
MS_LOG(INFO) << "This node used by other node, but the node is not real kernel, no need to insert tensormove node.";
return false;
}
} // namespace
bool InsertMemcpyAsyncForHcclOp::NeedInsertMemcpy(const FuncGraphPtr &graph, const AnfNodePtr &input,
const CNodePtr &cur_node) const {
bool InsertTensorMoveForHcclOp::NeedInsertTensorMove(const FuncGraphPtr &graph, const AnfNodePtr &input,
const CNodePtr &cur_node) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(input);
MS_EXCEPTION_IF_NULL(cur_node);
@@ -79,7 +79,7 @@ bool InsertMemcpyAsyncForHcclOp::NeedInsertMemcpy(const FuncGraphPtr &graph, con
}
// when input is some special cnodes
if (kNeedInsertMemcpyOpSet.find(AnfAlgo::GetCNodeName(input)) != kNeedInsertMemcpyOpSet.end()) {
if (kNeedInsertTensorMoveOpSet.find(AnfAlgo::GetCNodeName(input)) != kNeedInsertTensorMoveOpSet.end()) {
return true;
}
@@ -96,29 +96,29 @@ bool InsertMemcpyAsyncForHcclOp::NeedInsertMemcpy(const FuncGraphPtr &graph, con
return false;
}
void InsertMemcpyAsyncForHcclOp::InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const {
void InsertTensorMoveForHcclOp::InsertTensorMove(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(hccl_node);
bool need_memcpy_async = false;
bool need_tensor_move_async = false;
std::vector<AnfNodePtr> new_inputs = {hccl_node->input(0)};
for (size_t i = 1; i < hccl_node->size(); ++i) {
auto input = hccl_node->input(i);
if (NeedInsertMemcpy(graph, input, hccl_node)) {
auto memcpy_async = CreateMemcpyAsyncOp(graph, input);
if (memcpy_async == nullptr) {
MS_LOG(EXCEPTION) << "Create memcpy_async op failed.";
if (NeedInsertTensorMove(graph, input, hccl_node)) {
auto tensor_move = CreateTensorMoveOp(graph, input);
if (tensor_move == nullptr) {
MS_LOG(EXCEPTION) << "Create tensor_move op failed.";
}
if (input->isa<CNode>() && AnfAlgo::IsNodeDynamicShape(input)) {
AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), memcpy_async);
AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), tensor_move);
}
new_inputs.push_back(memcpy_async);
need_memcpy_async = true;
new_inputs.push_back(tensor_move);
need_tensor_move_async = true;
} else {
new_inputs.push_back(input);
}
}
if (need_memcpy_async) {
if (need_tensor_move_async) {
CNodePtr new_hccl_node = std::make_shared<CNode>(*hccl_node);
new_hccl_node->set_inputs(new_inputs);
auto manager = graph->manager();
@@ -129,15 +129,15 @@ void InsertMemcpyAsyncForHcclOp::InsertMemcpyAsync(const FuncGraphPtr &graph, co
}
}
const AnfNodePtr InsertMemcpyAsyncForHcclOp::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &) const {
const AnfNodePtr InsertTensorMoveForHcclOp::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
const EquivPtr &) const {
if (func_graph == nullptr || node == nullptr || !node->isa<CNode>()) {
return nullptr;
}
if (!AnfAlgo::IsCommunicationOp(node)) {
return nullptr;
}
InsertMemcpyAsync(func_graph, node->cast<CNodePtr>());
InsertTensorMove(func_graph, node->cast<CNodePtr>());
return nullptr;
}
} // namespace opt
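
NeedInsertTensorMove is the policy half of this pass. From the hunks visible here it fires when the HCCL input is a Parameter or ValueNode, when the input is one of the Lamb ops in kNeedInsertTensorMoveOpSet, or when a ref-output kernel's result is still used by another real kernel. The predicate below is a best-guess restatement of that combination, not the verbatim logic, and the op-name strings stand in for the kLambNextMVOpName-style constants whose values are not shown in this diff:

#include <set>
#include <string>

// Stand-in for the queries NeedInsertTensorMove makes; field names are
// descriptive, not MindSpore API.
struct InputInfo {
  bool is_parameter_or_value = false;   // graph input or constant
  std::string op_name;
  bool is_ref_output = false;           // kernel writes one of its inputs in place
  bool used_by_other_real_kernel = false;
};

// Assumed string values for the kLamb*OpName constants.
const std::set<std::string> kNeedInsertTensorMoveOpSet = {
    "LambNextMV", "LambNextMVWithDecay", "LambUpdateWithLR"};

// Best-guess restatement of the conditions visible in the hunks above.
bool NeedInsertTensorMove(const InputInfo &input) {
  if (input.is_parameter_or_value) return true;  // don't let HCCL alias weights/consts
  if (kNeedInsertTensorMoveOpSet.count(input.op_name) > 0) return true;  // special cnodes
  if (input.is_ref_output && input.used_by_other_real_kernel) return true;
  return false;
}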

View File

@@ -13,8 +13,8 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_HCCL_OP_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_HCCL_OP_H_
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_HCCL_OP_H_
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_HCCL_OP_H_
#include <memory>
#include "backend/optimizer/common/optimizer.h"
@@ -22,19 +22,19 @@
namespace mindspore {
namespace opt {
class InsertMemcpyAsyncForHcclOp : public PatternProcessPass {
class InsertTensorMoveForHcclOp : public PatternProcessPass {
public:
explicit InsertMemcpyAsyncForHcclOp(bool multigraph = true)
: PatternProcessPass("insert_memcpy_async_for_hccl_op", multigraph),
explicit InsertTensorMoveForHcclOp(bool multigraph = true)
: PatternProcessPass("insert_tensor_move_for_hccl_op", multigraph),
kernel_query_(std::make_shared<KernelQuery>()) {}
~InsertMemcpyAsyncForHcclOp() override = default;
~InsertTensorMoveForHcclOp() override = default;
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
private:
void InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const;
bool NeedInsertMemcpy(const FuncGraphPtr &graph, const AnfNodePtr &input, const CNodePtr &cur_node) const;
void InsertTensorMove(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const;
bool NeedInsertTensorMove(const FuncGraphPtr &graph, const AnfNodePtr &input, const CNodePtr &cur_node) const;
KernelQueryPtr kernel_query_;
};
} // namespace opt
} // namespace mindspore
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_HCCL_OP_H_
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_HCCL_OP_H_

View File

@@ -22,75 +22,75 @@
#include "utils/utils.h"
#include "backend/kernel_compiler/kernel_build_info.h"
#include "backend/optimizer/common/optimizer.h"
#include "mindspore/ccsrc/backend/optimizer/ascend/enhancer/getnext_memcpy_elimination.h"
#include "mindspore/ccsrc/backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.h"
namespace mindspore {
namespace opt {
class TestGetNextMemcpyElimination : public BackendCommon {
class TestGetNextTensorMoveElimination : public BackendCommon {
public:
TestGetNextMemcpyElimination() : get_py_fun_("gtest_input.pre_activate.getnext_memcpy_elimination_test", true) {}
TestGetNextTensorMoveElimination() : get_py_fun_("gtest_input.pre_activate.getnext_tensor_move_elimination_test", true) {}
public:
UT::PyFuncGraphFetcher get_py_fun_;
};
TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination", "before");
TEST_F(TestGetNextTensorMoveElimination, test_getnext_tensormove_elimination) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination", "before");
ASSERT_TRUE(g_before != nullptr);
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::GetnextMemcpyElimination>();
auto pass = std::make_shared<opt::GetnextTensorMoveElimination>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(g_before);
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination", "after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination", "after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}
TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination_no_attr) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_no_attr", "before");
TEST_F(TestGetNextTensorMoveElimination, test_getnext_tensor_move_elimination_no_attr) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_no_attr", "before");
ASSERT_TRUE(g_before != nullptr);
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::GetnextMemcpyElimination>();
auto pass = std::make_shared<opt::GetnextTensorMoveElimination>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(g_before);
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_no_attr", "after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_no_attr", "after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}
TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination_memcpy_multi_users) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_memcpy_multi_users", "before");
TEST_F(TestGetNextTensorMoveElimination, test_getnext_tensor_move_elimination_tensor_move_multi_users) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_tensor_move_multi_users", "before");
ASSERT_TRUE(g_before != nullptr);
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::GetnextMemcpyElimination>();
auto pass = std::make_shared<opt::GetnextTensorMoveElimination>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(g_before);
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_memcpy_multi_users", "after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_tensor_move_multi_users", "after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}
TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination_next_multi_inputs) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_next_multi_inputs", "before");
TEST_F(TestGetNextTensorMoveElimination, test_getnext_tensor_move_elimination_next_multi_inputs) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_next_multi_inputs", "before");
ASSERT_TRUE(g_before != nullptr);
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::GetnextMemcpyElimination>();
auto pass = std::make_shared<opt::GetnextTensorMoveElimination>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(g_before);
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_next_multi_inputs", "after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_next_multi_inputs", "after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}

View File

@@ -24,23 +24,23 @@
#include "utils/utils.h"
#include "backend/kernel_compiler/kernel_build_info.h"
#include "backend/optimizer/common/optimizer.h"
#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_getnext.h"
#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.h"
namespace mindspore {
namespace opt {
using KernelBuildInfoBuilder = kernel::KernelBuildInfo::KernelBuildInfoBuilder;
class TestHWInsertMemcpyAsyncForGetNext : public BackendCommon {
class TestHWInsertTensorMoveForGetNext : public BackendCommon {
public:
TestHWInsertMemcpyAsyncForGetNext() : get_py_fun_("gtest_input.pre_activate.insert_memcpy_async_for_getnext", true) {}
~TestHWInsertMemcpyAsyncForGetNext() override = default;
TestHWInsertTensorMoveForGetNext() : get_py_fun_("gtest_input.pre_activate.insert_tensor_move_for_getnext", true) {}
~TestHWInsertTensorMoveForGetNext() override = default;
public:
UT::PyFuncGraphFetcher get_py_fun_;
};
TEST_F(TestHWInsertMemcpyAsyncForGetNext, test_insert_memcpy_async_for_getnext_multi_output) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_getnext", "getnext_multi_output_before");
TEST_F(TestHWInsertTensorMoveForGetNext, test_insert_tensor_move_for_getnext_multi_output) {
FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_getnext", "getnext_multi_output_before");
AbstractBasePtrList args_spec_list{};
auto kernel_graph = GetKernelGraph(g_before, args_spec_list);
@@ -57,11 +57,11 @@ TEST_F(TestHWInsertMemcpyAsyncForGetNext, test_insert_memcpy_async_for_getnext_m
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
pm->AddPass(std::make_shared<opt::InsertMemcpyAsyncForGetNext>());
pm->AddPass(std::make_shared<opt::InsertTensorMoveForGetNext>());
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(kernel_graph);
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_getnext", "getnext_multi_output_after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_getnext", "getnext_multi_output_after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}
} // namespace opt

View File

@@ -25,24 +25,24 @@
#include "ir/param_info.h"
#define private public
#define protected public
#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_hccl_op.h"
#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.h"
#undef private
#undef protected
namespace mindspore {
namespace opt {
class TestHWInsertMemcpyForHccl : public BackendCommon {
class TestHWInsertTensorMoveForHccl : public BackendCommon {
public:
TestHWInsertMemcpyForHccl() : get_py_fun_("gtest_input.pre_activate.insert_memcpy_async_for_hccl_op", true) {}
~TestHWInsertMemcpyForHccl() override = default;
TestHWInsertTensorMoveForHccl() : get_py_fun_("gtest_input.pre_activate.insert_tensor_move_for_hccl_op", true) {}
~TestHWInsertTensorMoveForHccl() override = default;
public:
UT::PyFuncGraphFetcher get_py_fun_;
};
class MockInsertMemcpyForHcclKernelQuery : public KernelQuery {
class MockInsertTensorMoveForHcclKernelQuery : public KernelQuery {
public:
MockInsertMemcpyForHcclKernelQuery() = default;
~MockInsertMemcpyForHcclKernelQuery() override = default;
MockInsertTensorMoveForHcclKernelQuery() = default;
~MockInsertTensorMoveForHcclKernelQuery() override = default;
bool IsTbeRef(const AnfNodePtr &node) override {
MS_EXCEPTION_IF_NULL(node);
if (!node->isa<CNode>()) {
@@ -53,9 +53,9 @@ class MockInsertMemcpyForHcclKernelQuery : public KernelQuery {
}
};
TEST_F(TestHWInsertMemcpyForHccl, test_cond1_no_insert) {
TEST_F(TestHWInsertTensorMoveForHccl, test_cond1_no_insert) {
get_py_fun_.SetDoResolve(true);
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond1", "before2");
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond1", "before2");
ASSERT_TRUE(g != nullptr);
std::vector<int64_t> shp_x{1, 64, 112, 112};
auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x);
@@ -66,7 +66,7 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond1_no_insert) {
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>();
auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(kg);
@@ -74,9 +74,9 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond1_no_insert) {
EXPECT_TRUE(CheckEqualGraph(origin_graph, new_graph));
}
TEST_F(TestHWInsertMemcpyForHccl, test_cond2) {
TEST_F(TestHWInsertTensorMoveForHccl, test_cond2) {
get_py_fun_.SetDoResolve(true);
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond2", "before");
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond2", "before");
ASSERT_TRUE(g != nullptr);
std::vector<int64_t> shp_x{1, 64, 112, 112};
auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x);
@@ -90,19 +90,19 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond2) {
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>();
pass->kernel_query_ = std::make_shared<MockInsertMemcpyForHcclKernelQuery>();
auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>();
pass->kernel_query_ = std::make_shared<MockInsertTensorMoveForHcclKernelQuery>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(kg);
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond2", "after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond2", "after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}
TEST_F(TestHWInsertMemcpyForHccl, test_cond3) {
TEST_F(TestHWInsertTensorMoveForHccl, test_cond3) {
get_py_fun_.SetDoResolve(true);
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond3", "before");
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond3", "before");
ASSERT_TRUE(g != nullptr);
std::vector<int64_t> shp_x{3, 2};
auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x);
@@ -112,19 +112,19 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond3) {
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>();
pass->kernel_query_ = std::make_shared<MockInsertMemcpyForHcclKernelQuery>();
auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>();
pass->kernel_query_ = std::make_shared<MockInsertTensorMoveForHcclKernelQuery>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(kg);
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond3", "after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond3", "after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}
TEST_F(TestHWInsertMemcpyForHccl, test_cond4) {
TEST_F(TestHWInsertTensorMoveForHccl, test_cond4) {
get_py_fun_.SetDoResolve(true);
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond4", "before");
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond4", "before");
ASSERT_TRUE(g != nullptr);
std::vector<int64_t> shp_x{1, 64, 112, 112};
auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x);
@@ -139,19 +139,19 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond4) {
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>();
pass->kernel_query_ = std::make_shared<MockInsertMemcpyForHcclKernelQuery>();
auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>();
pass->kernel_query_ = std::make_shared<MockInsertTensorMoveForHcclKernelQuery>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(kg);
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond4", "after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond4", "after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}
TEST_F(TestHWInsertMemcpyForHccl, test_cond5) {
TEST_F(TestHWInsertTensorMoveForHccl, test_cond5) {
get_py_fun_.SetDoResolve(true);
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond5", "before");
FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond5", "before");
ASSERT_TRUE(g != nullptr);
std::vector<int64_t> shp_x{1, 64, 112, 112};
auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x);
@@ -166,14 +166,14 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond5) {
auto optimizer = std::make_shared<opt::GraphOptimizer>();
auto pm = std::make_shared<opt::PassManager>();
auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>();
pass->kernel_query_ = std::make_shared<MockInsertMemcpyForHcclKernelQuery>();
auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>();
pass->kernel_query_ = std::make_shared<MockInsertTensorMoveForHcclKernelQuery>();
pm->AddPass(pass);
optimizer->AddPassManager(pm);
auto new_graph = optimizer->Optimize(kg);
kg->SetExecOrderByDefault();
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond5", "after");
FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond5", "after");
EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
}
} // namespace opt

View File

@@ -18,9 +18,9 @@ from mindspore.ops import Primitive
from mindspore.ops import operations as P
get_next = P.GetNext([ms.float32], [[1, 64, 112, 112]], 1, "")
memcpy_async_attr = Primitive('memcpy_async')
memcpy_async_attr.add_prim_attr("label_for_insert_stream_active", True)
memcpy_async = Primitive('memcpy_async')
tensor_move_attr = Primitive('TensorMove')
tensor_move_attr.add_prim_attr("label_for_insert_stream_active", True)
tensor_move = Primitive('tensor_move')
cast = P.Cast()
add = P.Add()
@@ -36,13 +36,13 @@ class FnDict:
return self.fnDict[name]
def test_getnext_memcpy_elimination(tag):
def test_getnext_tensor_move_elimination(tag):
fns = FnDict()
@fns
def before():
res = get_next()
res = memcpy_async_attr(res)
res = tensor_move_attr(res)
res = cast(res)
res = add(res)
return res
@@ -57,63 +57,63 @@ def test_getnext_memcpy_elimination(tag):
return fns[tag]
def test_getnext_memcpy_elimination_no_attr(tag):
def test_getnext_tensor_move_elimination_no_attr(tag):
fns = FnDict()
@fns
def before():
res = get_next()
res = memcpy_async(res)
res = tensor_move(res)
res = cast(res)
return res
@fns
def after():
res = get_next()
res = memcpy_async(res)
res = tensor_move(res)
res = cast(res)
return res
return fns[tag]
def test_getnext_memcpy_elimination_memcpy_multi_users(tag):
def test_getnext_tensor_move_elimination_tensor_move_multi_users(tag):
fns = FnDict()
@fns
def before():
res = get_next()
memcpy_out = memcpy_async_attr(res)
res = cast(memcpy_out)
res = add(memcpy_out, res)
tensor_move_out = tensor_move_attr(res)
res = cast(tensor_move_out)
res = add(tensor_move_out, res)
return res
@fns
def after():
res = get_next()
memcpy_out = memcpy_async_attr(res)
res = cast(memcpy_out)
res = add(memcpy_out, res)
tensor_move_out = tensor_move_attr(res)
res = cast(tensor_move_out)
res = add(tensor_move_out, res)
return res
return fns[tag]
def test_getnext_memcpy_elimination_next_multi_inputs(tag):
def test_getnext_tensor_move_elimination_next_multi_inputs(tag):
fns = FnDict()
@fns
def before():
res = get_next()
memcpy_out = memcpy_async_attr(res)
res = add(memcpy_out, res)
tensormove_out = tensor_move_attr(res)
res = add(tensormove_out, res)
return res
@fns
def after():
res = get_next()
memcpy_out = memcpy_async_attr(res)
res = add(memcpy_out, res)
tensormove_out = tensor_move_attr(res)
res = add(tensormove_out, res)
return res
return fns[tag]

View File

@@ -19,7 +19,7 @@ from mindspore.ops import _constants as Constants
from mindspore.ops import operations as P
get_next = P.GetNext([ms.float32, ms.int32], [[32, 64], [32]], 2, "")
memcpy_async = Primitive('memcpy_async')
tensor_move = Primitive('TensorMove')
make_tuple = Primitive('MakeTuple')
tuple_getitem = Primitive(Constants.kTupleGetItem)
@@ -35,7 +35,7 @@ class FnDict:
return self.fnDict[name]
def test_insert_memcpy_async_for_getnext(tag):
def test_insert_tensor_move_for_getnext(tag):
fns = FnDict()
@fns
@@ -48,9 +48,9 @@ def test_insert_memcpy_async_for_getnext(tag):
res = get_next()
data = tuple_getitem(res, 0)
label = tuple_getitem(res, 1)
memcpy_async_data = memcpy_async(data)
memcpy_async_label = memcpy_async(label)
bind_tuple = make_tuple(memcpy_async_data, memcpy_async_label)
tensor_move_data = tensor_move(data)
tensor_move_label = tensor_move(label)
bind_tuple = make_tuple(tensor_move_data, tensor_move_label)
get_item0 = tuple_getitem(bind_tuple, 0)
get_item1 = tuple_getitem(bind_tuple, 1)
bind_tuple = make_tuple(make_tuple(get_item0, get_item1))

View File

@@ -20,7 +20,7 @@ from mindspore.ops import _constants as Constants
depend = P.Depend()
all_reduce = P.AllReduce()
broadcast = P.Broadcast(1)
memcpy_async = Primitive('memcpy_async')
tensor_move = Primitive('TensorMove')
make_tuple = Primitive('MakeTuple')
tuple_getitem = Primitive(Constants.kTupleGetItem)
assign_add = P.AssignAdd()
@@ -39,7 +39,7 @@ class FnDict:
return self.fnDict[name]
def test_insert_memcpy_async_for_hccl_op_cond1(tag):
def test_insert_tensor_move_for_hccl_op_cond1(tag):
fns = FnDict()
@fns
@@ -57,14 +57,14 @@ def test_insert_memcpy_async_for_hccl_op_cond1(tag):
@fns
def after(x):
res1 = relu(x)
res2 = memcpy_async(res1)
res2 = tensor_move(res1)
res2 = all_reduce(res2)
return make_tuple(make_tuple(res1, res2))
return fns[tag]
def test_insert_memcpy_async_for_hccl_op_cond2(tag):
def test_insert_tensor_move_for_hccl_op_cond2(tag):
fns = FnDict()
@fns
@@ -74,14 +74,14 @@ def test_insert_memcpy_async_for_hccl_op_cond2(tag):
@fns
def after(x):
res = memcpy_async(x)
res = tensor_move(x)
res = all_reduce(res)
return make_tuple(res)
return fns[tag]
def test_insert_memcpy_async_for_hccl_op_cond3(tag):
def test_insert_tensor_move_for_hccl_op_cond3(tag):
fns = FnDict()
@fns
@@ -93,14 +93,14 @@ def test_insert_memcpy_async_for_hccl_op_cond3(tag):
@fns
def after(a, b):
res = assign_add(a, b)
res = memcpy_async(res)
res = tensor_move(res)
res = all_reduce(res)
return make_tuple(res)
return fns[tag]
def test_insert_memcpy_async_for_hccl_op_cond4(tag):
def test_insert_tensor_move_for_hccl_op_cond4(tag):
fns = FnDict()
@fns
@@ -113,7 +113,7 @@ def test_insert_memcpy_async_for_hccl_op_cond4(tag):
@fns
def after(a, b):
x = relu(a)
y1 = memcpy_async(b)
y1 = tensor_move(b)
y2 = all_reduce(y1)
res = depend(x, y2)
return make_tuple(res)
@@ -121,7 +121,7 @@ def test_insert_memcpy_async_for_hccl_op_cond4(tag):
return fns[tag]
def test_insert_memcpy_async_for_hccl_op_cond5(tag):
def test_insert_tensor_move_for_hccl_op_cond5(tag):
fns = FnDict()
@fns
@@ -134,8 +134,8 @@ def test_insert_memcpy_async_for_hccl_op_cond5(tag):
@fns
def after(a, b, c):
x = relu(a)
m1 = memcpy_async(b)
m2 = memcpy_async(c)
m1 = tensor_move(b)
m2 = tensor_move(c)
y = broadcast(m1, m2)
y0 = tuple_getitem(y, 0)
y1 = tuple_getitem(y, 1)