replace memcpy_async with tensor move
parent e349e5523f, commit 1533435015
@@ -100,13 +100,13 @@
 #include "backend/optimizer/ascend/buffer_fusion/reduce_eltwise_fusion_pass.h"
 #include "backend/optimizer/ascend/buffer_fusion/segment_eltwise_fusion_pass.h"
 #include "backend/optimizer/ascend/format_type/deal_ref_and_split_unsupported_transdata.h"
-#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_hccl_op.h"
-#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_cascade.h"
+#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.h"
+#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_cascade.h"
 #include "backend/optimizer/ascend/enhancer/insert_pad_for_nms_with_mask.h"
 #include "backend/optimizer/ascend/format_type/insert_transdata_for_runop.h"
-#include "backend/optimizer/ascend/enhancer/getnext_memcpy_elimination.h"
+#include "backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.h"
 #include "backend/optimizer/ascend/ir_fission/addn_fission.h"
-#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_getnext.h"
+#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.h"
 #include "backend/optimizer/ascend/ir_fission/batch_norm_grad_infer_fission.h"
 #include "backend/optimizer/ascend/ir_fission/split_fission.h"
 #include "backend/optimizer/ascend/ir_fission/splitv_fission.h"
@@ -292,11 +292,11 @@ void AscendBackendIRFusionOptimization(const std::shared_ptr<session::KernelGrap
   if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) && context_ptr->get_param<bool>(MS_CTX_ENABLE_LOOP_SINK) &&
       ConfigManager::GetInstance().iter_num() > 1) {
-    ir_fusion_pm->AddPass(std::make_shared<InsertMemcpyAsyncForGetNext>());
+    ir_fusion_pm->AddPass(std::make_shared<InsertTensorMoveForGetNext>());
     ir_fusion_pm->AddPass(std::make_shared<GetitemTuple>());
     ir_fusion_pm->AddPass(std::make_shared<EraseVisitAttr>());
   }
-  ir_fusion_pm->AddPass(std::make_shared<InsertMemcpyAsyncForHcclOp>());
+  ir_fusion_pm->AddPass(std::make_shared<InsertTensorMoveForHcclOp>());
   ir_fusion_pm->AddPass(std::make_shared<InsertTranspose>());
   ir_fusion_pm->AddPass(std::make_shared<GetitemTuple>());
   ir_fusion_pm->AddPass(std::make_shared<EraseVisitAttr>());

@@ -370,7 +370,7 @@ void AscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kern
   other_pm->AddPass(std::make_shared<ReduceScatterFusion>());
   other_pm->AddPass(std::make_shared<SplitInputsForReduceScatter>());
   other_pm->AddPass(std::make_shared<BroadcastFusion>());
-  other_pm->AddPass(std::make_shared<InsertMemcpyAsyncForCascade>());
+  other_pm->AddPass(std::make_shared<InsertTensorMoveForCascade>());
   other_pm->AddPass(std::make_shared<ParameterTransOpFusion>());
   other_pm->AddPass(std::make_shared<RefreshParameterFormat>());
   other_pm->AddPass(std::make_shared<SplitOpOptimizer>());

@@ -387,7 +387,7 @@ void AscendBackendOptimization(const std::shared_ptr<session::KernelGraph> &kern
   other2_pm->AddPass(std::make_shared<CommonSubexpressionElimination>());
   if (context_ptr->get_param<bool>(MS_CTX_ENABLE_TASK_SINK) && context_ptr->get_param<bool>(MS_CTX_ENABLE_LOOP_SINK) &&
       ConfigManager::GetInstance().iter_num() > 1) {
-    other2_pm->AddPass(std::make_shared<GetnextMemcpyElimination>());
+    other2_pm->AddPass(std::make_shared<GetnextTensorMoveElimination>());
   }
   other2_pm->AddPass(std::make_shared<CheckConsistency>());
   optimizer2->AddPassManager(other2_pm);
@@ -383,10 +383,10 @@ CNodePtr InsertCastForInput(const FuncGraphPtr &func_graph, const CNodePtr &cnod
   return new_node;
 }

-AnfNodePtr CreateMemcpyAsyncOp(const FuncGraphPtr &graph, const AnfNodePtr &node) {
+AnfNodePtr CreateTensorMoveOp(const FuncGraphPtr &graph, const AnfNodePtr &node) {
   MS_EXCEPTION_IF_NULL(graph);
   MS_EXCEPTION_IF_NULL(node);
-  auto prim = std::make_shared<Primitive>(kMemCpyAsyncOpName);
+  auto prim = std::make_shared<Primitive>(kTensorMoveOpName);
   std::vector<AnfNodePtr> new_node_inputs = {NewValueNode(prim), node};
   auto new_node = graph->NewCNode(new_node_inputs);
   MS_EXCEPTION_IF_NULL(new_node);

@@ -108,7 +108,7 @@ AnfNodePtr InsertTransOpForOutput(const FuncGraphPtr &func_graph, const AnfNodeP

 CNodePtr InsertCastForInput(const FuncGraphPtr &func_graph, const CNodePtr &cnode);

-AnfNodePtr CreateMemcpyAsyncOp(const FuncGraphPtr &graph, const AnfNodePtr &node);
+AnfNodePtr CreateTensorMoveOp(const FuncGraphPtr &graph, const AnfNodePtr &node);

 AnfNodePtr AddTransOpNodeToGraph(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
                                  const KernelSelectPtr &kernel_select, size_t insert_index, bool is_insert_input);
@@ -14,49 +14,49 @@
  * limitations under the License.
  */

-#include "backend/optimizer/ascend/enhancer/getnext_memcpy_elimination.h"
+#include "backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.h"
 #include <memory>
 #include "backend/session/anf_runtime_algorithm.h"
 #include "frontend/optimizer/opt.h"

 namespace mindspore::opt {

-const BaseRef GetnextMemcpyElimination::DefinePattern() const {
-  auto prim_memcpy = std::make_shared<Primitive>(kMemCpyAsyncOpName);
+const BaseRef GetnextTensorMoveElimination::DefinePattern() const {
+  auto prim_tensor_move = std::make_shared<Primitive>(kTensorMoveOpName);
   VarPtr x = std::make_shared<SeqVar>();
-  VectorRef memcpy_async({prim_memcpy, x});
-  return memcpy_async;
+  VectorRef tensor_move({prim_tensor_move, x});
+  return tensor_move;
 }

-const AnfNodePtr GetnextMemcpyElimination::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
-                                                   const EquivPtr &equiv) const {
+const AnfNodePtr GetnextTensorMoveElimination::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
+                                                       const EquivPtr &equiv) const {
   if (graph == nullptr || node == nullptr || equiv == nullptr) {
     return nullptr;
   }
-  auto memcpy_cnode = node->cast<CNodePtr>();
-  if (memcpy_cnode == nullptr) {
+  auto tensor_move_node = node->cast<CNodePtr>();
+  if (tensor_move_node == nullptr) {
     return nullptr;
   }

-  // 1. memcpy has attr kAttrLabelForInsertStreamActive
-  if (!AnfAlgo::HasNodeAttr(kAttrLabelForInsertStreamActive, memcpy_cnode)) {
+  // 1. tensor move has attr kAttrLabelForInsertStreamActive
+  if (!AnfAlgo::HasNodeAttr(kAttrLabelForInsertStreamActive, tensor_move_node)) {
     MS_LOG(DEBUG) << "node has no label_for_insert_stream_active attr";
     return nullptr;
   }

-  // 2. memcpy's output has only one user next_node
+  // 2. tensor move's output has only one user next_node
   auto manager = graph->manager();
   MS_EXCEPTION_IF_NULL(manager);
-  if (manager->node_users().find(memcpy_cnode) == manager->node_users().end()) {
-    MS_LOG(EXCEPTION) << "memcpy has no output in manager";
+  if (manager->node_users().find(tensor_move_node) == manager->node_users().end()) {
+    MS_LOG(EXCEPTION) << "tensor move has no output in manager";
   }
-  auto next_nodes = manager->node_users()[memcpy_cnode];
+  auto next_nodes = manager->node_users()[tensor_move_node];
   if (next_nodes.size() > 1) {
     MS_LOG(DEBUG) << "node's output has more than one users";
     return nullptr;
   }

-  // 3. next_node is not nop node, not graph output and it has only one input which is memcpy's output
+  // 3. next_node is not nop node, not graph output and it has only one input which is tensor move's output
   for (auto &item : next_nodes) {
     auto next_node = item.first->cast<CNodePtr>();
     if (opt::IsNopNode(next_node)) {

@@ -77,6 +77,6 @@ const AnfNodePtr GetnextMemcpyElimination::Process(const FuncGraphPtr &graph, co
     AnfAlgo::SetNodeAttr(kAttrLabelForInsertStreamActive, MakeValue(true), next_node);
   }

-  return memcpy_cnode->input(1);
+  return tensor_move_node->input(1);
 }
 } // namespace mindspore::opt
@@ -13,21 +13,21 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_MEMCPY_ELIMINATION_H
-#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_MEMCPY_ELIMINATION_H
+#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_TENSORMOVE_ELIMINATION_H
+#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_TENSORMOVE_ELIMINATION_H

 #include "backend/optimizer/common/optimizer.h"

 namespace mindspore {
 namespace opt {
-class GetnextMemcpyElimination : public PatternProcessPass {
+class GetnextTensorMoveElimination : public PatternProcessPass {
  public:
-  explicit GetnextMemcpyElimination(bool multigraph = true)
-      : PatternProcessPass("getnext_memcpy_elimination", multigraph) {}
-  ~GetnextMemcpyElimination() override = default;
+  explicit GetnextTensorMoveElimination(bool multigraph = true)
+      : PatternProcessPass("getnext_tensormove_elimination", multigraph) {}
+  ~GetnextTensorMoveElimination() override = default;
   const BaseRef DefinePattern() const override;
   const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
 };
 } // namespace opt
 } // namespace mindspore
-#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_MEMCPY_ELIMINATION_H
+#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_GETNEXT_TENSORMOVE_ELIMINATION_H
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_cascade.h"
+#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_cascade.h"
 #include <vector>
 #include "utils/utils.h"
 #include "backend/session/anf_runtime_algorithm.h"

@@ -69,36 +69,36 @@ bool IsPartOutputsOfHcclOp(const AnfNodePtr &node, const CNodePtr &cur_hccl, con
 }
 } // namespace

-AnfNodePtr InsertMemcpyAsyncForCascade::InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const {
+AnfNodePtr InsertTensorMoveForCascade::InsertTensorMove(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const {
   MS_EXCEPTION_IF_NULL(graph);
   MS_EXCEPTION_IF_NULL(hccl_node);
-  std::vector<AnfNodePtr> memcpy_async_list;
+  std::vector<AnfNodePtr> tensor_move_list;
   std::vector<AnfNodePtr> new_inputs = {hccl_node->input(0)};
   for (size_t i = 1; i < hccl_node->size(); ++i) {
     auto input = hccl_node->input(i);
     MS_EXCEPTION_IF_NULL(input);
     // when input is also a hccl op and just part outputs of it linking with cur_hccl_op
     if (IsPartOutputsOfHcclOp(input, hccl_node, graph)) {
-      auto memcpy_async = CreateMemcpyAsyncOp(graph, input);
-      if (memcpy_async == nullptr) {
-        MS_LOG(EXCEPTION) << "Create memcpy_async op failed."
+      auto tensor_move = CreateTensorMoveOp(graph, input);
+      if (tensor_move == nullptr) {
+        MS_LOG(EXCEPTION) << "Create tensor_move op failed."
                           << " trace: " << trace::DumpSourceLines(hccl_node);
       }
       if (AnfAlgo::IsNodeDynamicShape(input)) {
-        AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), memcpy_async);
+        AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), tensor_move);
       }
       auto kernel_info = std::make_shared<device::KernelInfo>();
-      memcpy_async->set_kernel_info(kernel_info);
+      tensor_move->set_kernel_info(kernel_info);
       MS_EXCEPTION_IF_NULL(kernel_select_);
-      kernel_select_->SelectKernel(memcpy_async->cast<CNodePtr>());
-      new_inputs.push_back(memcpy_async);
-      memcpy_async_list.push_back(memcpy_async);
+      kernel_select_->SelectKernel(tensor_move->cast<CNodePtr>());
+      new_inputs.push_back(tensor_move);
+      tensor_move_list.push_back(tensor_move);
     } else {
       new_inputs.push_back(input);
     }
   }

-  if (!memcpy_async_list.empty()) {
+  if (!tensor_move_list.empty()) {
     CNodePtr new_hccl_node = std::make_shared<CNode>(*hccl_node);
     new_hccl_node->set_inputs(new_inputs);
     return new_hccl_node;

@@ -106,8 +106,8 @@ AnfNodePtr InsertMemcpyAsyncForCascade::InsertMemcpyAsync(const FuncGraphPtr &gr
   return nullptr;
 }

-const AnfNodePtr InsertMemcpyAsyncForCascade::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
-                                                      const EquivPtr &) const {
+const AnfNodePtr InsertTensorMoveForCascade::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
+                                                     const EquivPtr &) const {
   if (func_graph == nullptr || node == nullptr || !node->isa<CNode>()) {
     return nullptr;
   }

@@ -115,7 +115,7 @@ const AnfNodePtr InsertMemcpyAsyncForCascade::Process(const FuncGraphPtr &func_g
   if (!AnfAlgo::IsCommunicationOp(node)) {
     return nullptr;
   }
-  return InsertMemcpyAsync(func_graph, cnode);
+  return InsertTensorMove(func_graph, cnode);
 }
 } // namespace opt
 } // namespace mindspore
@@ -13,8 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_CASCADE_H_
-#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_CASCADE_H_
+#ifndef MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_TENSORMOVE_ASYNC_FOR_CASCADE_H_
+#define MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_TENSORMOVE_ASYNC_FOR_CASCADE_H_

 #include <memory>
 #include "backend/optimizer/common/optimizer.h"

@@ -22,18 +22,18 @@

 namespace mindspore {
 namespace opt {
-class InsertMemcpyAsyncForCascade : public PatternProcessPass {
+class InsertTensorMoveForCascade : public PatternProcessPass {
  public:
-  explicit InsertMemcpyAsyncForCascade(bool multigraph = true)
-      : PatternProcessPass("insert_memcpy_async_for_cascade", multigraph),
+  explicit InsertTensorMoveForCascade(bool multigraph = true)
+      : PatternProcessPass("insert_tensor_move_for_cascade", multigraph),
         kernel_select_(std::make_shared<KernelSelect>()) {}
-  ~InsertMemcpyAsyncForCascade() override = default;
+  ~InsertTensorMoveForCascade() override = default;
   const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;

  private:
-  AnfNodePtr InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const;
+  AnfNodePtr InsertTensorMove(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const;
   KernelSelectPtr kernel_select_;
 };
 } // namespace opt
 } // namespace mindspore
-#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_OP_CASCADE_H_
+#endif // MINDSPORE_CCSRC_PRE_ACTIVATE_ASCEND_ENHANCER_INSERT_TENSORMOVE_ASYNC_FOR_OP_CASCADE_H_
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_getnext.h"
+#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.h"
 #include <vector>
 #include <memory>
 #include "backend/optimizer/ascend/ascend_helper.h"

@@ -22,14 +22,14 @@

 namespace mindspore {
 namespace opt {
-AnfNodePtr InsertMemcpyAsyncForGetNextOutputs(const FuncGraphPtr &func_graph, const AnfNodePtr &node) {
+AnfNodePtr InsertTensorMoveForGetNextOutputs(const FuncGraphPtr &func_graph, const AnfNodePtr &node) {
   if (func_graph == nullptr || node == nullptr) {
     return nullptr;
   }

   size_t output_num = AnfAlgo::GetOutputTensorNum(node);
   if (output_num == 0) {
-    MS_LOG(DEBUG) << "Output number is zero, no need to insert memcpy_async!";
+    MS_LOG(DEBUG) << "Output number is zero, no need to insert tensor_move!";
     return node;
   }

@@ -39,9 +39,9 @@ AnfNodePtr InsertMemcpyAsyncForGetNextOutputs(const FuncGraphPtr &func_graph, co

   for (size_t output_index = 0; output_index < output_num; ++output_index) {
     auto tuple_get_item = CreatTupleGetItemNode(func_graph, node, output_index);
-    auto new_node = CreateMemcpyAsyncOp(func_graph, tuple_get_item);
+    auto new_node = CreateTensorMoveOp(func_graph, tuple_get_item);
     if (new_node == nullptr) {
-      MS_LOG(EXCEPTION) << "Create memcpy_async op failed!";
+      MS_LOG(EXCEPTION) << "Create tensor move op failed!";
     }
     if (AnfAlgo::IsNodeDynamicShape(tuple_get_item)) {
       AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), new_node);

@@ -53,15 +53,15 @@ AnfNodePtr InsertMemcpyAsyncForGetNextOutputs(const FuncGraphPtr &func_graph, co
   return make_tuple;
 }

-const BaseRef InsertMemcpyAsyncForGetNext::DefinePattern() const {
+const BaseRef InsertTensorMoveForGetNext::DefinePattern() const {
   std::shared_ptr<Var> Xs = std::make_shared<SeqVar>();
   auto prim = std::make_shared<Primitive>(kGetNextOpName);

   return VectorRef({prim, Xs});
 }

-const AnfNodePtr InsertMemcpyAsyncForGetNext::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
-                                                      const EquivPtr &) const {
+const AnfNodePtr InsertTensorMoveForGetNext::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
+                                                     const EquivPtr &) const {
   if (func_graph == nullptr || node == nullptr || !AnfAlgo::IsRealKernel(node)) {
     return nullptr;
   }

@@ -73,7 +73,7 @@ const AnfNodePtr InsertMemcpyAsyncForGetNext::Process(const FuncGraphPtr &func_g
   }
   AnfAlgo::SetNodeAttr(kAttrVisited, MakeValue(true), cnode);

-  return InsertMemcpyAsyncForGetNextOutputs(func_graph, cnode);
+  return InsertTensorMoveForGetNextOutputs(func_graph, cnode);
 }
 } // namespace opt
 } // namespace mindspore
@@ -14,22 +14,22 @@
  * limitations under the License.
  */

-#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_GETNEXT_H_
-#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_GETNEXT_H_
+#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_GETNEXT_H_
+#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_GETNEXT_H_

 #include "backend/optimizer/common/optimizer.h"

 namespace mindspore {
 namespace opt {
-class InsertMemcpyAsyncForGetNext : public PatternProcessPass {
+class InsertTensorMoveForGetNext : public PatternProcessPass {
  public:
-  explicit InsertMemcpyAsyncForGetNext(bool multigraph = true)
-      : PatternProcessPass("insert_memcpy_async_for_getnext", multigraph) {}
-  ~InsertMemcpyAsyncForGetNext() override = default;
+  explicit InsertTensorMoveForGetNext(bool multigraph = true)
+      : PatternProcessPass("insert_tensor_move_for_getnext", multigraph) {}
+  ~InsertTensorMoveForGetNext() override = default;
   const BaseRef DefinePattern() const override;
   const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
 };
 } // namespace opt
 } // namespace mindspore

-#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_GETNEXT_H_
+#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_GETNEXT_H_
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_hccl_op.h"
+#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.h"
 #include <vector>
 #include <set>
 #include <string>

@@ -25,9 +25,9 @@
 namespace mindspore {
 namespace opt {
 namespace {
-// insert memcpy for some cnode even if not a Ref cnode
-const std::set<std::string> kNeedInsertMemcpyOpSet = {kLambNextMVOpName, kLambNextMVWithDecayOpName,
-                                                      kLambUpdateWithLROpName};
+// insert tensormove for some cnode even if not a Ref cnode
+const std::set<std::string> kNeedInsertTensorMoveOpSet = {kLambNextMVOpName, kLambNextMVWithDecayOpName,
+                                                          kLambUpdateWithLROpName};

 bool IsParameterOrValueNode(const AnfNodePtr &node) {
   MS_EXCEPTION_IF_NULL(node);

@@ -43,7 +43,7 @@ bool IsParameterOrValueNode(const AnfNodePtr &node) {
 // NodeUsersMap, for node B input i use node A, it will be one item in map with key: A, and value: (B, i)
 bool IsNodeOutPutUsedByOtherRealKernel(const AnfNodeIndexSet &node_users) {
   if (node_users.size() == 1) {
-    MS_LOG(INFO) << "This node only used once, no need to insert memcpy node.";
+    MS_LOG(INFO) << "This node only used once, no need to insert tensormove node.";
     return false;
   }
   for (const auto &node_pair : node_users) {

@@ -53,13 +53,13 @@ bool IsNodeOutPutUsedByOtherRealKernel(const AnfNodeIndexSet &node_users) {
       return true;
     }
   }
-  MS_LOG(INFO) << "This node used by other node, but the node is not real kernel, no need to insert memcpy node.";
+  MS_LOG(INFO) << "This node used by other node, but the node is not real kernel, no need to insert tensormove node.";
   return false;
 }
 } // namespace

-bool InsertMemcpyAsyncForHcclOp::NeedInsertMemcpy(const FuncGraphPtr &graph, const AnfNodePtr &input,
-                                                  const CNodePtr &cur_node) const {
+bool InsertTensorMoveForHcclOp::NeedInsertTensorMove(const FuncGraphPtr &graph, const AnfNodePtr &input,
+                                                     const CNodePtr &cur_node) const {
   MS_EXCEPTION_IF_NULL(graph);
   MS_EXCEPTION_IF_NULL(input);
   MS_EXCEPTION_IF_NULL(cur_node);

@@ -79,7 +79,7 @@ bool InsertMemcpyAsyncForHcclOp::NeedInsertMemcpy(const FuncGraphPtr &graph, con
   }

   // when input is some special cnodes
-  if (kNeedInsertMemcpyOpSet.find(AnfAlgo::GetCNodeName(input)) != kNeedInsertMemcpyOpSet.end()) {
+  if (kNeedInsertTensorMoveOpSet.find(AnfAlgo::GetCNodeName(input)) != kNeedInsertTensorMoveOpSet.end()) {
     return true;
   }

@@ -96,29 +96,29 @@ bool InsertMemcpyAsyncForHcclOp::NeedInsertMemcpy(const FuncGraphPtr &graph, con
   return false;
 }

-void InsertMemcpyAsyncForHcclOp::InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const {
+void InsertTensorMoveForHcclOp::InsertTensorMove(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const {
   MS_EXCEPTION_IF_NULL(graph);
   MS_EXCEPTION_IF_NULL(hccl_node);
-  bool need_memcpy_async = false;
+  bool need_tensor_move_async = false;
   std::vector<AnfNodePtr> new_inputs = {hccl_node->input(0)};
   for (size_t i = 1; i < hccl_node->size(); ++i) {
     auto input = hccl_node->input(i);
-    if (NeedInsertMemcpy(graph, input, hccl_node)) {
-      auto memcpy_async = CreateMemcpyAsyncOp(graph, input);
-      if (memcpy_async == nullptr) {
-        MS_LOG(EXCEPTION) << "Create memcpy_async op failed.";
+    if (NeedInsertTensorMove(graph, input, hccl_node)) {
+      auto tensor_move = CreateTensorMoveOp(graph, input);
+      if (tensor_move == nullptr) {
+        MS_LOG(EXCEPTION) << "Create tensor_move op failed.";
       }
       if (input->isa<CNode>() && AnfAlgo::IsNodeDynamicShape(input)) {
-        AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), memcpy_async);
+        AnfAlgo::SetNodeAttr(kAttrIsDynamicShape, MakeValue(true), tensor_move);
       }
-      new_inputs.push_back(memcpy_async);
-      need_memcpy_async = true;
+      new_inputs.push_back(tensor_move);
+      need_tensor_move_async = true;
     } else {
       new_inputs.push_back(input);
     }
   }

-  if (need_memcpy_async) {
+  if (need_tensor_move_async) {
     CNodePtr new_hccl_node = std::make_shared<CNode>(*hccl_node);
     new_hccl_node->set_inputs(new_inputs);
     auto manager = graph->manager();

@@ -129,15 +129,15 @@ void InsertMemcpyAsyncForHcclOp::InsertMemcpyAsync(const FuncGraphPtr &graph, co
   }
 }

-const AnfNodePtr InsertMemcpyAsyncForHcclOp::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
-                                                     const EquivPtr &) const {
+const AnfNodePtr InsertTensorMoveForHcclOp::Process(const FuncGraphPtr &func_graph, const AnfNodePtr &node,
+                                                    const EquivPtr &) const {
   if (func_graph == nullptr || node == nullptr || !node->isa<CNode>()) {
     return nullptr;
   }
   if (!AnfAlgo::IsCommunicationOp(node)) {
     return nullptr;
   }
-  InsertMemcpyAsync(func_graph, node->cast<CNodePtr>());
+  InsertTensorMove(func_graph, node->cast<CNodePtr>());
   return nullptr;
 }
 } // namespace opt

@@ -13,8 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_HCCL_OP_H_
-#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_HCCL_OP_H_
+#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_HCCL_OP_H_
+#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_HCCL_OP_H_

 #include <memory>
 #include "backend/optimizer/common/optimizer.h"

@@ -22,19 +22,19 @@

 namespace mindspore {
 namespace opt {
-class InsertMemcpyAsyncForHcclOp : public PatternProcessPass {
+class InsertTensorMoveForHcclOp : public PatternProcessPass {
  public:
-  explicit InsertMemcpyAsyncForHcclOp(bool multigraph = true)
-      : PatternProcessPass("insert_memcpy_async_for_hccl_op", multigraph),
+  explicit InsertTensorMoveForHcclOp(bool multigraph = true)
+      : PatternProcessPass("insert_tensor_move_for_hccl_op", multigraph),
        kernel_query_(std::make_shared<KernelQuery>()) {}
-  ~InsertMemcpyAsyncForHcclOp() override = default;
+  ~InsertTensorMoveForHcclOp() override = default;
   const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;

  private:
-  void InsertMemcpyAsync(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const;
-  bool NeedInsertMemcpy(const FuncGraphPtr &graph, const AnfNodePtr &input, const CNodePtr &cur_node) const;
+  void InsertTensorMove(const FuncGraphPtr &graph, const CNodePtr &hccl_node) const;
+  bool NeedInsertTensorMove(const FuncGraphPtr &graph, const AnfNodePtr &input, const CNodePtr &cur_node) const;
   KernelQueryPtr kernel_query_;
 };
 } // namespace opt
 } // namespace mindspore
-#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_MEMCPY_ASYNC_FOR_HCCL_OP_H_
+#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_ASCEND_ENHANCER_INSERT_TENSOR_MOVE_FOR_HCCL_OP_H_
@@ -22,75 +22,75 @@
 #include "utils/utils.h"
 #include "backend/kernel_compiler/kernel_build_info.h"
 #include "backend/optimizer/common/optimizer.h"
-#include "mindspore/ccsrc/backend/optimizer/ascend/enhancer/getnext_memcpy_elimination.h"
+#include "mindspore/ccsrc/backend/optimizer/ascend/enhancer/getnext_tensor_move_elimination.h"

 namespace mindspore {
 namespace opt {
-class TestGetNextMemcpyElimination : public BackendCommon {
+class TestGetNextTensorMoveElimination : public BackendCommon {
  public:
-  TestGetNextMemcpyElimination() : get_py_fun_("gtest_input.pre_activate.getnext_memcpy_elimination_test", true) {}
+  TestGetNextTensorMoveElimination() : get_py_fun_("gtest_input.pre_activate.getnext_tensor_move_elimination_test", true) {}

  public:
  UT::PyFuncGraphFetcher get_py_fun_;
 };

-TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination) {
-  FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination", "before");
+TEST_F(TestGetNextTensorMoveElimination, test_getnext_tensormove_elimination) {
+  FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination", "before");
   ASSERT_TRUE(g_before != nullptr);

   auto optimizer = std::make_shared<opt::GraphOptimizer>();
   auto pm = std::make_shared<opt::PassManager>();
-  auto pass = std::make_shared<opt::GetnextMemcpyElimination>();
+  auto pass = std::make_shared<opt::GetnextTensorMoveElimination>();
   pm->AddPass(pass);
   optimizer->AddPassManager(pm);
   auto new_graph = optimizer->Optimize(g_before);

-  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination", "after");
+  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination", "after");
   EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
 }

-TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination_no_attr) {
-  FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_no_attr", "before");
+TEST_F(TestGetNextTensorMoveElimination, test_getnext_tensor_move_elimination_no_attr) {
+  FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_no_attr", "before");
   ASSERT_TRUE(g_before != nullptr);

   auto optimizer = std::make_shared<opt::GraphOptimizer>();
   auto pm = std::make_shared<opt::PassManager>();
-  auto pass = std::make_shared<opt::GetnextMemcpyElimination>();
+  auto pass = std::make_shared<opt::GetnextTensorMoveElimination>();
   pm->AddPass(pass);
   optimizer->AddPassManager(pm);
   auto new_graph = optimizer->Optimize(g_before);

-  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_no_attr", "after");
+  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_no_attr", "after");
   EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
 }

-TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination_memcpy_multi_users) {
-  FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_memcpy_multi_users", "before");
+TEST_F(TestGetNextTensorMoveElimination, test_getnext_tensor_move_elimination_tensor_move_multi_users) {
+  FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_tensor_move_multi_users", "before");
   ASSERT_TRUE(g_before != nullptr);

   auto optimizer = std::make_shared<opt::GraphOptimizer>();
   auto pm = std::make_shared<opt::PassManager>();
-  auto pass = std::make_shared<opt::GetnextMemcpyElimination>();
+  auto pass = std::make_shared<opt::GetnextTensorMoveElimination>();
   pm->AddPass(pass);
   optimizer->AddPassManager(pm);
   auto new_graph = optimizer->Optimize(g_before);

-  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_memcpy_multi_users", "after");
+  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_tensor_move_multi_users", "after");
   EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
 }

-TEST_F(TestGetNextMemcpyElimination, test_getnext_memcpy_elimination_next_multi_inputs) {
-  FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_next_multi_inputs", "before");
+TEST_F(TestGetNextTensorMoveElimination, test_getnext_tensor_move_elimination_next_multi_inputs) {
+  FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_next_multi_inputs", "before");
   ASSERT_TRUE(g_before != nullptr);

   auto optimizer = std::make_shared<opt::GraphOptimizer>();
   auto pm = std::make_shared<opt::PassManager>();
-  auto pass = std::make_shared<opt::GetnextMemcpyElimination>();
+  auto pass = std::make_shared<opt::GetnextTensorMoveElimination>();
   pm->AddPass(pass);
   optimizer->AddPassManager(pm);
   auto new_graph = optimizer->Optimize(g_before);

-  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_memcpy_elimination_next_multi_inputs", "after");
+  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_getnext_tensor_move_elimination_next_multi_inputs", "after");
   EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
 }

@@ -24,23 +24,23 @@
 #include "utils/utils.h"
 #include "backend/kernel_compiler/kernel_build_info.h"
 #include "backend/optimizer/common/optimizer.h"
-#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_getnext.h"
+#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_getnext.h"

 namespace mindspore {
 namespace opt {
 using KernelBuildInfoBuilder = kernel::KernelBuildInfo::KernelBuildInfoBuilder;

-class TestHWInsertMemcpyAsyncForGetNext : public BackendCommon {
+class TestHWInsertTensorMoveForGetNext : public BackendCommon {
  public:
-  TestHWInsertMemcpyAsyncForGetNext() : get_py_fun_("gtest_input.pre_activate.insert_memcpy_async_for_getnext", true) {}
-  ~TestHWInsertMemcpyAsyncForGetNext() override = default;
+  TestHWInsertTensorMoveForGetNext() : get_py_fun_("gtest_input.pre_activate.insert_tensor_move_for_getnext", true) {}
+  ~TestHWInsertTensorMoveForGetNext() override = default;

  public:
  UT::PyFuncGraphFetcher get_py_fun_;
 };

-TEST_F(TestHWInsertMemcpyAsyncForGetNext, test_insert_memcpy_async_for_getnext_multi_output) {
-  FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_getnext", "getnext_multi_output_before");
+TEST_F(TestHWInsertTensorMoveForGetNext, test_insert_tensor_move_for_getnext_multi_output) {
+  FuncGraphPtr g_before = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_getnext", "getnext_multi_output_before");

   AbstractBasePtrList args_spec_list{};
   auto kernel_graph = GetKernelGraph(g_before, args_spec_list);

@@ -57,11 +57,11 @@ TEST_F(TestHWInsertMemcpyAsyncForGetNext, test_insert_memcpy_async_for_getnext_m

   auto optimizer = std::make_shared<opt::GraphOptimizer>();
   auto pm = std::make_shared<opt::PassManager>();
-  pm->AddPass(std::make_shared<opt::InsertMemcpyAsyncForGetNext>());
+  pm->AddPass(std::make_shared<opt::InsertTensorMoveForGetNext>());
   optimizer->AddPassManager(pm);
   auto new_graph = optimizer->Optimize(kernel_graph);

-  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_getnext", "getnext_multi_output_after");
+  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_getnext", "getnext_multi_output_after");
   EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
 }
 } // namespace opt
@@ -25,24 +25,24 @@
 #include "ir/param_info.h"
 #define private public
 #define protected public
-#include "backend/optimizer/ascend/enhancer/insert_memcpy_async_for_hccl_op.h"
+#include "backend/optimizer/ascend/enhancer/insert_tensor_move_for_hccl_op.h"
 #undef private
 #undef protected
 namespace mindspore {
 namespace opt {
-class TestHWInsertMemcpyForHccl : public BackendCommon {
+class TestHWInsertTensorMoveForHccl : public BackendCommon {
  public:
-  TestHWInsertMemcpyForHccl() : get_py_fun_("gtest_input.pre_activate.insert_memcpy_async_for_hccl_op", true) {}
-  ~TestHWInsertMemcpyForHccl() override = default;
+  TestHWInsertTensorMoveForHccl() : get_py_fun_("gtest_input.pre_activate.insert_tensor_move_for_hccl_op", true) {}
+  ~TestHWInsertTensorMoveForHccl() override = default;

  public:
  UT::PyFuncGraphFetcher get_py_fun_;
 };

-class MockInsertMemcpyForHcclKernelQuery : public KernelQuery {
+class MockInsertTensorMoveForHcclKernelQuery : public KernelQuery {
  public:
-  MockInsertMemcpyForHcclKernelQuery() = default;
-  ~MockInsertMemcpyForHcclKernelQuery() override = default;
+  MockInsertTensorMoveForHcclKernelQuery() = default;
+  ~MockInsertTensorMoveForHcclKernelQuery() override = default;
   bool IsTbeRef(const AnfNodePtr &node) override {
     MS_EXCEPTION_IF_NULL(node);
     if (!node->isa<CNode>()) {

@@ -53,9 +53,9 @@ class MockInsertMemcpyForHcclKernelQuery : public KernelQuery {
   }
 };

-TEST_F(TestHWInsertMemcpyForHccl, test_cond1_no_insert) {
+TEST_F(TestHWInsertTensorMoveForHccl, test_cond1_no_insert) {
   get_py_fun_.SetDoResolve(true);
-  FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond1", "before2");
+  FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond1", "before2");
   ASSERT_TRUE(g != nullptr);
   std::vector<int64_t> shp_x{1, 64, 112, 112};
   auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x);

@@ -66,7 +66,7 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond1_no_insert) {

   auto optimizer = std::make_shared<opt::GraphOptimizer>();
   auto pm = std::make_shared<opt::PassManager>();
-  auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>();
+  auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>();
   pm->AddPass(pass);
   optimizer->AddPassManager(pm);
   auto new_graph = optimizer->Optimize(kg);

@@ -74,9 +74,9 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond1_no_insert) {
   EXPECT_TRUE(CheckEqualGraph(origin_graph, new_graph));
 }

-TEST_F(TestHWInsertMemcpyForHccl, test_cond2) {
+TEST_F(TestHWInsertTensorMoveForHccl, test_cond2) {
   get_py_fun_.SetDoResolve(true);
-  FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond2", "before");
+  FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond2", "before");
   ASSERT_TRUE(g != nullptr);
   std::vector<int64_t> shp_x{1, 64, 112, 112};
   auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x);

@@ -90,19 +90,19 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond2) {

   auto optimizer = std::make_shared<opt::GraphOptimizer>();
   auto pm = std::make_shared<opt::PassManager>();
-  auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>();
-  pass->kernel_query_ = std::make_shared<MockInsertMemcpyForHcclKernelQuery>();
+  auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>();
+  pass->kernel_query_ = std::make_shared<MockInsertTensorMoveForHcclKernelQuery>();
   pm->AddPass(pass);
   optimizer->AddPassManager(pm);
   auto new_graph = optimizer->Optimize(kg);

-  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond2", "after");
+  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond2", "after");
   EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
 }

-TEST_F(TestHWInsertMemcpyForHccl, test_cond3) {
+TEST_F(TestHWInsertTensorMoveForHccl, test_cond3) {
   get_py_fun_.SetDoResolve(true);
-  FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond3", "before");
+  FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond3", "before");
   ASSERT_TRUE(g != nullptr);
   std::vector<int64_t> shp_x{3, 2};
   auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x);

@@ -112,19 +112,19 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond3) {

   auto optimizer = std::make_shared<opt::GraphOptimizer>();
   auto pm = std::make_shared<opt::PassManager>();
-  auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>();
-  pass->kernel_query_ = std::make_shared<MockInsertMemcpyForHcclKernelQuery>();
+  auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>();
+  pass->kernel_query_ = std::make_shared<MockInsertTensorMoveForHcclKernelQuery>();
   pm->AddPass(pass);
   optimizer->AddPassManager(pm);
   auto new_graph = optimizer->Optimize(kg);

-  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond3", "after");
+  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond3", "after");
   EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
 }

-TEST_F(TestHWInsertMemcpyForHccl, test_cond4) {
+TEST_F(TestHWInsertTensorMoveForHccl, test_cond4) {
   get_py_fun_.SetDoResolve(true);
-  FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond4", "before");
+  FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond4", "before");
   ASSERT_TRUE(g != nullptr);
   std::vector<int64_t> shp_x{1, 64, 112, 112};
   auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x);

@@ -139,19 +139,19 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond4) {

   auto optimizer = std::make_shared<opt::GraphOptimizer>();
   auto pm = std::make_shared<opt::PassManager>();
-  auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>();
-  pass->kernel_query_ = std::make_shared<MockInsertMemcpyForHcclKernelQuery>();
+  auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>();
+  pass->kernel_query_ = std::make_shared<MockInsertTensorMoveForHcclKernelQuery>();
   pm->AddPass(pass);
   optimizer->AddPassManager(pm);
   auto new_graph = optimizer->Optimize(kg);

-  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond4", "after");
+  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond4", "after");
   EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
 }

-TEST_F(TestHWInsertMemcpyForHccl, test_cond5) {
+TEST_F(TestHWInsertTensorMoveForHccl, test_cond5) {
   get_py_fun_.SetDoResolve(true);
-  FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond5", "before");
+  FuncGraphPtr g = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond5", "before");
   ASSERT_TRUE(g != nullptr);
   std::vector<int64_t> shp_x{1, 64, 112, 112};
   auto x_abstract = std::make_shared<abstract::AbstractTensor>(kFloat32, shp_x);

@@ -166,14 +166,14 @@ TEST_F(TestHWInsertMemcpyForHccl, test_cond5) {

   auto optimizer = std::make_shared<opt::GraphOptimizer>();
   auto pm = std::make_shared<opt::PassManager>();
-  auto pass = std::make_shared<opt::InsertMemcpyAsyncForHcclOp>();
-  pass->kernel_query_ = std::make_shared<MockInsertMemcpyForHcclKernelQuery>();
+  auto pass = std::make_shared<opt::InsertTensorMoveForHcclOp>();
+  pass->kernel_query_ = std::make_shared<MockInsertTensorMoveForHcclKernelQuery>();
   pm->AddPass(pass);
   optimizer->AddPassManager(pm);
   auto new_graph = optimizer->Optimize(kg);
   kg->SetExecOrderByDefault();

-  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_memcpy_async_for_hccl_op_cond5", "after");
+  FuncGraphPtr g_after = get_py_fun_.CallAndParseRet("test_insert_tensor_move_for_hccl_op_cond5", "after");
   EXPECT_TRUE(CheckEqualGraph(g_after, new_graph));
 }
 } // namespace opt
@@ -18,9 +18,9 @@ from mindspore.ops import Primitive
 from mindspore.ops import operations as P

 get_next = P.GetNext([ms.float32], [[1, 64, 112, 112]], 1, "")
-memcpy_async_attr = Primitive('memcpy_async')
-memcpy_async_attr.add_prim_attr("label_for_insert_stream_active", True)
-memcpy_async = Primitive('memcpy_async')
+tensor_move_attr = Primitive('TensorMove')
+tensor_move_attr.add_prim_attr("label_for_insert_stream_active", True)
+tensor_move = Primitive('tensor_move')
 cast = P.Cast()
 add = P.Add()

@@ -36,13 +36,13 @@ class FnDict:
         return self.fnDict[name]


-def test_getnext_memcpy_elimination(tag):
+def test_getnext_tensor_move_elimination(tag):
     fns = FnDict()

     @fns
     def before():
         res = get_next()
-        res = memcpy_async_attr(res)
+        res = tensor_move_attr(res)
         res = cast(res)
         res = add(res)
         return res

@@ -57,63 +57,63 @@ def test_getnext_memcpy_elimination(tag):
     return fns[tag]


-def test_getnext_memcpy_elimination_no_attr(tag):
+def test_getnext_tensor_move_elimination_no_attr(tag):
     fns = FnDict()

     @fns
     def before():
         res = get_next()
-        res = memcpy_async(res)
+        res = tensor_move(res)
         res = cast(res)
         return res

     @fns
     def after():
         res = get_next()
-        res = memcpy_async(res)
+        res = tensor_move(res)
         res = cast(res)
         return res

     return fns[tag]


-def test_getnext_memcpy_elimination_memcpy_multi_users(tag):
+def test_getnext_tensor_move_elimination_tensor_move_multi_users(tag):
     fns = FnDict()

     @fns
     def before():
         res = get_next()
-        memcpy_out = memcpy_async_attr(res)
-        res = cast(memcpy_out)
-        res = add(memcpy_out, res)
+        tensor_move_out = tensor_move_attr(res)
+        res = cast(tensor_move_out)
+        res = add(tensor_move_out, res)
         return res

     @fns
     def after():
         res = get_next()
-        memcpy_out = memcpy_async_attr(res)
-        res = cast(memcpy_out)
-        res = add(memcpy_out, res)
+        tensor_move_out = tensor_move_attr(res)
+        res = cast(tensor_move_out)
+        res = add(tensor_move_out, res)
         return res

     return fns[tag]


-def test_getnext_memcpy_elimination_next_multi_inputs(tag):
+def test_getnext_tensor_move_elimination_next_multi_inputs(tag):
     fns = FnDict()

     @fns
     def before():
         res = get_next()
-        memcpy_out = memcpy_async_attr(res)
-        res = add(memcpy_out, res)
+        tensormove_out = tensor_move_attr(res)
+        res = add(tensormove_out, res)
         return res

     @fns
     def after():
         res = get_next()
-        memcpy_out = memcpy_async_attr(res)
-        res = add(memcpy_out, res)
+        tensormove_out = tensor_move_attr(res)
+        res = add(tensormove_out, res)
         return res

     return fns[tag]
@@ -19,7 +19,7 @@ from mindspore.ops import _constants as Constants
 from mindspore.ops import operations as P

 get_next = P.GetNext([ms.float32, ms.int32], [[32, 64], [32]], 2, "")
-memcpy_async = Primitive('memcpy_async')
+tensor_move = Primitive('TensorMove')
 make_tuple = Primitive('MakeTuple')
 tuple_getitem = Primitive(Constants.kTupleGetItem)

@@ -35,7 +35,7 @@ class FnDict:
         return self.fnDict[name]


-def test_insert_memcpy_async_for_getnext(tag):
+def test_insert_tensor_move_for_getnext(tag):
     fns = FnDict()

     @fns

@@ -48,9 +48,9 @@ def test_insert_memcpy_async_for_getnext(tag):
         res = get_next()
         data = tuple_getitem(res, 0)
         label = tuple_getitem(res, 1)
-        memcpy_async_data = memcpy_async(data)
-        memcpy_async_label = memcpy_async(label)
-        bind_tuple = make_tuple(memcpy_async_data, memcpy_async_label)
+        tensor_move_data = tensor_move(data)
+        tensor_move_label = tensor_move(label)
+        bind_tuple = make_tuple(tensor_move_data, tensor_move_label)
         get_item0 = tuple_getitem(bind_tuple, 0)
         get_item1 = tuple_getitem(bind_tuple, 1)
         bind_tuple = make_tuple(make_tuple(get_item0, get_item1))
@@ -20,7 +20,7 @@ from mindspore.ops import _constants as Constants
 depend = P.Depend()
 all_reduce = P.AllReduce()
 broadcast = P.Broadcast(1)
-memcpy_async = Primitive('memcpy_async')
+tensor_move = Primitive('TensorMove')
 make_tuple = Primitive('MakeTuple')
 tuple_getitem = Primitive(Constants.kTupleGetItem)
 assign_add = P.AssignAdd()

@@ -39,7 +39,7 @@ class FnDict:
         return self.fnDict[name]


-def test_insert_memcpy_async_for_hccl_op_cond1(tag):
+def test_insert_tensor_move_for_hccl_op_cond1(tag):
     fns = FnDict()

     @fns

@@ -57,14 +57,14 @@ def test_insert_memcpy_async_for_hccl_op_cond1(tag):
     @fns
     def after(x):
         res1 = relu(x)
-        res2 = memcpy_async(res1)
+        res2 = tensor_move(res1)
         res2 = all_reduce(res2)
         return make_tuple(make_tuple(res1, res2))

     return fns[tag]


-def test_insert_memcpy_async_for_hccl_op_cond2(tag):
+def test_insert_tensor_move_for_hccl_op_cond2(tag):
     fns = FnDict()

     @fns

@@ -74,14 +74,14 @@ def test_insert_memcpy_async_for_hccl_op_cond2(tag):

     @fns
     def after(x):
-        res = memcpy_async(x)
+        res = tensor_move(x)
         res = all_reduce(res)
         return make_tuple(res)

     return fns[tag]


-def test_insert_memcpy_async_for_hccl_op_cond3(tag):
+def test_insert_tensor_move_for_hccl_op_cond3(tag):
     fns = FnDict()

     @fns

@@ -93,14 +93,14 @@ def test_insert_memcpy_async_for_hccl_op_cond3(tag):
     @fns
     def after(a, b):
         res = assign_add(a, b)
-        res = memcpy_async(res)
+        res = tensor_move(res)
         res = all_reduce(res)
         return make_tuple(res)

     return fns[tag]


-def test_insert_memcpy_async_for_hccl_op_cond4(tag):
+def test_insert_tensor_move_for_hccl_op_cond4(tag):
     fns = FnDict()

     @fns

@@ -113,7 +113,7 @@ def test_insert_memcpy_async_for_hccl_op_cond4(tag):
     @fns
     def after(a, b):
         x = relu(a)
-        y1 = memcpy_async(b)
+        y1 = tensor_move(b)
         y2 = all_reduce(y1)
         res = depend(x, y2)
         return make_tuple(res)

@@ -121,7 +121,7 @@ def test_insert_memcpy_async_for_hccl_op_cond4(tag):
     return fns[tag]


-def test_insert_memcpy_async_for_hccl_op_cond5(tag):
+def test_insert_tensor_move_for_hccl_op_cond5(tag):
     fns = FnDict()

     @fns

@@ -134,8 +134,8 @@ def test_insert_memcpy_async_for_hccl_op_cond5(tag):
     @fns
     def after(a, b, c):
         x = relu(a)
-        m1 = memcpy_async(b)
-        m2 = memcpy_async(c)
+        m1 = tensor_move(b)
+        m2 = tensor_move(c)
         y = broadcast(m1, m2)
         y0 = tuple_getitem(y, 0)
         y1 = tuple_getitem(y, 1)