forked from OSSInnovation/mindspore
!3709 GPU update bng pass
Merge pull request !3709 from VectorSL/update-bng-pass
This commit is contained in:
commit
0df4b11487
|
@ -1,88 +0,0 @@
|
||||||
/**
|
|
||||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
#include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h"
|
|
||||||
|
|
||||||
#include <memory>
|
|
||||||
#include <vector>
|
|
||||||
#include <string>
|
|
||||||
|
|
||||||
#include "backend/session/anf_runtime_algorithm.h"
|
|
||||||
#include "ir/primitive.h"
|
|
||||||
#include "utils/utils.h"
|
|
||||||
#include "backend/optimizer/common/helper.h"
|
|
||||||
|
|
||||||
namespace mindspore {
|
|
||||||
namespace opt {
|
|
||||||
// Describes the subgraph this pass looks for, from the inside out:
//   Cast(TupleGetItem(FusedBatchNormGrad(dy_, x_, scale_, mean_, var_), index_))
// The returned pattern is rooted at the Cast primitive.
const BaseRef ReplaceBNGradCast2Fusion::DefinePattern() const {
  // Innermost node: the fused batch-norm gradient with its five inputs.
  const VectorRef bn_grad({prim::kPrimFusedBatchNormGrad, dy_, x_, scale_, mean_, var_});
  // One element picked out of the gradient's tuple output.
  const VectorRef get_item({prim::kPrimTupleGetItem, bn_grad, index_});
  // Root of the pattern: a Cast applied to that element.
  return VectorRef({prim::kPrimCast, get_item});
}
|
|
||||||
|
|
||||||
// Rewrites a matched Cast(TupleGetItem(FusedBatchNormGrad(...), 0)) subgraph so that the
// Cast is bypassed: users of the Cast are redirected to the TupleGetItem, and the inferred
// type of output 0 (on both FusedBatchNormGrad and the TupleGetItem) is overwritten with
// the inferred type reported for the producer of FusedBatchNormGrad's input 0.
//
// Params:
//   graph - function graph that owns the matched nodes; must have a manager.
//   node  - root of the match, i.e. the Cast node.
//   equiv - pattern bindings; only checked for null here.
// Returns:
//   the TupleGetItem node that replaces the Cast, or nullptr when the match is left
//   untouched (tuple index is not 0, or FusedBatchNormGrad reports no outputs).
// Raises (via MS_EXCEPTION_IF_NULL) when any required node or the manager is null.
const AnfNodePtr ReplaceBNGradCast2Fusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
                                                   const EquivPtr &equiv) const {
  MS_EXCEPTION_IF_NULL(graph);
  MS_EXCEPTION_IF_NULL(node);
  MS_EXCEPTION_IF_NULL(equiv);

  // Input 0 of the Cast is the TupleGetItem; validate it before it is cast and queried.
  auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
  MS_EXCEPTION_IF_NULL(tuple);

  // Input 1 of the TupleGetItem carries the element index as a value node.
  auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1);
  MS_EXCEPTION_IF_NULL(index_node);
  auto value_node = index_node->cast<ValueNodePtr>();
  MS_EXCEPTION_IF_NULL(value_node);
  const int item_idx = GetValue<int>(value_node->value());
  if (item_idx != 0) {
    // Only the Cast hanging off output 0 is handled by this pass.
    return nullptr;
  }

  // Input 0 of the TupleGetItem is the FusedBatchNormGrad node. It is checked immediately,
  // before any further query on it. Its five inputs are not re-fetched here: the pattern
  // in DefinePattern() already requires them and this function never reads them.
  auto fbn2g = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0);
  MS_EXCEPTION_IF_NULL(fbn2g);

  auto manager = graph->manager();
  MS_EXCEPTION_IF_NULL(manager);

  // Bail out before mutating the graph if there is no output 0 to retype.
  const size_t output_num = AnfAlgo::GetOutputTensorNum(fbn2g);
  if (output_num == 0) {
    return nullptr;
  }

  // Redirect every user of the Cast to the TupleGetItem.
  manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple));

  // Type to stamp on output 0; queried once and reused for both nodes below.
  // NOTE(review): presumably the dtype of dy (input 0 in the pattern) - confirm.
  const TypeId input0_type = AnfAlgo::GetPrevNodeOutputInferDataType(fbn2g, 0);

  // Re-register all FusedBatchNormGrad outputs, changing only the type of output 0.
  std::vector<TypeId> outputs_type;
  std::vector<std::vector<size_t>> outputs_shape;
  outputs_type.reserve(output_num);
  outputs_shape.reserve(output_num);
  for (size_t i = 0; i < output_num; ++i) {
    outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i));
    outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i));
  }
  outputs_type[0] = input0_type;
  AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get());

  // The TupleGetItem keeps its shape but takes the same type as output 0 above.
  const std::vector<TypeId> item_type{input0_type};
  const std::vector<std::vector<size_t>> item_shape{AnfAlgo::GetOutputInferShape(tuple, 0)};
  AnfAlgo::SetOutputInferTypeAndShape(item_type, item_shape, tuple.get());

  return tuple;
}
|
|
||||||
} // namespace opt
|
|
||||||
} // namespace mindspore
|
|
|
@ -1,54 +0,0 @@
|
||||||
/**
|
|
||||||
* Copyright 2020 Huawei Technologies Co., Ltd
|
|
||||||
*
|
|
||||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
* you may not use this file except in compliance with the License.
|
|
||||||
* You may obtain a copy of the License at
|
|
||||||
*
|
|
||||||
* http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
*
|
|
||||||
* Unless required by applicable law or agreed to in writing, software
|
|
||||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
* See the License for the specific language governing permissions and
|
|
||||||
* limitations under the License.
|
|
||||||
*/
|
|
||||||
#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_
|
|
||||||
#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_
|
|
||||||
|
|
||||||
#include <memory>
|
|
||||||
#include "backend/optimizer/common/optimizer.h"
|
|
||||||
|
|
||||||
namespace mindspore {
|
|
||||||
namespace opt {
|
|
||||||
class ReplaceBNGradCast2Fusion : public PatternProcessPass {
|
|
||||||
public:
|
|
||||||
explicit ReplaceBNGradCast2Fusion(bool multigraph = true) : PatternProcessPass("replace_grad_cast2", multigraph) {
|
|
||||||
dy_ = std::make_shared<Var>();
|
|
||||||
x_ = std::make_shared<Var>();
|
|
||||||
scale_ = std::make_shared<Var>();
|
|
||||||
mean_ = std::make_shared<Var>();
|
|
||||||
var_ = std::make_shared<Var>();
|
|
||||||
dx_ = std::make_shared<Var>();
|
|
||||||
bn_scale_ = std::make_shared<Var>();
|
|
||||||
bn_bias_ = std::make_shared<Var>();
|
|
||||||
index_ = std::make_shared<Var>();
|
|
||||||
}
|
|
||||||
~ReplaceBNGradCast2Fusion() override = default;
|
|
||||||
const BaseRef DefinePattern() const override;
|
|
||||||
const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
|
|
||||||
|
|
||||||
private:
|
|
||||||
VarPtr dy_;
|
|
||||||
VarPtr x_;
|
|
||||||
VarPtr scale_;
|
|
||||||
VarPtr mean_;
|
|
||||||
VarPtr var_;
|
|
||||||
VarPtr dx_;
|
|
||||||
VarPtr bn_scale_;
|
|
||||||
VarPtr bn_bias_;
|
|
||||||
VarPtr index_;
|
|
||||||
};
|
|
||||||
} // namespace opt
|
|
||||||
} // namespace mindspore
|
|
||||||
#endif // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_
|
|
|
@ -45,7 +45,7 @@ const AnfNodePtr ReplaceBNGradCastFusion::Process(const FuncGraphPtr &graph, con
|
||||||
auto dy_before = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(dy_after), 0);
|
auto dy_before = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(dy_after), 0);
|
||||||
auto x_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 1);
|
auto x_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 1);
|
||||||
auto x_type = AnfAlgo::GetOutputInferDataType(x_, 0);
|
auto x_type = AnfAlgo::GetOutputInferDataType(x_, 0);
|
||||||
// if x_type is fp32, the cast is nessery.
|
// if x_type is fp32, the cast is necessary.
|
||||||
if (x_type == kNumberTypeFloat32) {
|
if (x_type == kNumberTypeFloat32) {
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
@ -65,35 +65,45 @@ const AnfNodePtr ReplaceBNGradCastFusion::Process(const FuncGraphPtr &graph, con
|
||||||
auto manager = graph->manager();
|
auto manager = graph->manager();
|
||||||
MS_EXCEPTION_IF_NULL(manager);
|
MS_EXCEPTION_IF_NULL(manager);
|
||||||
|
|
||||||
auto outlist = GetRealNodeUsedList(graph, fbn2g);
|
// 1. get all of the fusedbatchnormgrad nodes connected after dy_after.
|
||||||
for (size_t i = 0; i < outlist->size(); i++) {
|
auto fbn2g_all = GetRealNodeUsedList(graph, dy_after);
|
||||||
auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(outlist->at(i).first), 1);
|
for (size_t i = 0; i < fbn2g_all->size(); i++) {
|
||||||
|
outputs_type.clear();
|
||||||
|
outputs_shape.clear();
|
||||||
|
auto kernel = utils::cast<CNodePtr>(fbn2g_all->at(i).first);
|
||||||
|
auto kernel_name = AnfAlgo::GetCNodeName(kernel);
|
||||||
|
// 2. deal all of the fusedbatchnormgrad, change the data type.
|
||||||
|
if (kernel_name == AnfAlgo::GetCNodeName(utils::cast<CNodePtr>(fbn2g))) {
|
||||||
|
auto output_num = AnfAlgo::GetOutputTensorNum(kernel);
|
||||||
|
for (size_t j = 0; j < output_num; j++) {
|
||||||
|
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(kernel, j));
|
||||||
|
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(kernel, j));
|
||||||
|
}
|
||||||
|
outputs_type[0] = kNumberTypeFloat16;
|
||||||
|
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, kernel.get());
|
||||||
|
}
|
||||||
|
// 3. handle the output of fusedbatchnormgrad: tuplegetitem
|
||||||
|
auto outlist = GetRealNodeUsedList(graph, kernel);
|
||||||
|
for (size_t j = 0; j < outlist->size(); j++) {
|
||||||
|
outputs_type.clear();
|
||||||
|
outputs_shape.clear();
|
||||||
|
auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(outlist->at(j).first), 1);
|
||||||
auto value_node = index_node->cast<ValueNodePtr>();
|
auto value_node = index_node->cast<ValueNodePtr>();
|
||||||
MS_EXCEPTION_IF_NULL(value_node);
|
MS_EXCEPTION_IF_NULL(value_node);
|
||||||
int item_idx = GetValue<int>(value_node->value());
|
int item_idx = GetValue<int>(value_node->value());
|
||||||
if (item_idx == 0) {
|
if (item_idx == 0) {
|
||||||
auto cast = GetRealNodeUsedList(graph, outlist->at(i).first);
|
auto cast = GetRealNodeUsedList(graph, outlist->at(j).first);
|
||||||
if (AnfAlgo::GetCNodeName(cast->at(0).first) != "Cast") {
|
if (AnfAlgo::GetCNodeName(cast->at(0).first) != "Cast") {
|
||||||
return nullptr;
|
continue;
|
||||||
}
|
}
|
||||||
manager->Replace(utils::cast<CNodePtr>(cast->at(0).first), utils::cast<CNodePtr>(outlist->at(i).first));
|
manager->Replace(utils::cast<CNodePtr>(cast->at(0).first), utils::cast<CNodePtr>(outlist->at(j).first));
|
||||||
outputs_type.push_back(kNumberTypeFloat16);
|
outputs_type.push_back(kNumberTypeFloat16);
|
||||||
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(outlist->at(i).first, 0));
|
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(outlist->at(j).first, 0));
|
||||||
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, outlist->at(i).first.get());
|
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, outlist->at(j).first.get());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
outputs_type.clear();
|
|
||||||
outputs_shape.clear();
|
|
||||||
manager->Replace(utils::cast<CNodePtr>(dy_after), utils::cast<CNodePtr>(dy_before));
|
manager->Replace(utils::cast<CNodePtr>(dy_after), utils::cast<CNodePtr>(dy_before));
|
||||||
|
|
||||||
auto output_num = AnfAlgo::GetOutputTensorNum(fbn2g);
|
|
||||||
for (size_t i = 0; i < output_num; i++) {
|
|
||||||
outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i));
|
|
||||||
outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i));
|
|
||||||
}
|
|
||||||
outputs_type[0] = kNumberTypeFloat16;
|
|
||||||
AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get());
|
|
||||||
|
|
||||||
return node;
|
return node;
|
||||||
}
|
}
|
||||||
} // namespace opt
|
} // namespace opt
|
||||||
|
|
|
@ -28,7 +28,6 @@
|
||||||
#include "backend/optimizer/gpu/adam_fusion.h"
|
#include "backend/optimizer/gpu/adam_fusion.h"
|
||||||
#include "backend/optimizer/gpu/replace_bn_cast_fusion.h"
|
#include "backend/optimizer/gpu/replace_bn_cast_fusion.h"
|
||||||
#include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h"
|
#include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h"
|
||||||
#include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h"
|
|
||||||
#include "backend/optimizer/gpu/replace_momentum_cast_fusion.h"
|
#include "backend/optimizer/gpu/replace_momentum_cast_fusion.h"
|
||||||
#include "backend/optimizer/gpu/replace_addn_fusion.h"
|
#include "backend/optimizer/gpu/replace_addn_fusion.h"
|
||||||
#include "runtime/device/kernel_runtime_manager.h"
|
#include "runtime/device/kernel_runtime_manager.h"
|
||||||
|
@ -68,7 +67,6 @@ void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
|
||||||
pm->AddPass(std::make_shared<opt::AdamFusion>());
|
pm->AddPass(std::make_shared<opt::AdamFusion>());
|
||||||
pm->AddPass(std::make_shared<opt::ReplaceBNCastFusion>());
|
pm->AddPass(std::make_shared<opt::ReplaceBNCastFusion>());
|
||||||
pm->AddPass(std::make_shared<opt::ReplaceBNGradCastFusion>());
|
pm->AddPass(std::make_shared<opt::ReplaceBNGradCastFusion>());
|
||||||
pm->AddPass(std::make_shared<opt::ReplaceBNGradCast2Fusion>());
|
|
||||||
pm->AddPass(std::make_shared<opt::ReplaceMomentumCastFusion>());
|
pm->AddPass(std::make_shared<opt::ReplaceMomentumCastFusion>());
|
||||||
pm->AddPass(std::make_shared<opt::ReplaceAddNFusion>());
|
pm->AddPass(std::make_shared<opt::ReplaceAddNFusion>());
|
||||||
optimizer->AddPassManager(pm);
|
optimizer->AddPassManager(pm);
|
||||||
|
|
Loading…
Reference in New Issue