gpu change bncast

2020-09-22 10:13:19 +08:00 · 2020-09-22 10:13:19 +08:00 · 48db7f8c4f
parent f5a196d54f
commit 48db7f8c4f
10 changed files with 7 additions and 330 deletions
--- a/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_cast_fusion.cc
+++ b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_cast_fusion.cc
@@ -1,89 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "backend/optimizer/gpu/replace_bn_cast_fusion.h"
-
-#include <memory>
-#include <vector>
-#include <string>
-
-#include "backend/session/anf_runtime_algorithm.h"
-#include "ir/primitive.h"
-#include "utils/utils.h"
-#include "backend/optimizer/common/helper.h"
-
-namespace mindspore {
-namespace opt {
-const BaseRef ReplaceBNCastFusion::DefinePattern() const {
-  VectorRef in_cast = VectorRef({prim::kPrimCast, x_});
-  VectorRef fbn2 = VectorRef({prim::kPrimFusedBatchNormEx, in_cast, scale_, bias_, mean_, var_});
-  VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2, index_});
-  return tupleget;
-}
-
-const AnfNodePtr ReplaceBNCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
-                                              const EquivPtr &equiv) const {
-  MS_EXCEPTION_IF_NULL(graph);
-  MS_EXCEPTION_IF_NULL(node);
-  auto fbn2 = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
-  auto x_after = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2), 0);
-  auto x_before = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(x_after), 0);
-  MS_EXCEPTION_IF_NULL(fbn2);
-  MS_EXCEPTION_IF_NULL(x_after);
-  MS_EXCEPTION_IF_NULL(x_before);
-  // only deal with x_after with fp32: x 16->32->bn->16->32
-  if (AnfAlgo::GetOutputInferDataType(x_after, 0) == kNumberTypeFloat16) {
-    return nullptr;
-  }
-  std::vector<TypeId> outputs_type;
-  std::vector<std::vector<size_t>> outputs_shape;
-  auto manager = graph->manager();
-  MS_EXCEPTION_IF_NULL(manager);
-  auto outlist = GetRealNodeUsedList(graph, fbn2);
-  bool changed = false;
-  for (size_t i = 0; i < outlist->size(); i++) {
-    auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(outlist->at(i).first), 1);
-    auto value_node = index_node->cast<ValueNodePtr>();
-    MS_EXCEPTION_IF_NULL(value_node);
-    int item_idx = GetValue<int>(value_node->value());
-    if (item_idx == 0) {
-      auto cast = GetRealNodeUsedList(graph, outlist->at(i).first);
-      if (AnfAlgo::GetCNodeName(cast->at(0).first) != "Cast") {
-        continue;
-      }
-      manager->Replace(utils::cast<CNodePtr>(cast->at(0).first), utils::cast<CNodePtr>(outlist->at(i).first));
-      outputs_type.push_back(kNumberTypeFloat16);
-      outputs_shape.push_back(AnfAlgo::GetOutputInferShape(outlist->at(i).first, 0));
-      AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, outlist->at(i).first.get());
-      changed = true;
-    }
-  }
-  if (!changed) {
-    return nullptr;
-  }
-  manager->Replace(utils::cast<CNodePtr>(x_after), utils::cast<CNodePtr>(x_before));
-  outputs_type.clear();
-  outputs_shape.clear();
-  auto output_num = AnfAlgo::GetOutputTensorNum(fbn2);
-  for (size_t i = 0; i < output_num; i++) {
-    outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2, i));
-    outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2, i));
-  }
-  outputs_type[0] = kNumberTypeFloat16;
-  AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2.get());
-  return node;
-}
-}  // namespace opt
-}  // namespace mindspore
--- a/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_cast_fusion.h
+++ b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_cast_fusion.h
@@ -1,58 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_
-#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_
-
-#include <memory>
-#include "backend/optimizer/common/optimizer.h"
-
-namespace mindspore {
-namespace opt {
-class ReplaceBNCastFusion : public PatternProcessPass {
- public:
-  explicit ReplaceBNCastFusion(bool multigraph = true) : PatternProcessPass("replace_bn_cast", multigraph) {
-    x_ = std::make_shared<Var>();
-    scale_ = std::make_shared<Var>();
-    bias_ = std::make_shared<Var>();
-    mean_ = std::make_shared<Var>();
-    var_ = std::make_shared<Var>();
-    y_ = std::make_shared<Var>();
-    running_mean_ = std::make_shared<Var>();
-    running_var_ = std::make_shared<Var>();
-    save_mean_ = std::make_shared<Var>();
-    save_var_ = std::make_shared<Var>();
-    index_ = std::make_shared<Var>();
-  }
-  ~ReplaceBNCastFusion() override = default;
-  const BaseRef DefinePattern() const override;
-  const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
-
- private:
-  VarPtr x_;
-  VarPtr scale_;
-  VarPtr bias_;
-  VarPtr mean_;
-  VarPtr var_;
-  VarPtr y_;
-  VarPtr running_mean_;
-  VarPtr running_var_;
-  VarPtr save_mean_;
-  VarPtr save_var_;
-  VarPtr index_;
-};
-}  // namespace opt
-}  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_CAST_FUSION_H_
--- a/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast_fusion.cc
+++ b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast_fusion.cc
@@ -1,108 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h"
-
-#include <memory>
-#include <vector>
-#include <string>
-
-#include "backend/session/anf_runtime_algorithm.h"
-#include "ir/primitive.h"
-#include "utils/utils.h"
-#include "backend/optimizer/common/helper.h"
-
-namespace mindspore {
-namespace opt {
-const BaseRef ReplaceBNGradCastFusion::DefinePattern() const {
-  VectorRef dy_cast = VectorRef({prim::kPrimCast, dy_});
-  VectorRef fbn2g = VectorRef({prim::kPrimFusedBatchNormGradEx, dy_cast, x_, scale_, mean_, var_, reserve_});
-  VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2g, index_});
-  return tupleget;
-}
-
-const void HandleOutput(const FuncGraphPtr &graph, const mindspore::CNodePtr &kernel) {
-  auto outlist = GetRealNodeUsedList(graph, kernel);
-  auto manager = graph->manager();
-  MS_EXCEPTION_IF_NULL(manager);
-  for (size_t j = 0; j < outlist->size(); j++) {
-    std::vector<TypeId> outputs_type;
-    std::vector<std::vector<size_t>> outputs_shape;
-    auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(outlist->at(j).first), 1);
-    auto value_node = index_node->cast<ValueNodePtr>();
-    MS_EXCEPTION_IF_NULL(value_node);
-    int item_idx = GetValue<int>(value_node->value());
-    if (item_idx == 0) {
-      auto cast = GetRealNodeUsedList(graph, outlist->at(j).first);
-      if (AnfAlgo::GetCNodeName(cast->at(0).first) != "Cast") {
-        continue;
-      }
-      manager->Replace(utils::cast<CNodePtr>(cast->at(0).first), utils::cast<CNodePtr>(outlist->at(j).first));
-      outputs_type.push_back(kNumberTypeFloat16);
-      outputs_shape.push_back(AnfAlgo::GetOutputInferShape(outlist->at(j).first, 0));
-      AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, outlist->at(j).first.get());
-    }
-  }
-}
-
-const AnfNodePtr ReplaceBNGradCastFusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
-                                                  const EquivPtr &equiv) const {
-  MS_EXCEPTION_IF_NULL(graph);
-  MS_EXCEPTION_IF_NULL(node);
-  MS_EXCEPTION_IF_NULL(equiv);
-
-  auto fbn2g = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
-  auto dy_after = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 0);
-  auto dy_before = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(dy_after), 0);
-  auto x_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 1);
-  MS_EXCEPTION_IF_NULL(x_);
-  // if x_type is fp32, the cast is necessary or dy_afer is fp32: dy 16->32->bng->16->32.
-  if (AnfAlgo::GetOutputInferDataType(x_, 0) == kNumberTypeFloat32 ||
-      AnfAlgo::GetOutputInferDataType(dy_after, 0) == kNumberTypeFloat16) {
-    return nullptr;
-  }
-  MS_EXCEPTION_IF_NULL(fbn2g);
-  MS_EXCEPTION_IF_NULL(dy_after);
-  MS_EXCEPTION_IF_NULL(dy_before);
-  std::vector<TypeId> outputs_type;
-  std::vector<std::vector<size_t>> outputs_shape;
-  auto manager = graph->manager();
-  MS_EXCEPTION_IF_NULL(manager);
-
-  // 1. get all of the fusedbatchnormgrad nodes connected after dy_after.
-  auto fbn2g_all = GetRealNodeUsedList(graph, dy_after);
-  for (size_t i = 0; i < fbn2g_all->size(); i++) {
-    outputs_type.clear();
-    outputs_shape.clear();
-    auto kernel = utils::cast<CNodePtr>(fbn2g_all->at(i).first);
-    auto kernel_name = AnfAlgo::GetCNodeName(kernel);
-    // 2. deal all of the fusedbatchnormgrad, change the data type.
-    if (kernel_name == AnfAlgo::GetCNodeName(utils::cast<CNodePtr>(fbn2g))) {
-      auto output_num = AnfAlgo::GetOutputTensorNum(kernel);
-      for (size_t j = 0; j < output_num; j++) {
-        outputs_type.push_back(AnfAlgo::GetOutputInferDataType(kernel, j));
-        outputs_shape.push_back(AnfAlgo::GetOutputInferShape(kernel, j));
-      }
-      outputs_type[0] = kNumberTypeFloat16;
-      AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, kernel.get());
-    }
-    // 3. handle the output of fusedbatchnormgrad: tuplegetitem
-    HandleOutput(graph, kernel);
-  }
-  manager->Replace(utils::cast<CNodePtr>(dy_after), utils::cast<CNodePtr>(dy_before));
-  return node;
-}
-}  // namespace opt
-}  // namespace mindspore
--- a/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast_fusion.h
+++ b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast_fusion.h
@@ -1,56 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_
-#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_
-
-#include <memory>
-#include "backend/optimizer/common/optimizer.h"
-
-namespace mindspore {
-namespace opt {
-class ReplaceBNGradCastFusion : public PatternProcessPass {
- public:
-  explicit ReplaceBNGradCastFusion(bool multigraph = true) : PatternProcessPass("replace_bn_grad_cast", multigraph) {
-    dy_ = std::make_shared<Var>();
-    x_ = std::make_shared<Var>();
-    scale_ = std::make_shared<Var>();
-    mean_ = std::make_shared<Var>();
-    var_ = std::make_shared<Var>();
-    dx_ = std::make_shared<Var>();
-    bn_scale_ = std::make_shared<Var>();
-    bn_bias_ = std::make_shared<Var>();
-    index_ = std::make_shared<Var>();
-    reserve_ = std::make_shared<Var>();
-  }
-  ~ReplaceBNGradCastFusion() override = default;
-  const BaseRef DefinePattern() const override;
-  const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
-
- private:
-  VarPtr dy_;
-  VarPtr x_;
-  VarPtr scale_;
-  VarPtr mean_;
-  VarPtr var_;
-  VarPtr dx_;
-  VarPtr bn_scale_;
-  VarPtr bn_bias_;
-  VarPtr index_;
-  VarPtr reserve_;
-};
-}  // namespace opt
-}  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST_FUSION_H_
--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@@ -28,8 +28,6 @@
 #include "backend/optimizer/gpu/adam_fusion.h"
 #include "backend/optimizer/gpu/apply_momentum_weight_scale_fusion.h"
 #include "backend/optimizer/gpu/apply_momentum_scale_fusion.h"
-#include "backend/optimizer/gpu/replace_bn_cast_fusion.h"
-#include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h"
 #include "backend/optimizer/gpu/batch_norm_relu_fusion.h"
 #include "backend/optimizer/gpu/batch_norm_relu_grad_fusion.h"
 #include "backend/optimizer/gpu/batch_norm_add_relu_fusion.h"
@@ -82,8 +80,6 @@ void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
  auto pm = std::make_shared<opt::PassManager>();
  pm->AddPass(std::make_shared<opt::AdamWeightDecayFusion>());
  pm->AddPass(std::make_shared<opt::AdamFusion>());
-  pm->AddPass(std::make_shared<opt::ReplaceBNCastFusion>());
-  pm->AddPass(std::make_shared<opt::ReplaceBNGradCastFusion>());
  pm->AddPass(std::make_shared<opt::ReplaceMomentumCastFusion>());
  pm->AddPass(std::make_shared<opt::ReplaceAddNFusion>());
  optimizer->AddPassManager(pm);
--- a/model_zoo/official/cv/googlenet/README.md
+++ b/model_zoo/official/cv/googlenet/README.md
@@ -447,7 +447,7 @@ If you need to use the trained model to perform inference on multiple hardware p
                 Tensor(lr), cfg.momentum, weight_decay=cfg.weight_decay)
  loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False)
  model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
-                amp_level="O2", keep_batchnorm_fp32=True, loss_scale_manager=None)
+                amp_level="O2", keep_batchnorm_fp32=False, loss_scale_manager=None)
  
  # Set callbacks 
  config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 5, 
--- a/model_zoo/official/cv/googlenet/train.py
+++ b/model_zoo/official/cv/googlenet/train.py
@@ -197,12 +197,8 @@ if __name__ == '__main__':
        else:
            loss_scale_manager = FixedLossScaleManager(cfg.loss_scale, drop_overflow_update=False)

-    if device_target == "Ascend":
-        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
-                      amp_level="O2", keep_batchnorm_fp32=False, loss_scale_manager=loss_scale_manager)
-    else:  # GPU
-        model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
-                      amp_level="O2", keep_batchnorm_fp32=True, loss_scale_manager=loss_scale_manager)
+    model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
+                  amp_level="O2", keep_batchnorm_fp32=False, loss_scale_manager=loss_scale_manager)

    config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 5, keep_checkpoint_max=cfg.keep_checkpoint_max)
    time_cb = TimeMonitor(data_size=batch_num)
--- a/model_zoo/official/cv/resnet/train.py
+++ b/model_zoo/official/cv/resnet/train.py
@@ -168,7 +168,7 @@ if __name__ == '__main__':
            loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
            # Mixed precision
            model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'},
-                          amp_level="O2", keep_batchnorm_fp32=True)
+                          amp_level="O2", keep_batchnorm_fp32=False)
        else:
            ## fp32 training
            opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum, config.weight_decay)
--- a/model_zoo/official/cv/resnet_thor/train.py
+++ b/model_zoo/official/cv/resnet_thor/train.py
@@ -124,12 +124,8 @@ if __name__ == '__main__':
               filter(lambda x: 'G_inv_max' in x.name, net.get_parameters()),
               config.weight_decay, config.loss_scale)
    loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
-    if target == "Ascend":
-        model = Model(net, loss_fn=loss, optimizer=opt, amp_level='O2', loss_scale_manager=loss_scale,
-                      keep_batchnorm_fp32=False, metrics={'acc'}, frequency=config.frequency)
-    else:
-        model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale, metrics={'acc'},
-                      amp_level="O2", keep_batchnorm_fp32=True, frequency=config.frequency)
+    model = Model(net, loss_fn=loss, optimizer=opt, amp_level='O2', loss_scale_manager=loss_scale,
+                  keep_batchnorm_fp32=False, metrics={'acc'}, frequency=config.frequency)

    # define callbacks
    time_cb = TimeMonitor(data_size=step_size)
--- a/model_zoo/official/cv/yolov3_darknet53/train.py
+++ b/model_zoo/official/cv/yolov3_darknet53/train.py
@@ -215,7 +215,7 @@ def train():
        loss_scale_value = 1.0
        loss_scale = FixedLossScaleManager(loss_scale_value, drop_overflow_update=False)
        network = amp.build_train_network(network, optimizer=opt, loss_scale_manager=loss_scale,
-                                          level="O2", keep_batchnorm_fp32=True)
+                                          level="O2", keep_batchnorm_fp32=False)
        keep_loss_fp32(network)
    else:
        network = TrainingWrapper(network, opt)