diff --git a/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast2_fusion.cc b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast2_fusion.cc
deleted file mode 100644
index a5b2a92bb7..0000000000
--- a/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast2_fusion.cc
+++ /dev/null
@@ -1,88 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h"
-
-#include <memory>
-#include <vector>
-#include <string>
-
-#include "backend/session/anf_runtime_algorithm.h"
-#include "ir/primitive.h"
-#include "utils/utils.h"
-#include "backend/optimizer/common/helper.h"
-
-namespace mindspore {
-namespace opt {
-const BaseRef ReplaceBNGradCast2Fusion::DefinePattern() const {
-  VectorRef fbn2g = VectorRef({prim::kPrimFusedBatchNormGrad, dy_, x_, scale_, mean_, var_});
-  VectorRef tupleget = VectorRef({prim::kPrimTupleGetItem, fbn2g, index_});
-  VectorRef out_cast = VectorRef({prim::kPrimCast, tupleget});
-  return out_cast;
-}
-
-const AnfNodePtr ReplaceBNGradCast2Fusion::Process(const FuncGraphPtr &graph, const AnfNodePtr &node,
-                                                   const EquivPtr &equiv) const {
-  MS_EXCEPTION_IF_NULL(graph);
-  MS_EXCEPTION_IF_NULL(node);
-  MS_EXCEPTION_IF_NULL(equiv);
-  auto tuple = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(node), 0);
-  auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 1);
-  MS_EXCEPTION_IF_NULL(index_node);
-  auto value_node = index_node->cast<ValueNodePtr>();
-  MS_EXCEPTION_IF_NULL(value_node);
-  int item_idx = GetValue<int>(value_node->value());
-  if (item_idx != 0) {
-    return nullptr;
-  }
-  auto fbn2g = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(tuple), 0);
-
-  auto dy_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 0);
-  auto x_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 1);
-
-  auto scale = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 2);
-  auto mean = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 3);
-  auto var = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 4);
-
-  MS_EXCEPTION_IF_NULL(fbn2g);
-  MS_EXCEPTION_IF_NULL(dy_);
-  MS_EXCEPTION_IF_NULL(scale);
-  MS_EXCEPTION_IF_NULL(x_);
-  MS_EXCEPTION_IF_NULL(mean);
-  MS_EXCEPTION_IF_NULL(var);
-
-  auto manager = graph->manager();
-  MS_EXCEPTION_IF_NULL(manager);
-  manager->Replace(utils::cast<CNodePtr>(node), utils::cast<CNodePtr>(tuple));
-  std::vector<TypeId> outputs_type;
-  std::vector<std::vector<size_t>> outputs_shape;
-  auto output_num = AnfAlgo::GetOutputTensorNum(fbn2g);
-  for (size_t i = 0; i < output_num; i++) {
-    outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i));
-    outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i));
-  }
-  outputs_type[0] = AnfAlgo::GetPrevNodeOutputInferDataType(fbn2g, 0);
-  AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get());
-
-  outputs_type.clear();
-  outputs_shape.clear();
-  outputs_type.push_back(AnfAlgo::GetPrevNodeOutputInferDataType(fbn2g, 0));
-  outputs_shape.push_back(AnfAlgo::GetOutputInferShape(tuple, 0));
-  AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, tuple.get());
-
-  return tuple;
-}
-}  // namespace opt
-}  // namespace mindspore
diff --git a/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h
deleted file mode 100644
index fcb56be712..0000000000
--- a/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/**
- * Copyright 2020 Huawei Technologies Co., Ltd
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#ifndef MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_
-#define MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_
-
-#include <memory>
-#include "backend/optimizer/common/optimizer.h"
-
-namespace mindspore {
-namespace opt {
-class ReplaceBNGradCast2Fusion : public PatternProcessPass {
- public:
-  explicit ReplaceBNGradCast2Fusion(bool multigraph = true) : PatternProcessPass("replace_grad_cast2", multigraph) {
-    dy_ = std::make_shared<Var>();
-    x_ = std::make_shared<Var>();
-    scale_ = std::make_shared<Var>();
-    mean_ = std::make_shared<Var>();
-    var_ = std::make_shared<Var>();
-    dx_ = std::make_shared<Var>();
-    bn_scale_ = std::make_shared<Var>();
-    bn_bias_ = std::make_shared<Var>();
-    index_ = std::make_shared<Var>();
-  }
-  ~ReplaceBNGradCast2Fusion() override = default;
-  const BaseRef DefinePattern() const override;
-  const AnfNodePtr Process(const FuncGraphPtr &, const AnfNodePtr &, const EquivPtr &) const override;
-
- private:
-  VarPtr dy_;
-  VarPtr x_;
-  VarPtr scale_;
-  VarPtr mean_;
-  VarPtr var_;
-  VarPtr dx_;
-  VarPtr bn_scale_;
-  VarPtr bn_bias_;
-  VarPtr index_;
-};
-}  // namespace opt
-}  // namespace mindspore
-#endif  // MINDSPORE_CCSRC_BACKEND_OPTIMIZER_GPU_REPLACE_BN_GRAD_CAST2_FUSION_H_
diff --git a/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast_fusion.cc b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast_fusion.cc
index 37bb0d96ad..4e1be81ab7 100644
--- a/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast_fusion.cc
+++ b/mindspore/ccsrc/backend/optimizer/gpu/replace_bn_grad_cast_fusion.cc
@@ -45,7 +45,7 @@ const AnfNodePtr ReplaceBNGradCastFusion::Process(const FuncGraphPtr &graph, con
   auto dy_before = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(dy_after), 0);
   auto x_ = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(fbn2g), 1);
   auto x_type = AnfAlgo::GetOutputInferDataType(x_, 0);
-  // if x_type is fp32, the cast is nessery.
+  // if x_type is fp32, the cast is necessary.
   if (x_type == kNumberTypeFloat32) {
     return nullptr;
   }
@@ -65,35 +65,45 @@ const AnfNodePtr ReplaceBNGradCastFusion::Process(const FuncGraphPtr &graph, con
   auto manager = graph->manager();
   MS_EXCEPTION_IF_NULL(manager);
 
-  auto outlist = GetRealNodeUsedList(graph, fbn2g);
-  for (size_t i = 0; i < outlist->size(); i++) {
-    auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(outlist->at(i).first), 1);
-    auto value_node = index_node->cast<ValueNodePtr>();
-    MS_EXCEPTION_IF_NULL(value_node);
-    int item_idx = GetValue<int>(value_node->value());
-    if (item_idx == 0) {
-      auto cast = GetRealNodeUsedList(graph, outlist->at(i).first);
-      if (AnfAlgo::GetCNodeName(cast->at(0).first) != "Cast") {
-        return nullptr;
+  // 1. Get all of the FusedBatchNormGrad nodes that consume dy_after.
+  auto fbn2g_all = GetRealNodeUsedList(graph, dy_after);
+  for (size_t i = 0; i < fbn2g_all->size(); i++) {
+    outputs_type.clear();
+    outputs_shape.clear();
+    auto kernel = utils::cast<CNodePtr>(fbn2g_all->at(i).first);
+    auto kernel_name = AnfAlgo::GetCNodeName(kernel);
+    // 2. For each FusedBatchNormGrad node, change the inferred data type of output 0 to float16.
+    if (kernel_name == AnfAlgo::GetCNodeName(utils::cast<CNodePtr>(fbn2g))) {
+      auto output_num = AnfAlgo::GetOutputTensorNum(kernel);
+      for (size_t j = 0; j < output_num; j++) {
+        outputs_type.push_back(AnfAlgo::GetOutputInferDataType(kernel, j));
+        outputs_shape.push_back(AnfAlgo::GetOutputInferShape(kernel, j));
+      }
+      outputs_type[0] = kNumberTypeFloat16;
+      AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, kernel.get());
+    }
+    // 3. Handle the TupleGetItem users: replace the Cast that follows item 0 and mark item 0 as float16.
+    auto outlist = GetRealNodeUsedList(graph, kernel);
+    for (size_t j = 0; j < outlist->size(); j++) {
+      outputs_type.clear();
+      outputs_shape.clear();
+      auto index_node = AnfAlgo::GetInputNode(utils::cast<CNodePtr>(outlist->at(j).first), 1);
+      auto value_node = index_node->cast<ValueNodePtr>();
+      MS_EXCEPTION_IF_NULL(value_node);
+      int item_idx = GetValue<int>(value_node->value());
+      if (item_idx == 0) {
+        auto cast = GetRealNodeUsedList(graph, outlist->at(j).first);
+        if (AnfAlgo::GetCNodeName(cast->at(0).first) != "Cast") {
+          continue;
+        }
+        manager->Replace(utils::cast<CNodePtr>(cast->at(0).first), utils::cast<CNodePtr>(outlist->at(j).first));
+        outputs_type.push_back(kNumberTypeFloat16);
+        outputs_shape.push_back(AnfAlgo::GetOutputInferShape(outlist->at(j).first, 0));
+        AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, outlist->at(j).first.get());
       }
-      manager->Replace(utils::cast<CNodePtr>(cast->at(0).first), utils::cast<CNodePtr>(outlist->at(i).first));
-      outputs_type.push_back(kNumberTypeFloat16);
-      outputs_shape.push_back(AnfAlgo::GetOutputInferShape(outlist->at(i).first, 0));
-      AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, outlist->at(i).first.get());
     }
   }
-  outputs_type.clear();
-  outputs_shape.clear();
   manager->Replace(utils::cast<CNodePtr>(dy_after), utils::cast<CNodePtr>(dy_before));
-
-  auto output_num = AnfAlgo::GetOutputTensorNum(fbn2g);
-  for (size_t i = 0; i < output_num; i++) {
-    outputs_type.push_back(AnfAlgo::GetOutputInferDataType(fbn2g, i));
-    outputs_shape.push_back(AnfAlgo::GetOutputInferShape(fbn2g, i));
-  }
-  outputs_type[0] = kNumberTypeFloat16;
-  AnfAlgo::SetOutputInferTypeAndShape(outputs_type, outputs_shape, fbn2g.get());
-
   return node;
 }
 }  // namespace opt
diff --git a/mindspore/ccsrc/backend/session/gpu_session.cc b/mindspore/ccsrc/backend/session/gpu_session.cc
index 2a5929f383..9d0a140108 100644
--- a/mindspore/ccsrc/backend/session/gpu_session.cc
+++ b/mindspore/ccsrc/backend/session/gpu_session.cc
@@ -28,7 +28,6 @@
 #include "backend/optimizer/gpu/adam_fusion.h"
 #include "backend/optimizer/gpu/replace_bn_cast_fusion.h"
 #include "backend/optimizer/gpu/replace_bn_grad_cast_fusion.h"
-#include "backend/optimizer/gpu/replace_bn_grad_cast2_fusion.h"
 #include "backend/optimizer/gpu/replace_momentum_cast_fusion.h"
 #include "backend/optimizer/gpu/replace_addn_fusion.h"
 #include "runtime/device/kernel_runtime_manager.h"
@@ -68,7 +67,6 @@ void GPUSession::Optimize(const std::shared_ptr<KernelGraph> &kernel_graph) {
   pm->AddPass(std::make_shared<opt::AdamFusion>());
   pm->AddPass(std::make_shared<opt::ReplaceBNCastFusion>());
   pm->AddPass(std::make_shared<opt::ReplaceBNGradCastFusion>());
-  pm->AddPass(std::make_shared<opt::ReplaceBNGradCast2Fusion>());
   pm->AddPass(std::make_shared<opt::ReplaceMomentumCastFusion>());
   pm->AddPass(std::make_shared<opt::ReplaceAddNFusion>());
   optimizer->AddPassManager(pm);