add parallel op for batchnorm

This commit is contained in:
yangzhenzhang 2021-06-16 14:57:56 +08:00
parent a0c5b56f5f
commit af0d28de48
11 changed files with 556 additions and 55 deletions

View File

@ -197,6 +197,7 @@ REGISTER(TopKInfo);
REGISTER(ScatterUpdateInfo);
REGISTER(VirtualOutputInfo);
REGISTER(Conv2DInfo);
REGISTER(BatchNormInfo);
} // namespace parallel
} // namespace mindspore

View File

@ -0,0 +1,274 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "frontend/parallel/ops_info/batchnorm_info.h"
#include <algorithm>
#include <memory>
#include <utility>
#include <vector>
#include "frontend/parallel/device_matrix.h"
#include "frontend/parallel/strategy.h"
#include "frontend/parallel/tensor_layout/tensor_redistribution.h"
#include "pipeline/jit/resource.h"
namespace mindspore {
namespace parallel {
Status BatchNormInfo::GetAttrs() {
is_training_ = GetBoolAttr(IS_TRAINING);
epsilon_ = GetFloatAttr(EPSILON);
momentum_ = GetFloatAttr(MOMENTUM);
format_ = GetStringAttr(FORMAT);
if (format_ != NCHW) {
MS_LOG(ERROR) << name_ << ": The data format must be 'NCHW', but got " << format_;
return FAILED;
}
if (inputs_shape_.empty()) {
MS_LOG(ERROR) << name_ << ": The inputs shape is empty";
return FAILED;
}
if (inputs_shape_[0].size() == 2) {
input_is_4d_ = false;
} else if (inputs_shape_[0].size() == 4) {
input_is_4d_ = true;
} else {
MS_LOG(ERROR) << name_ << ": The size of input[0]'shape must be 2 or 4, but got " << inputs_shape_[0].size();
return FAILED;
}
MS_LOG(INFO) << name_ << ": The is_traing is " << is_training_ << ", epsilon is " << epsilon_ << ", momentum is "
<< momentum_ << ", data format is " << format_;
return SUCCESS;
}
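For reference, the four attributes read above correspond to the arguments of the Python-side BatchNorm primitive; a minimal usage sketch (not part of this commit, argument names assumed from the public mindspore.ops API):

from mindspore.ops import operations as P

# is_training, epsilon and momentum become the primitive attributes read by GetAttrs();
# the data format attribute defaults to "NCHW", the only layout supported here.
bn = P.BatchNorm(is_training=True, epsilon=1e-5, momentum=0.1)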
Status BatchNormInfo::CheckStrategy(const StrategyPtr &strategy) {
MS_EXCEPTION_IF_NULL(strategy);
if (CheckStrategyValue(strategy, inputs_shape_) != SUCCESS) {
MS_LOG(ERROR) << name_ << ": Invalid strategy";
return FAILED;
}
std::vector<Dimensions> stra = strategy->GetInputDim();
if (stra.size() != 5) {
MS_LOG(ERROR) << name_ << ": The size of strategy must be 5, but got " << stra.size();
return FAILED;
}
if ((stra[0].size() != 4) && (stra[0].size() != 2)) {
MS_LOG(ERROR) << name_ << ": The size of strategy[0] must be 4 or 2, but got " << stra[0].size();
return FAILED;
}
for (size_t i = 1; i < 5; ++i) {
if (stra[i].empty()) {
MS_LOG(ERROR) << name_ << ": The strategy can not be empty, the index is " << i;
return FAILED;
}
if (stra[0][1] != stra[i][0]) {
MS_LOG(ERROR) << name_ << ": Invalid strategy, the index is " << i << ", it must be equal to " << stra[0][1]
<< ", but got " << stra[i][0];
return FAILED;
}
}
return SUCCESS;
}
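In short: the strategy must cover all five inputs, the first entry must be 2-D or 4-D, and each of the four 1-D entries (scale, bias, mean, variance) must equal the channel split of the first input. A standalone Python restatement of that rule (a hypothetical helper, for illustration only):

def check_batchnorm_strategy(stra):
    # stra is a tuple of 5 per-input strategies, e.g. ((8, 1, 1, 1), (1,), (1,), (1,), (1,))
    if len(stra) != 5:
        return False
    if len(stra[0]) not in (2, 4):
        return False
    channel_split = stra[0][1]
    # scale/bias/mean/variance are 1-D over C and must share the channel split
    return all(len(s) == 1 and s[0] == channel_split for s in stra[1:])

assert check_batchnorm_strategy(((8, 1, 1, 1), (1,), (1,), (1,), (1,)))
assert check_batchnorm_strategy(((1, 8), (8,), (8,), (8,), (8,)))
assert not check_batchnorm_strategy(((2, 4, 1, 1), (1,), (1,), (1,), (1,)))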
Status BatchNormInfo::InferDevMatrixShape() {
MS_EXCEPTION_IF_NULL(strategy_);
std::vector<Dimensions> stra = strategy_->GetInputDim();
if (stra.empty()) {
MS_LOG(ERROR) << name_ << ": The strategy can not be empty";
return FAILED;
}
dev_matrix_shape_ = stra[0];
return SUCCESS;
}
Status BatchNormInfo::InferTensorMap() {
TensorMap input_tensor_map;
TensorMap in_other_tensor_map;
if (input_is_4d_) {
// if input is 4d:
// input_strategy: ((n, c, h, w), (c), (c), (c), (c))
// output_strategy: ((n, c, h, w), (c), (c), (c), (c))
// dev_matrix: (n, c, h, w)
input_tensor_map = {3, 2, 1, 0};
in_other_tensor_map = {2};
} else {
// if input is 2d:
// input_strategy: ((n, c), (c), (c), (c), (c))
// output_strategy: ((n, c), (c), (c), (c), (c))
// dev_matrix: (n, c)
input_tensor_map = {1, 0};
in_other_tensor_map = {0};
}
inputs_tensor_map_.push_back(input_tensor_map); // input
inputs_tensor_map_.push_back(in_other_tensor_map); // scale
inputs_tensor_map_.push_back(in_other_tensor_map); // bias
inputs_tensor_map_.push_back(in_other_tensor_map); // mean
inputs_tensor_map_.push_back(in_other_tensor_map); // variance
outputs_tensor_map_ = inputs_tensor_map_;
return SUCCESS;
}
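Each tensor-map entry names a device-matrix axis counted from the right (-1 means the dimension is not split), so the slice shape of every input and output follows directly from the map. A standalone Python sketch (hypothetical helper, not part of this commit):

def slice_shape(shape, tensor_map, dev_matrix):
    # Map value m refers to dev_matrix[len(dev_matrix) - 1 - m]; -1 means not split.
    out = []
    for dim, m in zip(shape, tensor_map):
        factor = 1 if m == -1 else dev_matrix[len(dev_matrix) - 1 - m]
        out.append(dim // factor)
    return out

dev_matrix = [2, 4, 1, 1]                                     # strategy of input[0]: (n=2, c=4, h=1, w=1)
print(slice_shape([32, 16, 8, 8], [3, 2, 1, 0], dev_matrix))  # input -> [16, 4, 8, 8]
print(slice_shape([16], [2], dev_matrix))                     # scale -> [4]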
Status BatchNormInfo::InferForwardCommunication() {
// if it is not training, no forward allreduce is needed
if (!is_training_) {
MS_LOG(INFO) << name_ << ": It is not training, no need for forward allreduce";
return SUCCESS;
}
TensorMap tmp_map;
if (input_is_4d_) {
// input is 4d:
// if there is no repeated calculation, the dev matrix is [n, c, h, w]
// if there is repeated calculation with the repeated num on the left of the dev matrix, the dev matrix is [repeated_num, n, c, h, w]
// if there is repeated calculation with the repeated num on the right of the dev matrix, the dev matrix is [n, c, h, w, repeated_num]
// and the forward allreduce needs to reduce over the n/h/w dimensions
if (repeated_calc_num_ == 1) {
// no repeated calculation
tmp_map = {-1, 2, -1, -1};
} else if (!repeated_num_in_dev_matrix_right_) {
// repeated calculation, repeated num on the left of the dev matrix
tmp_map = {4, -1, 2, -1, -1};
} else {
// repeated calculation, repeated num on the right of the dev matrix
tmp_map = {-1, 3, -1, -1, 0};
}
} else {
// input is 2d:
// if there is no repeated calculation, the dev matrix is [n, c]
// if there is repeated calculation with the repeated num on the left of the dev matrix, the dev matrix is [repeated_num, n, c]
// if there is repeated calculation with the repeated num on the right of the dev matrix, the dev matrix is [n, c, repeated_num]
// and the forward allreduce needs to reduce over the n dimension
if (repeated_calc_num_ == 1) {
// no repeated calculation
tmp_map = {-1, 0};
} else if (!repeated_num_in_dev_matrix_right_) {
// repeated calculation, repeated num on the left of the dev matrix
tmp_map = {2, -1, 0};
} else {
// repeated calculation, repeated num on the right of the dev matrix
tmp_map = {-1, 1, 0};
}
}
std::vector<Group> group_list;
if (CreateGroupByTensorMap(tmp_map, &group_list) != SUCCESS) {
MS_LOG(ERROR) << name_ << ": Create group failed";
return FAILED;
}
if (group_list.empty()) {
MS_LOG(INFO) << name_ << ": Forward all reduce is not required";
return SUCCESS;
} else {
MS_LOG(INFO) << name_ << ": The group name of forward all reduce is " << group_list[0].name();
}
forward_allreduce_group_ = group_list;
return SUCCESS;
}
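tmp_map keeps only the channel axis, so the allreduce group is built over the device-matrix axes that do not appear in the map, i.e. the n/h/w splits; the channel axis and the repeated-calculation axis stay out of the group. The sketch below illustrates that grouping rule in plain Python; how CreateGroupByTensorMap orders device ranks is an assumption made for this example.

from itertools import product

def allreduce_groups(dev_matrix, tensor_map):
    # Devices that share their coordinates on the axes kept in tensor_map form one group;
    # members of a group differ only along the axes absent from the map, which is where
    # the BatchNorm statistics have to be averaged.
    ndim = len(dev_matrix)
    kept_axes = [ndim - 1 - m for m in tensor_map if m != -1]  # left-to-right axis index
    groups = {}
    for rank, coord in enumerate(product(*[range(d) for d in dev_matrix])):
        key = tuple(coord[a] for a in kept_axes)
        groups.setdefault(key, []).append(rank)
    return list(groups.values())

# dev matrix (n=2, c=2, h=2, w=1) and tmp_map = (-1, 2, -1, -1): reduce over n and h
print(allreduce_groups([2, 2, 2, 1], [-1, 2, -1, -1]))  # [[0, 1, 4, 5], [2, 3, 6, 7]]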
Status BatchNormInfo::InferReplaceOps() {
if (!is_training_) {
MS_LOG(INFO) << name_ << ": It is not training, no need to replace op";
return SUCCESS;
}
if (forward_allreduce_group_.empty()) {
MS_LOG(INFO) << name_ << ": The forward allreduce group is empty, no need to replace op";
return SUCCESS;
}
ValuePtr epsilon = MakeValue(epsilon_);
ValuePtr momentum = MakeValue(momentum_);
ValuePtr group = MakeValue(forward_allreduce_group_[0].name());
ValuePtr device_num = MakeValue(forward_allreduce_group_[0].GetDevNum());
Attr attr_epsilon = std::make_pair(EPSILON, epsilon);
Attr attr_momentum = std::make_pair(MOMENTUM, momentum);
Attr attr_group = std::make_pair(GROUP, group);
Attr attr_device_num = std::make_pair(DEVICE_NUM, device_num);
OperatorAttrs attrs = {attr_epsilon, attr_momentum, attr_group, attr_device_num};
OperatorParams params;
OperatorArgs args = std::make_pair(attrs, params);
replace_op_ = {std::make_pair(SYNC_BATCH_NORM, args)};
return SUCCESS;
}
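So during training, whenever the statistics have to be averaged across devices, the BatchNorm node is swapped for a SyncBatchNorm-style op that carries the communication group. The attributes attached to the replacement boil down to the following (a plain-Python illustration; the group name is made up):

# Hypothetical attribute set of the replacement op; group and device_num come from
# the forward allreduce group created above.
replace_op_attrs = {
    "epsilon": 1e-5,
    "momentum": 0.1,
    "group": "2-5004544844489628105",  # made-up group name for illustration
    "device_num": 2,
}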
Status BatchNormInfo::InferAsLossDivisor() {
if (outputs_tensor_map_.size() != 5) {
MS_LOG(ERROR) << name_ << ": The size of outputs tensor map must be 5, but got " << outputs_tensor_map_.size();
return FAILED;
}
as_loss_divisor_ = ComputeRepeatDeviceNumByTensorMap(dev_matrix_shape_, outputs_tensor_map_[0]);
MS_LOG(INFO) << name_ << " : The dev matrix shape is " << ShapeToString(dev_matrix_shape_)
<< ", the output[0]'s tensor map is " << ShapeToString(outputs_tensor_map_[0])
<< ", as_loss_divisor_ is " << as_loss_divisor_;
return SUCCESS;
}
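as_loss_divisor_ is the number of devices that hold an identical copy of output[0]. Assuming ComputeRepeatDeviceNumByTensorMap multiplies the dev-matrix dims whose axes are not used by the tensor map, it can be sketched like this (illustration only):

def repeat_device_num(dev_matrix, tensor_map):
    ndim = len(dev_matrix)
    used = {ndim - 1 - m for m in tensor_map if m != -1}
    num = 1
    for axis, size in enumerate(dev_matrix):
        if axis not in used:
            num *= size  # this axis replicates the tensor, so it divides the loss
    return num

print(repeat_device_num([1, 8, 1, 1], [3, 2, 1, 0]))  # 1: every axis is used by the map
print(repeat_device_num([2, 4, 2], [2, 1]))           # 2: the rightmost (repeated) axis is free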
Status BatchNormInfo::SetCostUnderStrategy(const StrategyPtr &strategy) { return SetCostUnderStrategyBase(strategy); }
std::vector<StrategyPtr> BatchNormInfo::GenerateOpStrategies(int64_t stage_id) {
Strategys strategy;
if (input_is_4d_) {
strategy = {{stage_device_size_, 1, 1, 1}, {1}, {1}, {1}, {1}};
} else {
strategy = {{stage_device_size_, 1}, {1}, {1}, {1}, {1}};
}
StrategyPtr sp = std::make_shared<Strategy>(stage_id, strategy);
std::vector<StrategyPtr> sp_vector;
sp_vector.push_back(sp);
return sp_vector;
}
Status BatchNormInfo::Init(const StrategyPtr &strategy) {
if (InitWithAutoRepeatCalc(strategy) != SUCCESS) {
MS_LOG(ERROR) << name_ << ": Init failed.";
return FAILED;
}
(void)InferReplaceOps();
MS_LOG(INFO) << name_ << ": Init success.";
return SUCCESS;
}
Status BatchNormInfo::InitForCostModel(const StrategyPtr &strategy) {
if (InitForCostModelWithAutoRepeatCalc(strategy) != SUCCESS) {
MS_LOG(ERROR) << name_ << ": Init for cost model failed.";
return FAILED;
}
MS_LOG(INFO) << name_ << ": Init for cost model success.";
return SUCCESS;
}
} // namespace parallel
} // namespace mindspore

View File

@ -0,0 +1,64 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_BATCHNORM_INFO_H_
#define MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_BATCHNORM_INFO_H_
#include <string>
#include <memory>
#include <unordered_map>
#include <vector>
#include "ir/value.h"
#include "frontend/parallel/auto_parallel/operator_costmodel.h"
#include "frontend/parallel/ops_info/operator_info.h"
#include "frontend/parallel/strategy.h"
namespace mindspore {
namespace parallel {
class BatchNormInfo : public OperatorInfo {
public:
BatchNormInfo(const std::string &operator_name, const Shapes &inputs_shape, const Shapes &outputs_shape,
const PrimitiveAttrs &attrs)
: OperatorInfo(operator_name, inputs_shape, outputs_shape, attrs, std::make_shared<BatchParallelCost>()) {}
~BatchNormInfo() override = default;
Status Init(const StrategyPtr &strategy) override;
Status InitForCostModel(const StrategyPtr &strategy) override;
std::vector<StrategyPtr> GenerateOpStrategies(int64_t) override;
Status SetCostUnderStrategy(const StrategyPtr &) override;
protected:
Status GetAttrs() override;
Status CheckStrategy(const StrategyPtr &strategy) override;
Status InferForwardCommunication() override;
Status InferDevMatrixShape() override;
Status InferTensorMap() override;
Status InferReplaceOps();
Status InferAsLossDivisor() override;
private:
bool is_training_ = false;
float epsilon_ = 0.00001;
float momentum_ = 0.1;
bool input_is_4d_ = true;
std::string format_;
std::vector<Group> forward_allreduce_group_;
};
} // namespace parallel
} // namespace mindspore
#endif // MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_BATCHNORM_INFO_H_

View File

@ -28,54 +28,6 @@
namespace mindspore {
namespace parallel {
int64_t Conv2DInfo::GetIntAttr(const std::string &attr_name) {
auto attr_iter = attrs_.find(attr_name);
if (attr_iter == attrs_.end()) {
MS_LOG(ERROR) << name_ << ": Can not find the attribution of " << attr_name;
return -1;
}
MS_EXCEPTION_IF_NULL(attr_iter->second);
if (!attr_iter->second->isa<Int64Imm>()) {
MS_LOG(ERROR) << name_ << ": The value of " << attr_name << " is not int";
return -1;
}
return attr_iter->second->cast<Int64ImmPtr>()->value();
}
std::string Conv2DInfo::GetStringAttr(const std::string &attr_name) {
std::string string_attr;
auto attr_iter = attrs_.find(attr_name);
if (attr_iter == attrs_.end()) {
MS_LOG(ERROR) << name_ << ": Can not find the attribution of " << attr_name;
return string_attr;
}
MS_EXCEPTION_IF_NULL(attr_iter->second);
if (!attr_iter->second->isa<StringImm>()) {
MS_LOG(ERROR) << name_ << ": The value of " << attr_name << " is not string";
return string_attr;
}
string_attr = attr_iter->second->cast<StringImmPtr>()->value();
return string_attr;
}
std::vector<int64_t> Conv2DInfo::GetTupleAttr(const std::string &attr_name) {
std::vector<int64_t> tuple_attr;
auto tuple_attr_iter = attrs_.find(attr_name);
if (tuple_attr_iter == attrs_.end()) {
MS_LOG(ERROR) << name_ << ": Can not find the attribution of " << attr_name;
return tuple_attr;
}
MS_EXCEPTION_IF_NULL(tuple_attr_iter->second);
tuple_attr = GetValue<std::vector<int64_t>>(tuple_attr_iter->second);
return tuple_attr;
}
Status Conv2DInfo::GetAttrs() {
// out_channel
out_channel_ = GetIntAttr(OUT_CHANNEL);
@ -121,14 +73,14 @@ Status Conv2DInfo::GetAttrs() {
}
// pad_list
pad_list_ = GetTupleAttr(PAD_LIST);
pad_list_ = GetTupleIntAttr(PAD_LIST);
if (pad_list_.size() != 4) {
MS_LOG(ERROR) << name_ << ": The size of pad_list must be 4, but got " << pad_list_.size();
return FAILED;
}
// stride
stride_ = GetTupleAttr(STRIDE);
stride_ = GetTupleIntAttr(STRIDE);
if (stride_.size() != 4) {
MS_LOG(ERROR) << name_ << ": The size of stride must be 4, but got " << stride_.size();
return FAILED;
@ -141,7 +93,7 @@ Status Conv2DInfo::GetAttrs() {
}
// dilation
dilation_ = GetTupleAttr(DILATION);
dilation_ = GetTupleIntAttr(DILATION);
if (dilation_.size() != 4) {
MS_LOG(ERROR) << name_ << ": The size of dilation must be 4, but got " << dilation_.size();
return FAILED;
@ -279,7 +231,7 @@ Status Conv2DInfo::InferDevMatrixShape() {
MS_EXCEPTION_IF_NULL(strategy_);
std::vector<Dimensions> stra = strategy_->GetInputDim();
if (stra.size() != 2) {
MS_LOG(ERROR) << name_ << "The size of strategy must be 2, but got " << stra.size();
MS_LOG(ERROR) << name_ << ": The size of strategy must be 2, but got " << stra.size();
return FAILED;
}

View File

@ -49,9 +49,6 @@ class Conv2DInfo : public OperatorInfo {
Status InferForwardCommunication() override;
Status InferDevMatrixShape() override;
Status InferTensorMap() override;
int64_t GetIntAttr(const std::string &attr_name);
std::string GetStringAttr(const std::string &attr_name);
std::vector<int64_t> GetTupleAttr(const std::string &attr_name);
ReplaceGraphPtr replace_graph(const CNodePtr &cnode) override;
private:

View File

@ -1714,6 +1714,77 @@ Status OperatorInfo::GenerateStrategies(int64_t stage_id) {
return SUCCESS;
}
int64_t OperatorInfo::GetIntAttr(const std::string &attr_name) {
auto attr_iter = attrs_.find(attr_name);
if (attr_iter == attrs_.end()) {
MS_LOG(EXCEPTION) << name_ << ": Can not find the attribution of " << attr_name;
}
MS_EXCEPTION_IF_NULL(attr_iter->second);
if (!attr_iter->second->isa<Int64Imm>()) {
MS_LOG(EXCEPTION) << name_ << ": The value of " << attr_name << " is not int";
}
return attr_iter->second->cast<Int64ImmPtr>()->value();
}
bool OperatorInfo::GetBoolAttr(const std::string &attr_name) {
auto attr_iter = attrs_.find(attr_name);
if (attr_iter == attrs_.end()) {
MS_LOG(EXCEPTION) << name_ << ": Can not find the attribution of " << attr_name;
}
MS_EXCEPTION_IF_NULL(attr_iter->second);
if (!attr_iter->second->isa<BoolImm>()) {
MS_LOG(EXCEPTION) << name_ << ": The value of " << attr_name << " is not int";
}
return attr_iter->second->cast<BoolImmPtr>()->value();
}
std::string OperatorInfo::GetStringAttr(const std::string &attr_name) {
std::string string_attr;
auto attr_iter = attrs_.find(attr_name);
if (attr_iter == attrs_.end()) {
MS_LOG(EXCEPTION) << name_ << ": Can not find the attribution of " << attr_name;
}
MS_EXCEPTION_IF_NULL(attr_iter->second);
if (!attr_iter->second->isa<StringImm>()) {
MS_LOG(EXCEPTION) << name_ << ": The value of " << attr_name << " is not string";
}
string_attr = attr_iter->second->cast<StringImmPtr>()->value();
return string_attr;
}
std::vector<int64_t> OperatorInfo::GetTupleIntAttr(const std::string &attr_name) {
std::vector<int64_t> tuple_attr;
auto tuple_attr_iter = attrs_.find(attr_name);
if (tuple_attr_iter == attrs_.end()) {
MS_LOG(EXCEPTION) << name_ << ": Can not find the attribution of " << attr_name;
}
MS_EXCEPTION_IF_NULL(tuple_attr_iter->second);
tuple_attr = GetValue<std::vector<int64_t>>(tuple_attr_iter->second);
return tuple_attr;
}
float OperatorInfo::GetFloatAttr(const std::string &attr_name) {
auto attr_iter = attrs_.find(attr_name);
if (attr_iter == attrs_.end()) {
MS_LOG(EXCEPTION) << name_ << ": Can not find the attribution of " << attr_name;
}
MS_EXCEPTION_IF_NULL(attr_iter->second);
if (!attr_iter->second->isa<FP32Imm>()) {
MS_LOG(EXCEPTION) << name_ << ": The value of " << attr_name << " is not float";
}
return attr_iter->second->cast<FP32ImmPtr>()->value();
}
std::vector<ValuePtr> GetValueSequeue(const ValuePtr &sequeue) {
MS_EXCEPTION_IF_NULL(sequeue);
std::vector<ValuePtr> ret;

View File

@ -214,6 +214,11 @@ class OperatorInfo {
Status InferSliceShape(const Strategys &inputs_strategy, const Strategys &outputs_strategy,
Shapes *inputs_slice_shape, Shapes *outputs_slice_shape);
void BreakingTiesForPerferringDataParallel(const StrategyPtr &, const CostPtr &);
int64_t GetIntAttr(const std::string &attr_name);
bool GetBoolAttr(const std::string &attr_name);
float GetFloatAttr(const std::string &attr_name);
std::string GetStringAttr(const std::string &attr_name);
std::vector<int64_t> GetTupleIntAttr(const std::string &attr_name);
std::string name_;
Shapes inputs_shape_;

View File

@ -56,5 +56,6 @@
#include "frontend/parallel/ops_info/scatter_update_info.h"
#include "frontend/parallel/ops_info/virtual_output_info.h"
#include "frontend/parallel/ops_info/conv2d_info.h"
#include "frontend/parallel/ops_info/batchnorm_info.h"
#endif // MINDSPORE_CCSRC_FRONTEND_PARALLEL_OPS_INFO_HEAD_FILES_H_

View File

@ -197,6 +197,10 @@ constexpr char STRIDE[] = "stride";
constexpr char DILATION[] = "dilation";
constexpr char FORMAT[] = "format";
constexpr char NCHW[] = "NCHW";
constexpr char IS_TRAINING[] = "is_training";
constexpr char EPSILON[] = "epsilon";
constexpr char MOMENTUM[] = "momentum";
constexpr char DEVICE_NUM[] = "device_num";
// Operator
constexpr char VIRTUAL_DIV[] = "_VirtualDiv";
@ -267,6 +271,7 @@ constexpr char CONV2D[] = "Conv2D";
constexpr char FUSE_BATCH_NORM[] = "FusedBatchNorm";
constexpr char FUSE_BATCH_NORM_EX[] = "FusedBatchNormEx";
constexpr char BATCH_NORM[] = "BatchNorm";
constexpr char SYNC_BATCH_NORM[] = "SyncBatchNorm";
constexpr char LAYER_NORM[] = "LayerNorm";
constexpr char POOLING[] = "Pooling";
constexpr char CAST[] = "Cast";

View File

@ -787,9 +787,11 @@ std::vector<AnfNodePtr> ReplaceOpInput(const Operator &replace_op, const std::st
MS_LOG(EXCEPTION) << "Failure: " << node->ToString() << " size is smaller than 2";
}
std::vector<AnfNodePtr> replace_input = {NewValueNode(pyop_instance), node->input(1)};
if (replace_op.first == EMBEDDING_LOOKUP) {
replace_input = {NewValueNode(pyop_instance), node->input(1), node->input(2)};
}
if (!params.empty()) {
Param param_first = *(params.begin());
int64_t first_position = param_first.second;
@ -804,6 +806,10 @@ std::vector<AnfNodePtr> ReplaceOpInput(const Operator &replace_op, const std::st
int64_t position = param.second;
(void)replace_input.insert(replace_input.begin() + position, val);
}
} else if (replace_op.first == SYNC_BATCH_NORM) {
for (size_t i = 2; i < node->inputs().size(); ++i) {
replace_input.push_back(node->input(i));
}
}
return replace_input;

View File

@ -0,0 +1,125 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import mindspore as ms
from mindspore import context, Tensor, Parameter
from mindspore.common.api import _executor
from mindspore.nn import Cell, TrainOneStepCell, Momentum, BatchNorm2d, BatchNorm1d
from mindspore.ops import operations as P
class Net(Cell):
def __init__(self, conv2d_weight, out_channel, kernel_size, pad_mode, stride,
strategy1=None, strategy2=None):
super().__init__()
self.conv2d = P.Conv2D(out_channel=out_channel, kernel_size=kernel_size,
pad_mode=pad_mode, stride=stride).shard(strategy1)
self.conv2d_weight = Parameter(conv2d_weight, "w1")
self.bn = BatchNorm2d(8)
self.bn.bn_train.shard(strategy2)
def construct(self, x, b):
out = self.conv2d(x, self.conv2d_weight)
out = self.bn(out)
return out
_x = Tensor(np.ones([32, 16, 8, 8]), dtype=ms.float32)
_w1 = Tensor(np.ones([8, 16, 2, 2]), dtype=ms.float32)
_b = Tensor(np.ones([32, 16, 8, 8]), dtype=ms.float32)
def compile_net(net):
optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
train_net = TrainOneStepCell(net, optimizer)
train_net.set_auto_parallel()
train_net.set_train()
_executor.compile(train_net, _x, _b)
context.reset_auto_parallel_context()
def test_batchnorm_data_parallel():
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
strategy1 = ((8, 1, 1, 1), (1, 1, 1, 1))
strategy2 = ((8, 1, 1, 1), (1,), (1,), (1,), (1,))
net = Net(_w1, out_channel=8, kernel_size=2, pad_mode="same", stride=1, strategy1=strategy1, strategy2=strategy2)
compile_net(net)
def test_batchnorm_model_parallel1():
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
strategy1 = ((2, 2, 1, 1), (2, 2, 1, 1))
strategy2 = ((2, 1, 2, 2), (1,), (1,), (1,), (1,))
net = Net(_w1, out_channel=8, kernel_size=2, pad_mode="same", stride=1, strategy1=strategy1, strategy2=strategy2)
compile_net(net)
def test_batchnorm_model_parallel2():
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=32, global_rank=0)
strategy1 = ((2, 2, 2, 2), (2, 2, 1, 1))
strategy2 = ((1, 8, 1, 1), (8,), (8,), (8,), (8,))
net = Net(_w1, out_channel=8, kernel_size=2, pad_mode="same", stride=2, strategy1=strategy1, strategy2=strategy2)
compile_net(net)
class Net2(Cell):
def __init__(self, strategy1=None, strategy2=None):
super().__init__()
self.bn = BatchNorm1d(8)
self.bn.bn_train.shard(strategy1)
self.relu = P.ReLU().shard(strategy2)
def construct(self, x, b):
out = self.bn(x)
out = self.relu(out)
return out
_x1 = Tensor(np.ones([32, 8]), dtype=ms.float32)
_b1 = Tensor(np.ones([32, 8]), dtype=ms.float32)
def compile_net2(net):
optimizer = Momentum(net.trainable_params(), learning_rate=0.1, momentum=0.9)
train_net = TrainOneStepCell(net, optimizer)
train_net.set_auto_parallel()
train_net.set_train()
_executor.compile(train_net, _x1, _b1)
context.reset_auto_parallel_context()
def test_batchnorm1d_data_parallel():
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
strategy1 = ((8, 1), (1,), (1,), (1,), (1,))
strategy2 = ((8, 1),)
net = Net2(strategy1=strategy1, strategy2=strategy2)
compile_net2(net)
def test_batchnorm1d_model_parallel1():
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=8, global_rank=0)
strategy1 = ((1, 8), (8,), (8,), (8,), (8,))
strategy2 = ((1, 8),)
net = Net2(strategy1=strategy1, strategy2=strategy2)
compile_net2(net)
def test_batchnorm1d_model_parallel2():
context.set_auto_parallel_context(parallel_mode="semi_auto_parallel", device_num=32, global_rank=0)
strategy1 = ((2, 4), (4,), (4,), (4,), (4,))
strategy2 = ((2, 4),)
net = Net2(strategy1=strategy1, strategy2=strategy2)
compile_net2(net)