!7240 auto parallel support reshape redistribution in all scenes
Merge pull request !7240 from yao_yf/reshape_redistribution_all_scene_support

commit 2801429db0
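For orientation, a minimal sketch (not part of the commit) of one scene this change enables, adapted from test_reshape_unexpand_5 in the test file added below; it compiles the forward cell directly instead of using the GradWrap/NetWithLoss helpers from that file:

import numpy as np
import mindspore as ms
import mindspore.nn as nn
from mindspore import Tensor, context
from mindspore.common.api import _executor
from mindspore.ops import operations as P

# The reshape input is sharded as (2, 2, 1) while its output is sharded as
# (1, 4); the two tensor shapes cannot be unified by expansion, so (as the
# test name suggests) the new un-expand redistribution path is exercised.
class ReshapeScene(nn.Cell):
    def __init__(self):
        super().__init__()
        self.reshape = P.Reshape()
        self.relu1 = P.ReLU().shard(((2, 2, 1),))
        self.relu2 = P.ReLU().shard(((1, 4),))

    def construct(self, data):
        x = self.relu1(data)
        x = self.reshape(x, (3, 4))
        return self.relu2(x)

context.set_auto_parallel_context(device_num=4, global_rank=0,
                                  parallel_mode="semi_auto_parallel")
net = ReshapeScene()
net.set_auto_parallel()
_executor.compile(net, Tensor(np.ones([2, 2, 3]), dtype=ms.float32))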
@@ -488,10 +488,7 @@ Status ReshapeInfo::GenetateStrategyCosts(const std::vector<std::shared_ptr<Stra
}
TensorInfo next_in_tensor_info = next_in_tensor_infos[in_index];
SetOutputLayout(next_in_tensor_info.tensor_layout());
if (Init(nullptr) == FAILED) {
MS_LOG(DEBUG) << "Failure:operator reshape init failed";
continue;
}
InferTensorInfoByLayout();
SetCostForReshape(reshape_stra);
}
}
@@ -63,6 +63,14 @@ std::shared_ptr<ReshapeLayoutTransfer> RedistributionLayoutTransfer::UnifyDevice
if (unified_device_arrangement_ptr == nullptr) {
return nullptr;
}
Shape in_expand_shape;
Status status = ExpandShape(unified_device_arrangement_ptr->from_in().tensor_shape().array(),
unified_device_arrangement_ptr->to_in().tensor_shape().array(), &in_expand_shape);
if (status != Status::SUCCESS) {
MS_LOG(INFO) << "The shape of from and to cannot transfer by unify";
unified_device_arrangement_ptr->SetExpandAble(false);
return unified_device_arrangement_ptr;
}
return unified_device_arrangement_ptr->UnifyDeviceArrangementAndTensorShape();
}
} // namespace parallel
@@ -35,12 +35,15 @@ class ReshapeLayoutTransfer : public LayoutTransfer {
std::shared_ptr<ReshapeLayoutTransfer> ExpandFromTensorShapeAndExpandToDeviceArrangement(
const Arrangement &expand_shape) const;
std::shared_ptr<ReshapeLayoutTransfer> ExchangeFromAndTo() const;
+ bool ExpandAble() const { return is_expand_able_; }
+ bool FromTensorShapeCanBeExpandByTo() const;
+ bool ToTensorShapeCanBeExpandByFrom() const;
+ void SetExpandAble(const bool is_expand_able) { is_expand_able_ = is_expand_able; }

private:
Status CheckValidTransfer() override;
std::shared_ptr<Arrangement> ComputeExpandedFromTensorShapeByTo() const;
- bool FromTensorShapeCanBeExpandByTo() const;
- bool ToTensorShapeCanBeExpandByFrom() const;
+ bool is_expand_able_ = true;
};
} // namespace parallel
} // namespace mindspore
@@ -97,11 +97,11 @@ Status AccumulateProductReverseToShape(const Shape &shape_accum_reverse, Shape *
int64_t value = 1;
for (auto iter = shape_accum_reverse.end() - 1; iter >= shape_accum_reverse.begin(); --iter) {
if (*iter == 0) {
- MS_LOG(ERROR) << "element of shape_accum should not be zero";
+ MS_LOG(WARNING) << "element of shape_accum should not be zero";
return Status::FAILED;
}
if ((*iter) % value != 0) {
- MS_LOG(ERROR) << "shape_accum is not a accumulate product in ascending order";
+ MS_LOG(WARNING) << "shape_accum is not a accumulate product in ascending order";
return Status::FAILED;
}
(void)shape->insert(shape->begin(), static_cast<int64_t>((*iter) / value));
@@ -390,6 +390,15 @@ TensorLayout TensorLayout::SqueezeShape() const {
return out;
}

TensorLayout TensorLayout::TransferRepeatLayout() const {
Shape dev_mat(device_arrangement_.array());
Shape tensor_map(tensor_map_.GetDimSize(), -1);
Shape tensor_shape(tensor_shape_.array());
TensorLayout repeat;
repeat.InitFromVector(dev_mat, tensor_map, tensor_shape);
return repeat;
}

// Generate a totally shard tensor slice shape for parallel optimizer
Status TensorLayout::GenerateOptShardSliceShape() {
MS_LOG(INFO) << "layout for GetOptShardSliceShape is " << StandardToString();
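As an aside on TransferRepeatLayout above: it keeps the device arrangement and tensor shape but sets every tensor map entry to -1, i.e. no tensor dimension is split and the tensor is replicated ("repeated") across the whole device arrangement. A stand-alone Python sketch of that mapping (hypothetical helper, not MindSpore API):

# Hypothetical sketch of what TransferRepeatLayout computes; the real code
# builds a TensorLayout via InitFromVector.
def transfer_repeat_layout(dev_mat, tensor_map, tensor_shape):
    # keep dev_mat and tensor_shape, but mark every tensor dimension as
    # unsplit (-1), so every device holds the whole tensor
    repeat_map = [-1] * len(tensor_map)
    return dev_mat, repeat_map, tensor_shape

# e.g. a [128, 96] tensor split over dev_mat [4, 2] with tensor_map [1, 0]
# becomes fully replicated:
print(transfer_repeat_layout([4, 2], [1, 0], [128, 96]))
# -> ([4, 2], [-1, -1], [128, 96])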
@@ -88,6 +88,8 @@ class TensorLayout {

TensorLayout SqueezeShape() const;

TensorLayout TransferRepeatLayout() const;

Status GenerateOptShardSliceShape();

Shape opt_shard_slice_shape() { return opt_shard_slice_shape_; }
@@ -39,6 +39,42 @@ Status TensorRedistribution::Init(const TensorLayout &from, const TensorLayout &
return Status::SUCCESS;
}

RedistributionOpListPtr TensorRedistribution::InferTensorRedistributionOperatorListUnExpand(bool is_cost_model) {
TensorLayout from_repeat = from_origin_.TransferRepeatLayout();
TensorLayout to_repeat = to_origin_.TransferRepeatLayout();
MS_LOG(DEBUG) << "reshape from_repeat " << from_repeat.ToString();
MS_LOG(DEBUG) << "reshape to_layout " << to_repeat.ToString();
MS_LOG(DEBUG) << "reshape from_origin_ " << from_origin_.ToString();
MS_LOG(DEBUG) << "reshape to_origin_ " << to_origin_.ToString();
MS_LOG(DEBUG) << "reshape from_ " << from_.ToString();
MS_LOG(DEBUG) << "reshape to_ " << to_.ToString();
OperatorVector operator_vector;
OutPutInfoVector output_info_vector;
if (InferRedistribution(from_origin_, from_repeat, &operator_vector, &output_info_vector, is_cost_model) ==
Status::FAILED) {
return nullptr;
}
if (from_repeat.slice_shape().array() != to_repeat.slice_shape().array()) {
reshape_flag_ = true;
ConstructOperator constructor;
constructor.UpdateTensorShape(from_repeat.slice_shape().array());
Arrangement shape = to_repeat.slice_shape();
MS_LOG(DEBUG) << "reshape " << shape.ToString();
if (constructor.ReshapeOP(shape.array()) == Status::FAILED) {
return nullptr;
} else {
(void)operator_vector.push_back(constructor.GetOperator());
(void)output_info_vector.push_back(std::make_pair(false, 0));
}
}
if (InferRedistribution(to_repeat, to_origin_, &operator_vector, &output_info_vector, is_cost_model) ==
Status::FAILED) {
return nullptr;
}
return std::make_shared<std::pair<OperatorVector, OutPutInfoVector>>(
std::make_pair(operator_vector, output_info_vector));
}

RedistributionOpListPtr TensorRedistribution::InferTensorRedistributionOperatorList(bool is_cost_model) {
// Step 1: Match device arrangement between from_ and to_
RedistributionLayoutTransfer layout_transfer;
@@ -51,6 +87,10 @@ RedistributionOpListPtr TensorRedistribution::InferTensorRedistributionOperatorL
MS_LOG(ERROR) << "Infer tensor layout return nullptr!";
return nullptr;
}
if (!ptr->ExpandAble()) {
expand_able_ = false;
return InferTensorRedistributionOperatorListUnExpand(is_cost_model);
}
TensorLayout from_layout = ptr->from_in();
TensorLayout to_layout = ptr->to_in();
MS_LOG(DEBUG) << "reshape from_layout " << from_layout.ToString();
@@ -61,27 +101,17 @@ RedistributionOpListPtr TensorRedistribution::InferTensorRedistributionOperatorL
MS_LOG(DEBUG) << "reshape to_ " << to_.ToString();
// Step 2: Infer redistribution and insert operators
- RedistributionOperatorInfer operator_infer(construct_op_flag_);
- if (operator_infer.Init(from_layout, to_layout.tensor_map(), dev_list_, is_cost_model) == Status::FAILED) {
- MS_LOG(ERROR) << "Init operatorInfer failed!";
- return nullptr;
- }
OperatorVector operator_vector;
OutPutInfoVector output_info_vector;
- if (operator_infer.InferRedistributionOperator() != Status::SUCCESS) {
- MS_LOG(ERROR) << "Infer redistribution failed!";
+ if (InferRedistribution(from_layout, to_layout, &operator_vector, &output_info_vector, is_cost_model) !=
+ Status::SUCCESS) {
return nullptr;
- } else {
- operator_vector = operator_infer.operator_vector();
- output_info_vector = operator_infer.output_info_vector();
- operator_list_ = operator_infer.operator_list();
}

// Step 3: Infer reshape and insert operators
if (InferReshape(from_layout, to_layout, &operator_vector, &output_info_vector) != Status::SUCCESS) {
MS_LOG(ERROR) << "Construct Reshape operator failed!";
return nullptr;
}

return std::make_shared<std::pair<OperatorVector, OutPutInfoVector>>(
std::make_pair(operator_vector, output_info_vector));
}
@@ -136,6 +166,31 @@ Status TensorRedistribution::InferReshape(const TensorLayout &from_layout, const
return Status::SUCCESS;
}

Status TensorRedistribution::InferRedistribution(const TensorLayout &from_layout, const TensorLayout &to_layout,
OperatorVector *const operator_vector,
OutPutInfoVector *const output_info_vector, bool is_cost_model) {
RedistributionOperatorInfer operator_infer(construct_op_flag_);
if (operator_infer.Init(from_layout, to_layout.tensor_map(), dev_list_, is_cost_model) == Status::FAILED) {
MS_LOG(ERROR) << "Init operatorInfer failed";
return Status::FAILED;
}
if (operator_infer.InferRedistributionOperator() != Status::SUCCESS) {
MS_LOG(ERROR) << "Infer redistribution failed";
return Status::FAILED;
} else {
for (auto op : operator_infer.operator_vector()) {
operator_vector->insert(operator_vector->end(), op);
}
for (auto info : operator_infer.output_info_vector()) {
output_info_vector->insert(output_info_vector->end(), info);
}
for (auto opc : operator_infer.operator_list()) {
operator_list_.insert(operator_list_.end(), opc);
}
}
return Status::SUCCESS;
}

Status TensorRedistribution::ComputeCost() {
RedistributionOpListPtr redistribution_oplist_ptr = InferTensorRedistributionOperatorList(true);
if (redistribution_oplist_ptr == nullptr) {
@@ -162,8 +217,13 @@ Status TensorRedistribution::ComputeCost() {
}
}
if (reshape_flag()) {
- Shape prev_slice_shape = from_.slice_shape().array();
- double prev_prod = std::accumulate(prev_slice_shape.begin(), prev_slice_shape.end(), 1, std::multiplies<int>());
+ Shape prev_shape;
+ if (expand_able_) {
+ prev_shape = from_.slice_shape().array();
+ } else {
+ prev_shape = from_.tensor_shape().array();
+ }
+ double prev_prod = std::accumulate(prev_shape.begin(), prev_shape.end(), 1, std::multiplies<int>());
computation_cost_ += 2.0 * prev_prod;
memory_cost_ += 2.0 * prev_prod;
}
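Worked illustration (not part of the commit) of the cost change above: when the layouts cannot be unified by expansion (expand_able_ is false), the reshape cost term is computed from the full tensor shape instead of the local slice shape. A small Python sketch with hypothetical names:

from functools import reduce
import operator

# Hypothetical sketch of the reshape cost term added in ComputeCost():
# expandable case uses the local slice shape, un-expand case the full shape.
def reshape_cost(expand_able, slice_shape, tensor_shape):
    prev_shape = slice_shape if expand_able else tensor_shape
    prev_prod = reduce(operator.mul, prev_shape, 1)
    return 2.0 * prev_prod  # added to both computation_cost_ and memory_cost_

print(reshape_cost(True, [32, 96], [128, 96]))   # 6144.0
print(reshape_cost(False, [32, 96], [128, 96]))  # 24576.0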
@@ -61,8 +61,12 @@ class TensorRedistribution {
private:
Status InferReshape(const TensorLayout &from_layout, const TensorLayout &to_layout,
OperatorVector *const operator_vector, OutPutInfoVector *const output_info_vector);
Status InferRedistribution(const TensorLayout &from_layout, const TensorLayout &to_layout,
OperatorVector *const operator_vector, OutPutInfoVector *const output_info_vector,
bool is_cost_model);
Status ComputeConcatCost(double input_size, Shape attrs);
Status ComputePermuteCost(double input_size, Shape attrs);
RedistributionOpListPtr InferTensorRedistributionOperatorListUnExpand(bool is_cost_model = false);
TensorLayout from_origin_;
TensorLayout to_origin_;
TensorLayout from_;
@@ -84,6 +88,7 @@ class TensorRedistribution {
double memory_cost_;
bool construct_op_flag_;
bool keep_reshape_;
bool expand_able_ = true;
};
} // namespace parallel
} // namespace mindspore
@@ -0,0 +1,206 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import numpy as np

import mindspore as ms
import mindspore.nn as nn
from mindspore import Tensor
from mindspore import context
from mindspore.common.api import _executor
from mindspore.common.parameter import Parameter
from mindspore.ops import composite as C
from mindspore.ops import operations as P
from tests.ut.python.ops.test_math_ops import VirtualLoss


grad_all = C.GradOperation(get_all=True)


class NetWithLoss(nn.Cell):
    def __init__(self, network):
        super(NetWithLoss, self).__init__()
        self.loss = VirtualLoss()
        self.network = network

    def construct(self, x):
        predict = self.network(x)
        return self.loss(predict)


class GradWrap(nn.Cell):
    def __init__(self, network):
        super(GradWrap, self).__init__()
        self.network = network

    def construct(self, x):
        return grad_all(self.network)(x)

def test_reshape_unexpand():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.reshape = P.Reshape()
            self.mul = P.Mul().shard(((1, 8), (1, 1, 8)))
            self.mul_weight = Parameter(Tensor(np.ones([96, 128]), dtype=ms.float32), name="weight")

        def construct(self, x):
            weight = self.reshape(self.mul_weight, (1, 128, 96))
            out = self.mul(x, weight)
            return out

    size = 8
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    x = Tensor(np.ones([128, 96]), dtype=ms.float32)

    net = GradWrap(NetWithLoss(Net()))
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
    net.set_auto_parallel()
    _executor.compile(net, x)

def test_reshape_unexpand_1():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.reshape = P.Reshape()
            self.mul = P.Mul().shard(((1, 8), (1, 1, 8)))
            self.mul_weight = Parameter(Tensor(np.ones([96, 128]), dtype=ms.float32), name="weight")

        def construct(self, x):
            weight = self.reshape(self.mul_weight, (1, 128, 96))
            out = self.mul(x, weight)
            return out

    size = 8
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    x = Tensor(np.ones([128, 96]), dtype=ms.float32)

    net = GradWrap(NetWithLoss(Net()))
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
    net.set_auto_parallel()
    _executor.compile(net, x)

def test_reshape_unexpand_2():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.reshape = P.Reshape()
            self.mul = P.Mul().shard(((1, 4, 2), (4, 2)))
            self.mul_weight = Parameter(Tensor(np.ones([128, 96]), dtype=ms.float32), name="weight")

        def construct(self, data):
            x = self.reshape(self.mul_weight, (1, 128, 96))
            out = self.mul(x, self.mul_weight)
            return out

    size = 8
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    x = Tensor(np.ones([128, 96]), dtype=ms.float32)

    net = GradWrap(NetWithLoss(Net()))
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
    net.set_auto_parallel()
    _executor.compile(net, x)

def test_reshape_unexpand_3():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.reshape = P.Reshape()
            self.relu1 = P.ReLU().shard(((4, 1),))
            self.relu2 = P.ReLU().shard(((1, 4),))

        def construct(self, data):
            x = self.relu1(data)
            x = self.reshape(x, (3, 4))
            x = self.relu2(x)
            return x

    size = 4
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    x = Tensor(np.ones([4, 3]), dtype=ms.float32)

    net = GradWrap(NetWithLoss(Net()))
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
    net.set_auto_parallel()
    _executor.compile(net, x)

def test_reshape_unexpand_4():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.reshape = P.Reshape()
            self.relu1 = P.ReLU().shard(((4, 1),))
            self.relu2 = P.ReLU().shard(((1, 2, 2),))

        def construct(self, data):
            x = self.relu1(data)
            x = self.reshape(x, (3, 2, 2))
            x = self.relu2(x)
            return x

    size = 4
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    x = Tensor(np.ones([4, 3]), dtype=ms.float32)

    net = GradWrap(NetWithLoss(Net()))
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
    net.set_auto_parallel()
    _executor.compile(net, x)

def test_reshape_unexpand_5():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.reshape = P.Reshape()
            self.relu1 = P.ReLU().shard(((2, 2, 1),))
            self.relu2 = P.ReLU().shard(((1, 4),))

        def construct(self, data):
            x = self.relu1(data)
            x = self.reshape(x, (3, 4))
            x = self.relu2(x)
            return x

    size = 4
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    x = Tensor(np.ones([2, 2, 3]), dtype=ms.float32)

    net = GradWrap(NetWithLoss(Net()))
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
    net.set_auto_parallel()
    _executor.compile(net, x)

def test_reshape_unexpand_6():
    class Net(nn.Cell):
        def __init__(self):
            super().__init__()
            self.reshape = P.Reshape()
            self.relu1 = P.ReLU().shard(((2, 1),))
            self.relu2 = P.ReLU().shard(((1, 1, 4),))

        def construct(self, data):
            x = self.relu1(data)
            x = self.reshape(x, (1, 3, 4))
            x = self.relu2(x)
            return x

    size = 4
    context.set_auto_parallel_context(device_num=size, global_rank=0)
    x = Tensor(np.ones([4, 3]), dtype=ms.float32)

    net = GradWrap(NetWithLoss(Net()))
    context.set_auto_parallel_context(parallel_mode="semi_auto_parallel")
    net.set_auto_parallel()
    _executor.compile(net, x)